In [None]:
!pip install transformers
import pandas as pd
# Disable pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None
from pandas import DataFrame
#import pytorch_multibert_embeddings
# Load the annotated extreme-speech export.
exsp = pd.read_csv('../input/extreme-speech/data-export-2021-11-03.csv')

# Label column of the full dataset; get_label_distribution() reads this global.
labels = exsp.Label

# Raw comma-separated target annotations. Indexing with [] (instead of .get())
# raises a KeyError right here when a column is missing, rather than binding
# None and failing later inside the helpers that iterate over these series.
target_p = exsp["Target (protected)"]
target_o = exsp["Target (others)"]

# Returns list of all protected targets
def get_target_protected(column=None):
    """Return the distinct protected targets in order of first appearance.

    Each entry of ``column`` is converted to a string and split on commas;
    the pieces are stripped of surrounding whitespace. The literal token
    ``"nan"`` (what ``str()`` produces for a missing cell) is skipped.

    Parameters
    ----------
    column : iterable, optional
        Entries to scan. Defaults to the module-level ``target_p`` series.

    Returns
    -------
    list of str
        Unique target names, first occurrence first.
    """
    if column is None:
        column = target_p

    # A dict keeps insertion order and gives O(1) membership checks,
    # unlike repeatedly scanning a growing list.
    seen = {}
    for line in column:
        for target in str(line).split(","):
            target = target.strip()
            if target != "nan":
                seen.setdefault(target, None)
    return list(seen)

# Returns list of target (others)
def get_target_others(column=None):
    """Return the distinct "other" targets in order of first appearance.

    Each entry of ``column`` is converted to a string and split on commas;
    the pieces are stripped of surrounding whitespace. The literal token
    ``"nan"`` (what ``str()`` produces for a missing cell) is skipped.

    Parameters
    ----------
    column : iterable, optional
        Entries to scan. Defaults to the module-level ``target_o`` series.

    Returns
    -------
    list of str
        Unique target names, first occurrence first.
    """
    if column is None:
        column = target_o

    targets_others = []
    for line in column:
        for target in str(line).split(","):
            target = target.strip()
            if target not in targets_others and target != "nan":
                # Append to the list that is returned (the name used here
                # previously did not exist and raised NameError).
                targets_others.append(target)
    return targets_others

# Distinct values of a series, missing entries excluded
def unique_non_null(s):
    """Return the unique non-missing values of ``s`` in order of appearance."""
    present = s[s.notna()]
    return present.unique()

# Frequency of each label
def get_label_distribution():
    """Count how often each value occurs in the module-level ``labels`` series.

    ``labels`` is looked up at call time, so the result reflects whatever
    that global is bound to when the function runs.
    """
    distribution = labels.value_counts()
    return distribution

# The target list fixes the column order of the table, so compute it once and
# reuse it for both the counts and the column names.
protected_targets = get_target_protected()

# One boolean mask per target: does the row's annotation mention it?
# regex=False matches the target name literally (names containing characters
# such as "(" or "+" would otherwise be read as regular expressions);
# na=False counts rows without an annotation as "not mentioned".
target_mentions = {
    target: exsp["Target (protected)"].str.contains(target, regex=False, na=False)
    for target in protected_targets
}

target_dict = {}

# Generates a dictionary that contains countries as keys and the amount of each protected target as values
for country in unique_non_null(exsp.Country):
    in_country = exsp["Country"] == country
    target_dict[country] = [
        int((target_mentions[target] & in_country).sum())
        for target in protected_targets
    ]

# Generates a new dataframe which displays the distribution of each protected target across the countries
target_df = DataFrame.from_dict(
    target_dict,
    orient = 'index',
    columns = protected_targets
)

# Rows become targets, columns become countries.
target_df = target_df.transpose()



In [None]:
from sklearn.linear_model import LogisticRegression
import nltk

# Shared random_state passed to both train_test_split calls in this notebook.
seed = 123
from time import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import re, random, string
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import text
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
#from spellchecker import SpellChecker

# Work on an independent copy of the Kenyan rows: a 'Text_Cleaned' column is
# added to this frame later on, and writing to a bare slice of `exsp` is the
# situation pandas' chained-assignment warning exists for.
exsp_kenya = exsp[exsp['Country'] == "Kenya"].copy()

exsp_kenya['Label'].value_counts()

# Preprocessing
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    kept = filter(lambda ch: ch not in string.punctuation, text)
    return "".join(kept)

def remove_whitespace(text):
    """Collapse every whitespace run to one space and trim both ends."""
    words = text.split()
    return " ".join(words)

def remove_newlines(text):
    """Replace each run of carriage returns, newlines and tabs with one space.

    A space (rather than nothing) is substituted so that the words on either
    side of a line break stay separate tokens instead of being fused together.
    """
    return re.sub(r'[\r\n\t]+', ' ', text)

def clean_text(text):
    """Normalise ``text`` and split it into tokens.

    The text is lower-cased, stripped of digits and punctuation, has its
    whitespace collapsed, and is then tokenised with nltk's
    ``WordPunctTokenizer``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        The tokens of the cleaned text.
    """
    text = text.lower()
    # Each helper returns a new string, so the result must be re-bound;
    # calling it without assignment leaves `text` untouched.
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = remove_whitespace(text)
    return nltk.WordPunctTokenizer().tokenize(text)


# Build the cleaned text column in four passes:
# punctuation -> lower case -> digits -> line breaks.
text_cleaned = exsp_kenya.Text.apply(remove_punctuation)
text_cleaned = text_cleaned.apply(str.lower)
text_cleaned = text_cleaned.apply(remove_numbers)
text_cleaned = text_cleaned.apply(remove_newlines)
exsp_kenya['Text_Cleaned'] = text_cleaned

exsp_kenya.head()

# Raw text of one specific sample, shown as the cell output.
exsp_kenya.loc[exsp_kenya.ID == 6909, "Text"]

In [None]:
import logging

# Unigram bag-of-words counts over the cleaned Kenyan texts, with
# scikit-learn's built-in English stop-word list removed.
vectorizer = CountVectorizer(ngram_range=(1,1), stop_words = 'english')
x = vectorizer.fit_transform(exsp_kenya.Text_Cleaned)
#tfidftransformer = TfidfTransformer()
#x_tfidf = tfidftransformer.fit_transform(x)

# Vocabulary terms as a plain list, in the column order of `x`.
features = vectorizer.get_feature_names_out().tolist()

# .toarray() is the documented way to densify a SciPy sparse container;
# NOTE(review): the `.A` shorthand is, as far as I know, not available on the
# newer sparse-array types, so the method call is the portable spelling.
print(type(x.toarray()))

In [None]:
# Sample IDs, in the same row order as the count matrix `x`.
ids = exsp_kenya.ID.to_numpy()

# Dense count table, one column per vocabulary term, with the sample ID in
# front. (`x` is the matrix produced by the vectorizer; the name `x_` that was
# referenced here is not defined anywhere.)
feature_df = pd.DataFrame(x.toarray(), columns = features)
feature_df.insert(0, "ID", ids)

# NOTE: this rebinds the module-level `labels` to the Kenyan subset, which is
# what get_label_distribution() will see from here on.
labels = exsp_kenya.Label

# Three-class target: label names encoded as integers 0/1/2.
label_df = pd.DataFrame(list(zip(ids, labels)), columns = ["ID", "Labels"])
label_df = label_df.replace(["dangerous speech", "derogatory extreme speech", "exclusionary extreme speech"], [0,1,2])

# Two-class moderation target: each label name mapped to 'R' or 'M'.
mod_df = pd.DataFrame(list(zip(ids, labels)), columns = ["ID", "Moderation"])
mod_df = mod_df.replace(["dangerous speech", "derogatory extreme speech", "exclusionary extreme speech"], ['R','M','R'])


# Attach each target to the features by sample ID.
feature_labels_df = pd.merge(feature_df, label_df, left_on="ID", right_on="ID", how="left")
#print(feature_labels_df.head())

feature_mod_df = pd.merge(feature_df, mod_df, left_on="ID", right_on="ID", how="left")
print(feature_mod_df.head())

In [None]:
# feature_labels_df is laid out as [ID, <vocabulary columns...>, Labels]:
# drop only the leading ID and the trailing target column. (A stop of -3
# would also discard the last two vocabulary columns.)
X_l = feature_labels_df.iloc[:, 1:-1]
y_l = feature_labels_df.iloc[:, -1]
print(y_l)

# 80/20 split, reproducible through the shared seed.
X_l_train, X_l_test, y_l_train, y_l_test = train_test_split(X_l, y_l, test_size=0.2, random_state = seed)

In [None]:
# feature_mod_df is laid out as [ID, <vocabulary columns...>, Moderation]:
# drop only the leading ID and the trailing target column. (A stop of -3
# would also discard the last two vocabulary columns.)
X_m = feature_mod_df.iloc[:, 1:-1]
y_m = feature_mod_df.iloc[:, -1]
print(y_m)

# 80/20 split, reproducible through the shared seed.
X_m_train, X_m_test, y_m_train, y_m_test = train_test_split(X_m, y_m, test_size=0.2, random_state=seed)

In [None]:
def get_id_from_index(index):
    """Return the ``ID`` value of the row at position ``index``.

    Reads the module-level ``feature_labels_df``; ``index`` is positional
    (``iloc``), not an index label.
    """
    row = feature_labels_df.iloc[index]
    return row["ID"]
    
    
# Label model: one-vs-rest logistic regression on the liblinear solver.
# fit() returns the estimator itself, so construction and training chain.
loglabel = LogisticRegression(multi_class='ovr', solver='liblinear').fit(X_l_train, y_l_train)

# Moderation model: logistic regression with default settings.
logmod = LogisticRegression()
logmod.fit(X_m_train, y_m_train)


In [None]:
#print('Predicted value is=', lm.predict(X_test))
#print('Actual value from test data is ', y_test)
#type(y_test)

# Index labels of the held-out rows. feature_labels_df comes out of pd.merge
# with a default 0..n-1 index, so these labels are also row positions.
test_index = list(X_l_test.index.values)
test_ids = feature_labels_df["ID"].iloc[test_index].to_numpy()

# ID-indexed lookups: one vectorised .loc per field instead of scanning the
# whole frame once per test sample.
# NOTE(review): assumes IDs are unique within exsp_kenya — confirm.
kenya_by_id = exsp_kenya.set_index("ID")[["Text", "Label"]]
test_text = kenya_by_id.loc[test_ids, "Text"].to_numpy()
test_label = kenya_by_id.loc[test_ids, "Label"].to_numpy()

moderation_by_id = pd.Series(
    feature_mod_df["Moderation"].to_numpy(),
    index=feature_mod_df["ID"].to_numpy(),
)
test_mod = moderation_by_id.loc[test_ids].to_numpy()

# Translate the predicted class codes back to label names. The result is an
# object array on purpose: casting the integer predictions with astype('str')
# gives a fixed-width string array sized for the digit strings, and assigning
# the longer label names into it silently cuts them short.
code_to_label = {
    "0": "dangerous speech",
    "1": "derogatory extreme speech",
    "2": "exclusionary extreme speech",
}
label_test_predictions = np.array(
    [code_to_label.get(str(code), str(code)) for code in loglabel.predict(X_l_test)],
    dtype=object,
)

mod_test_predictions = logmod.predict(X_m_test)
mod_test_predictions = mod_test_predictions.astype('str')

In [None]:
# Stack the per-sample arrays: one row per field, in the order listed.
result_rows = [
    test_ids,
    test_text,
    test_label,
    label_test_predictions,
    test_mod,
    mod_test_predictions,
]
test_results = np.array(result_rows)



In [None]:

# Flip the stacked arrays so each test sample becomes one row, name the
# columns, and write the table next to the notebook.
result_columns = ["ID", "Text", "Actual label", "Predicted Label", "Moderation", "Predicted Moderation"]
results = pd.DataFrame(data=test_results).transpose()
results.columns = result_columns

results.to_csv('results.csv', index=False)
    

In [None]:
# Naive Bayes baseline trained on the same count features as the label model.
nbclasslabel = MultinomialNB(alpha=0.1).fit(X_l_train, y_l_train)

# X_l_test already holds token counts in the columns the model was fitted on,
# so it goes to predict() as is. Passing the DataFrame through another
# CountVectorizer would tokenise its column names, not any text.
predict = nbclasslabel.predict(X_l_test)
for el in predict:
    print(el)

In [None]:
import keras
from keras import Sequential
from keras.layers import Dense, Activation, Dropout


In [None]:
batch_size, epochs = 256, 3

# X_l_train / X_l_test are already per-term count tables, so the binary
# bag-of-words input is simply "count > 0" — no Keras tokenizer is involved.
x_train = (X_l_train.to_numpy() > 0).astype("float32")
x_test = (X_l_test.to_numpy() > 0).astype("float32")

# One-hot encode the three integer class codes.
y_train = keras.utils.to_categorical(y_l_train, 3)
y_test = keras.utils.to_categorical(y_l_test, 3)

# Single hidden layer MLP; the input width follows the feature matrix.
model = Sequential()
model.add(Dense(512, input_shape=(x_train.shape[1],)))
model.add(Activation('tanh'))
model.add(Dropout(0.5))

model.add(Dense(3))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# x_test / y_test are prepared above but not used by this fit call.
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)
