In [None]:
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the labelled tweets and keep only rows that have both text and a label.
data = pd.read_csv("../input/sinhala-hate-speech/final.csv")
print("Before", data.shape)
# dropna(subset=...) removes rows missing either column in one pass; the
# explicit copy() makes `data` an independent frame, so the column assignments
# in later cells do not raise SettingWithCopyWarning.
data = data.dropna(subset=['full_text_without_emoji', 'label']).copy()
print("After:", data.shape)
data.head()

In [None]:
# Column dtypes as inferred by read_csv.
data.dtypes

In [None]:
# List the available column names.
data.columns

In [None]:
# Number of rows whose user_id already appeared in an earlier row.
data.user_id.duplicated().sum()

In [None]:
# Turn the numeric label into a categorical with readable names.
# rename_categories relabels the categories directly (categories missing from
# the mapping keep their name). This avoids calling Series.replace on a
# categorical column, which recent pandas deprecates, and is safe to re-run.
data["label"] = pd.Categorical(data.label).rename_categories(
    {0.0: "Not Offensive", 1.0: "Offensive"}
)

In [None]:
# Quick pie chart of the class balance (percentages with two decimals).
label_sizes = data.groupby('label').size()
label_sizes.plot(kind='pie', autopct='%.2f')


In [None]:
# Styled pie chart of the class balance.
plt.figure(figsize=(15, 7.5))

pie_options = dict(
    kind='pie',
    startangle=90,
    autopct='%1.1f%%',
    colors=['C0', 'C1'],
    labels=['Not Offensive', 'Offensive'],
    textprops={'fontweight': 'bold', 'fontsize': 12},
)
data.groupby('label').size().plot(**pie_options);
plt.legend(loc=3, fontsize=10)
plt.ylabel('')
plt.title('Distribution Of Offensive and Non-Offensive Tweets', fontweight="bold", fontsize=20)
plt.axis('equal')

In [None]:
# Re-check dtypes now that `label` has been converted to a categorical.
data.dtypes

In [None]:
# Peek at the first two rows.
data.head(2)

In [None]:
# Stacked bar chart of label counts for each value of `retweeted`.
retweeted_counts = pd.crosstab(data['retweeted'], data['label'])
retweeted_counts.plot(kind='bar', stacked=True, color=["maroon", "lightcoral"])

In [None]:
# Distinct values present in the `retweeted` column.
data.retweeted.unique()

In [None]:
# Share of each label within every is_quote_status value (each row is divided by its own total).
quote_counts = pd.crosstab(data['is_quote_status'], data['label'])
quote_shares = quote_counts.apply(lambda row: row / row.sum(), axis=1)
quote_shares.plot(kind='bar', stacked=True, color=["maroon", "lightcoral"])

In [None]:
import seaborn as sns
# matplotlib.pyplot is already imported in the first cell; re-importing it here is harmless.
import matplotlib.pyplot as plt
# Use seaborn's "ticks" theme for the plots that follow.
sns.set_theme(style="ticks", color_codes=True)

In [None]:
# Box plot of reply_count per label, limited to tweets with fewer than 100
# replies (presumably to keep extreme values from dominating the plot).
below_100_replies = data['reply_count'] < 100
rslt_df = data.loc[below_100_replies]
sns.catplot(data=rslt_df, x="label", y="reply_count", kind="box")


In [None]:
# Missing values are filled with the *string* "False".
# NOTE(review): if this column holds booleans, the fill value probably should
# be the boolean False so the crosstab in the next cell does not get a separate
# "False" category — confirm the column dtype.
data['possibly_sensitive_editable']=data['possibly_sensitive_editable'].fillna("False")

In [None]:
# possibly_sensitive_editable
# Share of each label within every value of the column (rows divided by their totals).
sensitive_counts = pd.crosstab(data['possibly_sensitive_editable'], data['label'])
sensitive_shares = sensitive_counts.apply(lambda row: row / row.sum(), axis=1)
sensitive_shares.plot(kind='bar', stacked=True, color=["maroon", "lightcoral"])

In [None]:
# retweet_count
# Coerce the column to a numeric dtype, then plot only tweets with fewer than 40 retweets.
data["retweet_count"] = pd.to_numeric(data["retweet_count"])
rslt_df2 = data.loc[data["retweet_count"] < 40]

sns.catplot(data=rslt_df2, x="label", y="retweet_count", kind="box")


In [None]:
# favorited
# Share of each label within every value of `favorited` (rows divided by their totals).
favorited_counts = pd.crosstab(data['favorited'], data['label'])
favorited_shares = favorited_counts.apply(lambda row: row / row.sum(), axis=1)
favorited_shares.plot(kind='bar', stacked=True, color=["maroon", "lightcoral"])

In [None]:
# favorite_count
# Box plot per label, limited to tweets with fewer than 500 favorites.
rslt_df3 = data.loc[data['favorite_count'] < 500]

sns.catplot(data=rslt_df3, x="label", y="favorite_count", kind="box")


In [None]:
# emoji_count
# Box plot of emoji_count per label over all rows.
sns.catplot(data=data, x="label", y="emoji_count", kind="box")


In [None]:
# Column names again, before moving on to the text features.
data.columns

In [None]:
# Display the emoji_meanings column.
data.emoji_meanings

In [None]:
# Peek at the first three rows.
data.head(3)

# Tokenizing


In [None]:
# Characters that the cleaning step turns into spaces.
symbols = ["~", "`", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")", "_", "-", "+", "=", ";", ":", '\"', "\'", "<", ",", ">", ".", "?", "/", "{", "[", "}", "]", "|", "\\"]
def removeNonAlphaNumeric(string):
    """Return ``string`` with every character listed in ``symbols`` replaced by a space.

    Only the characters in ``symbols`` are replaced; any other character,
    alphanumeric or not, is left untouched.
    """
    # Every entry of `symbols` is a single character and the replacement (a
    # space) is not itself a symbol, so one translate() pass gives the same
    # result as replacing the symbols one after another.
    to_space = str.maketrans({symbol: ' ' for symbol in symbols})
    return string.translate(to_space)

In [None]:
# Symbol-free copy of the tweet text, stored as a new column.
data["cleaned_phrase"] = data["full_text_without_emoji"].map(removeNonAlphaNumeric)

In [None]:
# function to tokenize a string on whitespace since symbols are removed
def tokenize(string):
    """Split ``string`` into a list of non-empty tokens.

    Splitting happens on any run of whitespace (spaces, tabs, newlines), so
    multi-line text does not produce tokens with embedded line breaks.
    Returns an empty list for an empty or whitespace-only string.
    """
    # str.split() without a separator collapses consecutive whitespace and
    # never yields empty strings, so no filtering step is needed.
    return string.split()

In [None]:
# Per-tweet token lists; populated in the next cell.
tokens=[]

In [None]:
# generate tokens
# Build the list in a single assignment so that re-running this cell replaces
# `tokens` instead of appending a second copy of every row's tokens.
# Iterating the column visits the rows in DataFrame order.
tokens = [tokenize(phrase) for phrase in data['cleaned_phrase']]

In [None]:
# loading the stopwords from file
# The context manager closes the file handle as soon as the lines are read.
with open('../input/stopwords/stopWords.txt', encoding='utf-16') as myfile:
    words = myfile.readlines()
stopwords = []
for word in words:
    # The stopword is the first tab-separated field. strip() removes the
    # trailing newline that would otherwise stay attached when a line has no
    # tab, which would stop that stopword from ever matching a token.
    stopword = word.split("\t")[0].strip()
    if stopword:  # skip blank lines
        stopwords.append(stopword)

In [None]:
# remove stopwords in sinhala dataset
# A set gives constant-time membership tests; scanning the `stopwords` list for
# every token produces the same result but is needlessly slow.
stopword_lookup = set(stopwords)
no_stop = [
    [t for t in tokenSet if t not in stopword_lookup]
    for tokenSet in tokens
]

In [None]:
# function to count the total tokens
def getTotalTokens(token_set):
    """Return the combined length of all token lists in ``token_set``."""
    return sum(len(tSet) for tSet in token_set)

In [None]:
# function to count the total unique tokens
def getTotalUniqueTokens(token_set):
    """Return the number of distinct tokens across all token lists in ``token_set``.

    A token that occurs in several lists is counted once. Summing the
    per-list unique counts instead would count such a token once per list.
    """
    unique_tokens = set()
    for tSet in token_set:
        unique_tokens.update(tSet)
    return len(unique_tokens)

In [None]:
# Compare the total token count before and after stopword removal.
total_before = getTotalTokens(tokens)
total_after = getTotalTokens(no_stop)
print("Total tokens before removing stopwords:", total_before)
print("Total tokens after removing stopwords:", total_after)

In [None]:
def constructSent(lst):
    """Join the tokens in ``lst`` into one string separated by single spaces."""
    separator = " "
    return separator.join(lst)
# One rebuilt sentence per stopword-filtered token list, in the same order.
no_stop_sent = [constructSent(token_list) for token_list in no_stop]

In [None]:
# Overwrite cleaned_phrase with the stopword-free sentences.
data["cleaned_phrase"] = no_stop_sent

In [None]:
# count the number of tokens in each sentence
# Computed without rebinding the notebook-level `tokens` list, which still holds
# the per-tweet token lists built in the earlier cell. Despite its name,
# `countDict` is a list with one count per row, in row order.
countDict = [len(tokenize(phrase)) for phrase in data['cleaned_phrase']]

In [None]:
# Store the per-row token counts as a new column.
data['countDict']= countDict

In [None]:
# token count per sentence (countDict), box plot per label
sns.catplot(data=data, x="label", y="countDict", kind="box")

In [None]:

# Overlaid histograms of sentence length (token count) for the two classes.
is_offensive = data.label == 'Offensive'
is_not_offensive = data.label == 'Not Offensive'
x1 = data.loc[is_offensive, 'countDict']
x2 = data.loc[is_not_offensive, 'countDict']

# Shared histogram settings: semi-transparent bars so both classes stay visible.
kwargs = {'alpha': 0.5, 'bins': 50}

plt.figure(figsize=(7, 5), dpi=80)

plt.hist(x1, color="deeppink", label='Offensive', **kwargs)
plt.hist(x2, color="dodgerblue", label='Not Offensive', **kwargs)

axes = plt.gca()
axes.set(title='Frequency Histogram of Sentence Length', ylabel='Frequency')
plt.xlim(0, 50)

plt.legend();

# Model Fitting

In [None]:
# Columns available at the start of modelling (now including cleaned_phrase and countDict).
data.columns

In [None]:
# bag of words
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer's default token_pattern is built on the regex class \w, which
# does not match Unicode combining marks. Sinhala vowel signs are combining
# marks, so the default pattern breaks words apart at every vowel sign.
# Tokenize with the notebook's own symbol-stripping + whitespace tokenizer instead.
vectorizer = CountVectorizer(
    tokenizer=lambda text: tokenize(removeNonAlphaNumeric(text)),
    token_pattern=None,  # not used when a tokenizer is supplied
)
bag_of_words_X = vectorizer.fit_transform(list(data["full_text_without_emoji"]))

In [None]:
# Target values: the label of every row, in the same row order as bag_of_words_X.
bag_of_words_y = list(data.label)

In [None]:
# encoding labels
from sklearn import preprocessing
# Map each distinct label to an integer code; `le` is reused later in the notebook.
le = preprocessing.LabelEncoder()
le.fit(bag_of_words_y)
bag_of_words_y = le.transform(bag_of_words_y)

In [None]:
# train test split
from sklearn.model_selection import train_test_split

# sinhala dataset
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    bag_of_words_X,
    bag_of_words_y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Logistic Regression Model for sinhala
from sklearn.linear_model import LogisticRegression
# fit() returns the estimator itself, so construction and fitting can be chained.
logreg = LogisticRegression().fit(X_train, y_train)
logreg

In [None]:
# Predicted class codes for the held-out test split.
prediction = logreg.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test split for class codes 0 and 1.
report = classification_report(y_test, prediction, labels=[0, 1])
print(report)

In [None]:
# Mean accuracy of the fitted model on the test split.
test_accuracy = logreg.score(X_test, y_test)
print("Accuracy for  dataset:", test_accuracy)

In [None]:
# precision, recall and fscore
from sklearn.metrics import precision_recall_fscore_support
# Macro averaging: the metric is computed per class and then averaged unweighted.
macro_scores = precision_recall_fscore_support(y_test, prediction, average='macro')
p, r, f, s = macro_scores
for metric_label, metric_value in (
    ("Precision for  dataset:", p),
    ("Recall for  dataset:", r),
    ("F score for  dataset:", f),
):
    print(metric_label, metric_value)

In [None]:
# confusion matrix
# plot_confusion_matrix was removed from scikit-learn in version 1.2;
# ConfusionMatrixDisplay.from_estimator (available since 1.0) replaces it and
# takes the same (estimator, X, y) arguments.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(logreg, X_test, y_test)

## Apply TF-IDF vectorizer

In [None]:
# apply tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# The default token_pattern relies on the regex class \w, which does not match
# Unicode combining marks. Sinhala vowel signs are combining marks, so the
# default pattern breaks words apart at every vowel sign. Tokenize with the
# notebook's own symbol-stripping + whitespace tokenizer instead.
vectorizer = TfidfVectorizer(
    tokenizer=lambda text: tokenize(removeNonAlphaNumeric(text)),
    token_pattern=None,  # not used when a tokenizer is supplied
)

In [None]:
# Raw text and labels for the TF-IDF experiments (both names are rebound in the cells below).
X = data.full_text_without_emoji
y = data.label

In [None]:
# Encode the labels as integers, re-fitting the LabelEncoder created in the bag-of-words section.
y = le.fit_transform(list(y))

In [None]:
# Plain Python list of the tweet texts (replaces the Series assigned above).
X = list(data.full_text_without_emoji)

In [None]:
# Fit TF-IDF on all texts and replace X with the resulting matrix.
# NOTE: because X is rebound here, re-running this cell on its own would feed
# the matrix back into the vectorizer; re-run the previous cell first.
X = vectorizer.fit_transform(X)

In [None]:
# Unfitted logistic regression used for the cross-validation runs in this section.
cross_val_model = LogisticRegression()

In [None]:
# cross validation accuracy
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation using the estimator's default score.
cross_val_acc = cross_val_score(estimator=cross_val_model, X=X, y=y, cv=10)
cross_val_acc

In [None]:
# Mean of the per-fold accuracies.
mean_accuracy = sum(cross_val_acc) / len(cross_val_acc)
print("Average accuracy in cross validation:", mean_accuracy)

In [None]:
# Per-fold precision from 10-fold cross-validation.
cross_val_precision = cross_val_score(
    estimator=cross_val_model, X=X, y=y, scoring="precision", cv=10
)
cross_val_precision

In [None]:
# Mean of the per-fold precision values.
mean_precision = sum(cross_val_precision) / len(cross_val_precision)
print("Average precision in cross validation:", mean_precision)

In [None]:
# Per-fold recall from 10-fold cross-validation.
cross_val_recall = cross_val_score(
    estimator=cross_val_model, X=X, y=y, scoring="recall", cv=10
)
cross_val_recall

In [None]:
# Mean of the per-fold recall values.
mean_recall = sum(cross_val_recall) / len(cross_val_recall)
print("Average recall in cross validation:", mean_recall)

In [None]:
# Per-fold macro-averaged F1 from 10-fold cross-validation.
cross_val_f1 = cross_val_score(
    estimator=cross_val_model, X=X, y=y, scoring="f1_macro", cv=10
)
cross_val_f1

In [None]:
# Mean of the per-fold macro F1 values.
mean_f1 = sum(cross_val_f1) / len(cross_val_f1)
print("Average f-score in cross validation:", mean_f1)

In [None]:
# Per-fold ROC AUC from 10-fold cross-validation, then the mean over folds.
cross_val_auc = cross_val_score(
    estimator=cross_val_model, X=X, y=y, scoring="roc_auc", cv=10
)
mean_auc = sum(cross_val_auc) / len(cross_val_auc)
print("Average AUC in cross validation:", mean_auc)

# SVM MODEL

In [None]:
# Import the class directly so the name `svm` is bound only to the estimator
# instance and never rebinds the sklearn.svm module.
from sklearn.svm import SVC
svm = SVC()

In [None]:
# 10-fold cross-validated accuracy (default score) for the SVM, and its mean.
cross_val_acc = cross_val_score(estimator=svm, X=X, y=y, cv=10)
mean_accuracy = sum(cross_val_acc) / len(cross_val_acc)
print("Average accuracy in cross validation:", mean_accuracy)

In [None]:
# 10-fold cross-validated recall for the SVM, and its mean.
cross_val_recall = cross_val_score(estimator=svm, X=X, y=y, scoring="recall", cv=10)
mean_recall = sum(cross_val_recall) / len(cross_val_recall)
print("Average recall in cross validation:", mean_recall)

In [None]:
# 10-fold cross-validated precision for the SVM, and its mean.
cross_val_precision = cross_val_score(estimator=svm, X=X, y=y, scoring="precision", cv=10)
mean_precision = sum(cross_val_precision) / len(cross_val_precision)
print("Average precision in cross validation:", mean_precision)

In [None]:
# 10-fold cross-validated macro F1 for the SVM, and its mean.
cross_val_f1 = cross_val_score(estimator=svm, X=X, y=y, scoring="f1_macro", cv=10)
mean_f1 = sum(cross_val_f1) / len(cross_val_f1)
print("Average f-score in cross validation:", mean_f1)

In [None]:
# 10-fold cross-validated ROC AUC for the SVM, and its mean.
cross_val_auc = cross_val_score(estimator=svm, X=X, y=y, scoring="roc_auc", cv=10)
mean_auc = sum(cross_val_auc) / len(cross_val_auc)
print("Average AUC in cross validation:", mean_auc)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Unfitted random forest with a fixed random_state; it is fitted per fold by cross_val_score below.
randomForest = RandomForestClassifier(random_state=0)

In [None]:
# 10-fold cross-validated accuracy (default score) for the random forest, and its mean.
cross_val_acc = cross_val_score(estimator=randomForest, X=X, y=y, cv=10)
mean_accuracy = sum(cross_val_acc) / len(cross_val_acc)
print("Average accuracy in cross validation:", mean_accuracy)

In [None]:
# 10-fold cross-validated recall for the random forest, and its mean.
cross_val_recall = cross_val_score(estimator=randomForest, X=X, y=y, scoring="recall", cv=10)
mean_recall = sum(cross_val_recall) / len(cross_val_recall)
print("Average recall in cross validation:", mean_recall)

In [None]:
# 10-fold cross-validated precision for the random forest, and its mean.
cross_val_precision = cross_val_score(estimator=randomForest, X=X, y=y, scoring="precision", cv=10)
mean_precision = sum(cross_val_precision) / len(cross_val_precision)
print("Average precision in cross validation:", mean_precision)

In [None]:
# 10-fold cross-validated macro F1 for the random forest, and its mean.
cross_val_f1 = cross_val_score(estimator=randomForest, X=X, y=y, scoring="f1_macro", cv=10)
mean_f1 = sum(cross_val_f1) / len(cross_val_f1)
print("Average f-score in cross validation:", mean_f1)

In [None]:
# 10-fold cross-validated ROC AUC for the random forest, and its mean.
cross_val_auc = cross_val_score(estimator=randomForest, X=X, y=y, scoring="roc_auc", cv=10)
mean_auc = sum(cross_val_auc) / len(cross_val_auc)
print("Average AUC in cross validation:", mean_auc)

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Left unfitted, like the other models in this notebook: cross_val_score clones
# the estimator and fits the clone on each fold of the TF-IDF matrix, so a prior
# fit on the bag-of-words training split (different features) would never be used.
gradientBoost = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0)

In [None]:
# 10-fold cross-validated accuracy (default score) for gradient boosting, and its mean.
cross_val_acc = cross_val_score(estimator=gradientBoost, X=X, y=y, cv=10)
mean_accuracy = sum(cross_val_acc) / len(cross_val_acc)
print("Average accuracy in cross validation:", mean_accuracy)

In [None]:
# 10-fold cross-validated recall for gradient boosting, and its mean.
cross_val_recall = cross_val_score(estimator=gradientBoost, X=X, y=y, scoring="recall", cv=10)
mean_recall = sum(cross_val_recall) / len(cross_val_recall)
print("Average recall in cross validation:", mean_recall)

In [None]:
# 10-fold cross-validated precision for gradient boosting, and its mean.
cross_val_precision = cross_val_score(estimator=gradientBoost, X=X, y=y, scoring="precision", cv=10)
mean_precision = sum(cross_val_precision) / len(cross_val_precision)
print("Average precision in cross validation:", mean_precision)

In [None]:
# 10-fold cross-validated macro F1 for gradient boosting, and its mean.
cross_val_f1 = cross_val_score(estimator=gradientBoost, X=X, y=y, scoring="f1_macro", cv=10)
mean_f1 = sum(cross_val_f1) / len(cross_val_f1)
print("Average f-score in cross validation:", mean_f1)

In [None]:
# 10-fold cross-validated ROC AUC for gradient boosting, and its mean.
cross_val_auc = cross_val_score(estimator=gradientBoost, X=X, y=y, scoring="roc_auc", cv=10)
mean_auc = sum(cross_val_auc) / len(cross_val_auc)
print("Average AUC in cross validation:", mean_auc)