In [77]:
import jieba

In [78]:
import os
import pickle
import argparse
# For text preprocessing
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
from wordsegment import segment, load
import sys
from sklearn.model_selection import train_test_split, KFold
from datasets import load_dataset
import pandas as pd
from tqdm.auto import tqdm
import pickle
import os

sys.setrecursionlimit(10000)


In [79]:
load()

# English

In [80]:
def text_preprocess(text, tknzr):
    FLAGS = re.MULTILINE | re.DOTALL
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    # print(text)
    text = re_sub(r"#\S+", lambda hashtag: " ".join(segment(hashtag.group()[1:]))) # segment hastags
    # text = text.replace('#','')
    # print(text)
    # exit()

    tokens = tknzr.tokenize(text.lower())
    return " ".join(tokens)

def concat_data(id2entities):
    # 	with open(dir_path+name, "rb") as f:
    # 		id2entities = pickle.load(f)

    ########## Lookup Tables ##########
    labels = list(set([entity[0] for entity in id2entities.values()]))
    num_classes = len(labels)
    
    

    label_lookup = np.zeros((num_classes,num_classes),int)
    np.fill_diagonal(label_lookup, 1)
    ###################################

    text_data, context_data, label_data = [], [], []
    label_dict = {}
    for i, label in enumerate(labels):
        label_dict[label] = i

    # 	load()
    tknzr = TweetTokenizer(reduce_len=True, preserve_case=False, strip_handles=False)
    # 	print("Preprocessing tweets.....")
    for _id in tqdm(id2entities):
        if id2entities[_id][0] in label_dict.keys():
            text_data.append(text_preprocess(id2entities[_id][1], tknzr))
            context_data.append(text_preprocess(id2entities[_id][2], tknzr))

            label_data.append(label_lookup[ label_dict[id2entities[_id][0]] ])

    assert len(text_data) == len(context_data) == len(label_data)

    return text_data, context_data, label_data,label_dict


In [81]:
import os
import sys
import numpy as np
import argparse
import pickle
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline

from joblib import dump, load

import warnings
from sklearn.metrics import classification_report

import pandas as pd
import functools

sys.path.insert(1, os.path.join(sys.path[0], '..'))
# import helper
import time

import argparse
import re

def train(classifier,n_gram_tuple,feature_level,max_feature_length,_data,labels):
    if classifier == 'NB':
        text_clf = Pipeline([('vect', TfidfVectorizer(ngram_range=(n_gram_tuple[0],n_gram_tuple[1]), analyzer=feature_level, max_features=max_feature_length)),
                        ('clf', MultinomialNB())])

    elif classifier == 'LR':
        text_clf = Pipeline([('vect', TfidfVectorizer(ngram_range=(n_gram_tuple[0],n_gram_tuple[1]), analyzer=feature_level, max_features=max_feature_length)),
                        ('clf', LogisticRegression(multi_class="multinomial", solver="lbfgs"))])

    elif classifier == 'SVM':
        text_clf = Pipeline([('vect', TfidfVectorizer(ngram_range=(n_gram_tuple[0],n_gram_tuple[1]), analyzer=feature_level, max_features=max_feature_length)),
                        ('clf', SGDClassifier(loss='log', penalty='l2'))])

    elif classifier == 'RF':
        text_clf = Pipeline([('vect', TfidfVectorizer(ngram_range=(n_gram_tuple[0],n_gram_tuple[1]), analyzer=feature_level, max_features=max_feature_length)),
                        ('clf', RandomForestClassifier())]) 

    elif classifier == 'GBT':
        text_clf = Pipeline([('vect', TfidfVectorizer(ngram_range=(n_gram_tuple[0],n_gram_tuple[1]), analyzer=feature_level, max_features=max_feature_length)),
                        ('clf', GradientBoostingClassifier(learning_rate=1, max_depth=1))])
        
    
    train_label_data = [i.tolist().index(1) for i in _data['label_train']]
    text_clf.fit(_data["text_train"], train_label_data)
    
    preds = text_clf.predict(_data["text_test"])
    
    test_label_data = [i.tolist().index(1) for i in _data['label_test']]

    print(classifier)
#     print(labels)
    print(classification_report(test_label_data, preds,target_names=labels))
    print("======================")
    
    return text_clf,test_label_data,preds

def preprocessSplit(processed_dict,lang):
    _text, _ctxt, _label,label_dict = concat_data(processed_dict)
    text_train, text_test, label_train, label_test = train_test_split(_text, _label,
                                                    stratify=_label,
                                                    test_size=0.2)
    
    _data = {"text_train": text_train,
                     "label_train": label_train,
                     "text_test": text_test,
                     "label_test": label_test,
                     "label_dict":label_dict
            }
    
    with open(f"processed_data/{lang}_processed.pkl", "wb") as f:
        pickle.dump(_data, f)
        
    print(label_dict)
        
    return _data


In [82]:
all_class_dict ={'abusive':'offensive',
 'hate':'offensive',
 'hateful':'offensive',
 'neither':'not_offensive',
 'non-sexist':'not_offensive',
 'none':'not_offensive',
 'normal':'not_offensive',
 'not_hate':'not_offensive',
 'offensive':'offensive',
 'sexist':'offensive'}

# English

## Loading Data and Formatting for Preprocessing Pipeline

In [83]:
import random

In [84]:
def getLimited(pd,ratio=3):
    keys = random.sample(list(pd), 8000)
    nc = {}
    for c,k in enumerate(keys):
        nc[c] = pd[k]
    return nc

In [86]:
dataset = load_dataset('hate_speech_offensive', 'mrpc', split='train')

class_list = []
for i in dataset:
    class_list.append(i['class'])

pd.Series(class_list).value_counts()

processed_dict = {}

class_dict = {0:'hate',1:'offensive',2:'neither'}

labels = []

for c,i in tqdm(enumerate(dataset),total=len(dataset)):
    processed_dict[c] = [all_class_dict[class_dict[i['class']]],i['tweet'],'']
    
    labels.append(class_dict[i['class']])

# Preprocess + Train test Split + Save to file
spl = preprocessSplit(processed_dict,'english_1')

Using custom data configuration mrpc
Reusing dataset hate_speech_offensive (/Users/rehanahmed/.cache/huggingface/datasets/hate_speech_offensive/mrpc/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5)


  0%|          | 0/24783 [00:00<?, ?it/s]

  0%|          | 0/24783 [00:00<?, ?it/s]

{'offensive': 0, 'not_offensive': 1}


In [87]:
len(labels)

24783

In [88]:
pd.Series(labels).value_counts()

offensive    19190
neither       4163
hate          1430
dtype: int64

# Filipino

In [90]:
dataset = load_dataset('hate_speech_filipino', 'mrpc')

class_list = []
for j in ['train','test','validation']:
    for i in dataset[j]:
        class_list.append(i['label'])

pd.Series(class_list).value_counts()

processed_dict = {}

class_dict = {0:'not_hate',1:'hate'}

c=0
labels = []
for j in tqdm(['train','test','validation']):
    for i in (dataset[j]):
        processed_dict[c] = [all_class_dict[class_dict[i['label']]],i['text'],'']
        c = c+1
        labels.append(class_dict[i['label']])
        
    
sp = preprocessSplit(processed_dict,'filipino')

Using custom data configuration mrpc
Reusing dataset hate_speech_filipino (/Users/rehanahmed/.cache/huggingface/datasets/hate_speech_filipino/mrpc/1.0.0/89001ab1965f35d6d74585e59f982bbdd09c82a645bf702f32a52ad95404dd83)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/24232 [00:00<?, ?it/s]

{'offensive': 0, 'not_offensive': 1}


In [91]:
len(labels)

24232

In [92]:
pd.Series(labels).value_counts()

not_hate    12979
hate        11253
dtype: int64

# Chinese 

In [69]:
my_ls = ['<url>','<user>','<smile>','<lolface>','<sadface>','<neutralface>','<heart>','<number>','<repeat>','<elong>']
e_ls = list(map(lambda a:a.replace('<','').replace('>',''),my_ls))

In [70]:
def text_preprocess(text, tknzr):
    FLAGS = re.MULTILINE | re.DOTALL
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    # print(text)
    text = re_sub(r"#\S+", lambda hashtag: " ".join(segment(hashtag.group()[1:]))) # segment hastags
    # text = text.replace('#','')
    # print(text)
    # exit()

#     tokens = tknzr.tokenize(text.lower())
    tokens = jieba.lcut(text, cut_all=False)
    ret_text = " ".join(tokens)

    for i in e_ls:
        if i in ret_text:
            ret_text = ret_text.replace('< '+i+' >','<'+i+'>')


    return ret_text


### Place file SexComment.csv in Raw_datasets

In [104]:
!ls raw_datasets | grep SexComment.csv

SexComment.csv


In [93]:
df = pd.read_csv('raw_datasets/SexComment.csv')

df.head()

processed_dict = {}

labels = []

class_dict = {0:'non-sexist',1:'sexist'}

for index,row in tqdm(df.iterrows()):
    processed_dict[index] = [all_class_dict[class_dict[row['label']]],row['comment_text'],'']
    labels.append(class_dict[row['label']])

    
# processed_dict =   getLimited(processed_dict)

sp = preprocessSplit(processed_dict,'chinese')

0it [00:00, ?it/s]

  0%|          | 0/8969 [00:00<?, ?it/s]

{'offensive': 0, 'not_offensive': 1}


In [94]:
len(labels)

8969

In [95]:
pd.Series(labels).value_counts()

non-sexist    5876
sexist        3093
dtype: int64

# Korean

In [96]:
def text_preprocess(text, tknzr):
    FLAGS = re.MULTILINE | re.DOTALL
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    
    # print(text)
    text = re_sub(r"#\S+", lambda hashtag: " ".join(segment(hashtag.group()[1:]))) # segment hastags
    # text = text.replace('#','')
    # print(text)
    # exit()

    # tokens = tknzr.tokenize(text.lower())
    tokens = hannanum.morphs(text)
    ret_text = " ".join(tokens)

    for i in e_ls:
        if i in ret_text:
            ret_text = ret_text.replace('< '+i+' >','<'+i+'>')


    return ret_text


In [99]:

import koco

train_dev = koco.load_dataset('korean-hate-speech', mode='train_dev')
# test_dev = koco.load_dataset('korean-hate-speech', mode='test')

processed_dict = {}

labels = []

c = 0
for i in train_dev['train']:
    processed_dict[c] = [all_class_dict[i['hate']],i['comments'],'']
    c=c+1
    labels.append(i['hate'])
for i in train_dev['dev']:
    processed_dict[c] = [all_class_dict[i['hate']],i['comments'],'']
    c=c+1
    labels.append(i['hate'])

# sp = preprocessSplit(processed_dict,'korean')

In [100]:
len(labels)

8367

In [101]:
pd.Series(labels).value_counts()

none         3646
offensive    2688
hate         2033
dtype: int64

In [50]:
from joblib import dump, load


In [55]:
models = ['NB','LR','SVM','RF','GBT']
languages = ['english_1','filipino','chinese','korean']

In [56]:
# import os
# import pickle

# Total Labels

In [72]:
args = {'feature_level':'word',
        'clf':'LR',
        'language':None,
        'ngram_range':(1,3),
        'max_features':14000,
        
       }

for lang in languages:

    with open(f"processed_data/{lang}_processed.pkl", "rb") as f:
            _data = pickle.load(f)
    print(lang)
    print(f'Train Size:{len(_data["text_train"])}')
    print(f'Train Size:{len(_data["text_test"])}')
    print()

    

english_1
Train Size:6400
Train Size:1600

filipino
Train Size:6400
Train Size:1600

chinese
Train Size:6400
Train Size:1600

korean
Train Size:6400
Train Size:1600



# Word Level

In [75]:
args = {'feature_level':'word',
        'clf':'LR',
        'language':None,
        'ngram_range':(1,3),
        'max_features':14000,
        
       }

for lang in languages:
    args['language'] = lang

    print(args['language'])
    for m in tqdm(models):

        with open(f"processed_data/{args['language']}_processed.pkl", "rb") as f:
            _data = pickle.load(f)

        clf = train(m,args['ngram_range'],args['feature_level'],args['max_features'],_data,list(_data['label_dict'].keys()))

        model_name = args['language']+'_'+m+'_'+args["feature_level"]+'.joblib'
        dump(clf, f'output/{model_name}') 


english_1


  0%|          | 0/5 [00:00<?, ?it/s]

NB
               precision    recall  f1-score   support

    offensive       0.84      1.00      0.91      1328
not_offensive       1.00      0.08      0.14       272

     accuracy                           0.84      1600
    macro avg       0.92      0.54      0.53      1600
 weighted avg       0.87      0.84      0.78      1600

LR
               precision    recall  f1-score   support

    offensive       0.92      0.98      0.95      1328
not_offensive       0.89      0.58      0.70       272

     accuracy                           0.92      1600
    macro avg       0.90      0.78      0.83      1600
 weighted avg       0.91      0.92      0.91      1600

SVM
               precision    recall  f1-score   support

    offensive       0.91      0.99      0.95      1328
not_offensive       0.91      0.53      0.67       272

     accuracy                           0.91      1600
    macro avg       0.91      0.76      0.81      1600
 weighted avg       0.91      0.91      0.90   

  0%|          | 0/5 [00:00<?, ?it/s]

NB
               precision    recall  f1-score   support

    offensive       0.72      0.75      0.73       747
not_offensive       0.77      0.74      0.76       853

     accuracy                           0.75      1600
    macro avg       0.75      0.75      0.75      1600
 weighted avg       0.75      0.75      0.75      1600

LR
               precision    recall  f1-score   support

    offensive       0.78      0.75      0.77       747
not_offensive       0.79      0.82      0.80       853

     accuracy                           0.79      1600
    macro avg       0.79      0.78      0.78      1600
 weighted avg       0.79      0.79      0.79      1600

SVM
               precision    recall  f1-score   support

    offensive       0.77      0.75      0.76       747
not_offensive       0.79      0.81      0.80       853

     accuracy                           0.78      1600
    macro avg       0.78      0.78      0.78      1600
 weighted avg       0.78      0.78      0.78   

  0%|          | 0/5 [00:00<?, ?it/s]

NB
               precision    recall  f1-score   support

    offensive       0.77      0.23      0.35       555
not_offensive       0.70      0.96      0.81      1045

     accuracy                           0.71      1600
    macro avg       0.74      0.60      0.58      1600
 weighted avg       0.73      0.71      0.65      1600

LR
               precision    recall  f1-score   support

    offensive       0.72      0.56      0.63       555
not_offensive       0.79      0.88      0.83      1045

     accuracy                           0.77      1600
    macro avg       0.75      0.72      0.73      1600
 weighted avg       0.77      0.77      0.76      1600

SVM
               precision    recall  f1-score   support

    offensive       0.72      0.54      0.62       555
not_offensive       0.78      0.89      0.83      1045

     accuracy                           0.77      1600
    macro avg       0.75      0.71      0.72      1600
 weighted avg       0.76      0.77      0.76   

  0%|          | 0/5 [00:00<?, ?it/s]

NB
               precision    recall  f1-score   support

not_offensive       0.73      0.39      0.51       697
    offensive       0.66      0.89      0.75       903

     accuracy                           0.67      1600
    macro avg       0.69      0.64      0.63      1600
 weighted avg       0.69      0.67      0.65      1600

LR
               precision    recall  f1-score   support

not_offensive       0.65      0.50      0.56       697
    offensive       0.67      0.80      0.73       903

     accuracy                           0.67      1600
    macro avg       0.66      0.65      0.65      1600
 weighted avg       0.66      0.67      0.66      1600

SVM
               precision    recall  f1-score   support

not_offensive       0.67      0.48      0.56       697
    offensive       0.67      0.82      0.74       903

     accuracy                           0.67      1600
    macro avg       0.67      0.65      0.65      1600
 weighted avg       0.67      0.67      0.66   

# Char Level

In [76]:
args = {'feature_level':'char',
        'clf':'LR',
        'language':None,
        'ngram_range':(1,3),
        'max_features':53000,
        
       }

for lang in languages:
    args['language'] = lang

    print(args['language'])
    for m in tqdm(models):

        with open(f"processed_data/{args['language']}_processed.pkl", "rb") as f:
            _data = pickle.load(f)

        clf = train(m,args['ngram_range'],args['feature_level'],args['max_features'],_data,list(_data['label_dict'].keys()))

        model_name = args['language']+'_'+m+'_'+args["feature_level"]+'.joblib'
        dump(clf, f'output/{model_name}') 


english_1


  0%|          | 0/5 [00:00<?, ?it/s]

NB
               precision    recall  f1-score   support

    offensive       0.83      1.00      0.91      1328
not_offensive       0.77      0.04      0.07       272

     accuracy                           0.83      1600
    macro avg       0.80      0.52      0.49      1600
 weighted avg       0.82      0.83      0.77      1600

LR
               precision    recall  f1-score   support

    offensive       0.93      0.98      0.95      1328
not_offensive       0.86      0.65      0.74       272

     accuracy                           0.92      1600
    macro avg       0.90      0.82      0.85      1600
 weighted avg       0.92      0.92      0.92      1600

SVM
               precision    recall  f1-score   support

    offensive       0.92      0.98      0.95      1328
not_offensive       0.85      0.61      0.71       272

     accuracy                           0.92      1600
    macro avg       0.89      0.79      0.83      1600
 weighted avg       0.91      0.92      0.91   

  0%|          | 0/5 [00:00<?, ?it/s]

NB
               precision    recall  f1-score   support

    offensive       0.67      0.73      0.70       747
not_offensive       0.74      0.69      0.72       853

     accuracy                           0.71      1600
    macro avg       0.71      0.71      0.71      1600
 weighted avg       0.71      0.71      0.71      1600

LR
               precision    recall  f1-score   support

    offensive       0.76      0.74      0.75       747
not_offensive       0.78      0.80      0.79       853

     accuracy                           0.77      1600
    macro avg       0.77      0.77      0.77      1600
 weighted avg       0.77      0.77      0.77      1600

SVM
               precision    recall  f1-score   support

    offensive       0.77      0.72      0.75       747
not_offensive       0.77      0.81      0.79       853

     accuracy                           0.77      1600
    macro avg       0.77      0.77      0.77      1600
 weighted avg       0.77      0.77      0.77   

  0%|          | 0/5 [00:00<?, ?it/s]

NB
               precision    recall  f1-score   support

    offensive       0.86      0.10      0.18       555
not_offensive       0.67      0.99      0.80      1045

     accuracy                           0.68      1600
    macro avg       0.77      0.55      0.49      1600
 weighted avg       0.74      0.68      0.59      1600

LR
               precision    recall  f1-score   support

    offensive       0.74      0.55      0.63       555
not_offensive       0.79      0.90      0.84      1045

     accuracy                           0.78      1600
    macro avg       0.77      0.72      0.74      1600
 weighted avg       0.77      0.78      0.77      1600

SVM
               precision    recall  f1-score   support

    offensive       0.76      0.52      0.62       555
not_offensive       0.78      0.91      0.84      1045

     accuracy                           0.78      1600
    macro avg       0.77      0.72      0.73      1600
 weighted avg       0.78      0.78      0.77   

  0%|          | 0/5 [00:00<?, ?it/s]

NB
               precision    recall  f1-score   support

not_offensive       0.88      0.31      0.46       697
    offensive       0.64      0.97      0.77       903

     accuracy                           0.68      1600
    macro avg       0.76      0.64      0.61      1600
 weighted avg       0.75      0.68      0.64      1600

LR
               precision    recall  f1-score   support

not_offensive       0.75      0.62      0.68       697
    offensive       0.74      0.84      0.79       903

     accuracy                           0.74      1600
    macro avg       0.74      0.73      0.73      1600
 weighted avg       0.74      0.74      0.74      1600

SVM
               precision    recall  f1-score   support

not_offensive       0.73      0.62      0.67       697
    offensive       0.74      0.82      0.78       903

     accuracy                           0.74      1600
    macro avg       0.74      0.72      0.73      1600
 weighted avg       0.74      0.74      0.73   

# Next

- Implement Preprocessing Pipeline - Done
- Transform and  new english dataset - Done
- Work on CNN/RNN - To Do
- Work on CNN/RNN with encodings - To Do
- Work on Bert - To Do
- Test if translating languages and increasing dataset size helps - To Do