In [77]:
import jieba

In [78]:
import os
import pickle
import argparse
# For text preprocessing
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
from wordsegment import segment, load
import sys
from sklearn.model_selection import train_test_split, KFold
from datasets import load_dataset
import pandas as pd
from tqdm.auto import tqdm
import pickle
import os

sys.setrecursionlimit(10000)


In [79]:
load()

# English

In [80]:
def text_preprocess(text, tknzr):
    FLAGS = re.MULTILINE | re.DOTALL
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    # print(text)
    text = re_sub(r"#\S+", lambda hashtag: " ".join(segment(hashtag.group()[1:]))) # segment hastags
    # text = text.replace('#','')
    # print(text)
    # exit()

    tokens = tknzr.tokenize(text.lower())
    return " ".join(tokens)

def concat_data(id2entities):
    # 	with open(dir_path+name, "rb") as f:
    # 		id2entities = pickle.load(f)

    ########## Lookup Tables ##########
    labels = list(set([entity[0] for entity in id2entities.values()]))
    num_classes = len(labels)
    
    

    label_lookup = np.zeros((num_classes,num_classes),int)
    np.fill_diagonal(label_lookup, 1)
    ###################################

    text_data, context_data, label_data = [], [], []
    label_dict = {}
    for i, label in enumerate(labels):
        label_dict[label] = i

    # 	load()
    tknzr = TweetTokenizer(reduce_len=True, preserve_case=False, strip_handles=False)
    # 	print("Preprocessing tweets.....")
    for _id in tqdm(id2entities):
        if id2entities[_id][0] in label_dict.keys():
            text_data.append(text_preprocess(id2entities[_id][1], tknzr))
            context_data.append(text_preprocess(id2entities[_id][2], tknzr))

            label_data.append(label_lookup[ label_dict[id2entities[_id][0]] ])

    assert len(text_data) == len(context_data) == len(label_data)

    return text_data, context_data, label_data,label_dict


In [81]:
import os
import sys
import numpy as np
import argparse
import pickle
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline

from joblib import dump, load

import warnings
from sklearn.metrics import classification_report

import pandas as pd
import functools

sys.path.insert(1, os.path.join(sys.path[0], '..'))
# import helper
import time

import argparse
import re


def preprocessSplit(processed_dict,lang):
    _text, _ctxt, _label,label_dict = concat_data(processed_dict)
    text_train, text_test, label_train, label_test = train_test_split(_text, _label,
                                                    stratify=_label,
                                                    test_size=0.2)
    
    _data = {"text_train": text_train,
                     "label_train": label_train,
                     "text_test": text_test,
                     "label_test": label_test,
                     "label_dict":label_dict
            }
    
    with open(f"processed_data/{lang}_processed.pkl", "wb") as f:
        pickle.dump(_data, f)
        
    print(label_dict)
        
    return _data


In [82]:
all_class_dict ={'abusive':'offensive',
 'hate':'offensive',
 'hateful':'offensive',
 'neither':'not_offensive',
 'non-sexist':'not_offensive',
 'none':'not_offensive',
 'normal':'not_offensive',
 'not_hate':'not_offensive',
 'offensive':'offensive',
 'sexist':'offensive'}

# English

## Loading Data and Formatting for Preprocessing Pipeline

In [83]:
import random

In [84]:
def getLimited(pd,ratio=3):
    keys = random.sample(list(pd), 8000)
    nc = {}
    for c,k in enumerate(keys):
        nc[c] = pd[k]
    return nc

In [86]:
dataset = load_dataset('hate_speech_offensive', 'mrpc', split='train')

class_list = []
for i in dataset:
    class_list.append(i['class'])

pd.Series(class_list).value_counts()

processed_dict = {}

class_dict = {0:'hate',1:'offensive',2:'neither'}

labels = []

for c,i in tqdm(enumerate(dataset),total=len(dataset)):
    processed_dict[c] = [all_class_dict[class_dict[i['class']]],i['tweet'],'']
    
    labels.append(class_dict[i['class']])

# Preprocess + Train test Split + Save to file
spl = preprocessSplit(processed_dict,'english_1')

Using custom data configuration mrpc
Reusing dataset hate_speech_offensive (/Users/rehanahmed/.cache/huggingface/datasets/hate_speech_offensive/mrpc/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5)


  0%|          | 0/24783 [00:00<?, ?it/s]

  0%|          | 0/24783 [00:00<?, ?it/s]

{'offensive': 0, 'not_offensive': 1}


In [87]:
len(labels)

24783

In [88]:
pd.Series(labels).value_counts()

offensive    19190
neither       4163
hate          1430
dtype: int64

# Filipino

In [90]:
dataset = load_dataset('hate_speech_filipino', 'mrpc')

class_list = []
for j in ['train','test','validation']:
    for i in dataset[j]:
        class_list.append(i['label'])

pd.Series(class_list).value_counts()

processed_dict = {}

class_dict = {0:'not_hate',1:'hate'}

c=0
labels = []
for j in tqdm(['train','test','validation']):
    for i in (dataset[j]):
        processed_dict[c] = [all_class_dict[class_dict[i['label']]],i['text'],'']
        c = c+1
        labels.append(class_dict[i['label']])
        
    
sp = preprocessSplit(processed_dict,'filipino')

Using custom data configuration mrpc
Reusing dataset hate_speech_filipino (/Users/rehanahmed/.cache/huggingface/datasets/hate_speech_filipino/mrpc/1.0.0/89001ab1965f35d6d74585e59f982bbdd09c82a645bf702f32a52ad95404dd83)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/24232 [00:00<?, ?it/s]

{'offensive': 0, 'not_offensive': 1}


In [91]:
len(labels)

24232

In [92]:
pd.Series(labels).value_counts()

not_hate    12979
hate        11253
dtype: int64

# Chinese 

In [69]:
my_ls = ['<url>','<user>','<smile>','<lolface>','<sadface>','<neutralface>','<heart>','<number>','<repeat>','<elong>']
e_ls = list(map(lambda a:a.replace('<','').replace('>',''),my_ls))

In [70]:
def text_preprocess(text, tknzr):
    FLAGS = re.MULTILINE | re.DOTALL
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    # print(text)
    text = re_sub(r"#\S+", lambda hashtag: " ".join(segment(hashtag.group()[1:]))) # segment hastags
    # text = text.replace('#','')
    # print(text)
    # exit()

#     tokens = tknzr.tokenize(text.lower())
    tokens = jieba.lcut(text, cut_all=False)
    ret_text = " ".join(tokens)

    for i in e_ls:
        if i in ret_text:
            ret_text = ret_text.replace('< '+i+' >','<'+i+'>')


    return ret_text


### Place file SexComment.csv in Raw_datasets

In [104]:
!ls raw_datasets | grep SexComment.csv

SexComment.csv


In [93]:
df = pd.read_csv('raw_datasets/SexComment.csv')

df.head()

processed_dict = {}

labels = []

class_dict = {0:'non-sexist',1:'sexist'}

for index,row in tqdm(df.iterrows()):
    processed_dict[index] = [all_class_dict[class_dict[row['label']]],row['comment_text'],'']
    labels.append(class_dict[row['label']])

    
# processed_dict =   getLimited(processed_dict)

sp = preprocessSplit(processed_dict,'chinese')

0it [00:00, ?it/s]

  0%|          | 0/8969 [00:00<?, ?it/s]

{'offensive': 0, 'not_offensive': 1}


In [94]:
len(labels)

8969

In [95]:
pd.Series(labels).value_counts()

non-sexist    5876
sexist        3093
dtype: int64

# Korean

In [96]:
def text_preprocess(text, tknzr):
    FLAGS = re.MULTILINE | re.DOTALL
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    
    # print(text)
    text = re_sub(r"#\S+", lambda hashtag: " ".join(segment(hashtag.group()[1:]))) # segment hastags
    # text = text.replace('#','')
    # print(text)
    # exit()

    # tokens = tknzr.tokenize(text.lower())
    tokens = hannanum.morphs(text)
    ret_text = " ".join(tokens)

    for i in e_ls:
        if i in ret_text:
            ret_text = ret_text.replace('< '+i+' >','<'+i+'>')


    return ret_text


In [99]:

import koco

train_dev = koco.load_dataset('korean-hate-speech', mode='train_dev')
# test_dev = koco.load_dataset('korean-hate-speech', mode='test')

processed_dict = {}

labels = []

c = 0
for i in train_dev['train']:
    processed_dict[c] = [all_class_dict[i['hate']],i['comments'],'']
    c=c+1
    labels.append(i['hate'])
for i in train_dev['dev']:
    processed_dict[c] = [all_class_dict[i['hate']],i['comments'],'']
    c=c+1
    labels.append(i['hate'])

# sp = preprocessSplit(processed_dict,'korean')

In [100]:
len(labels)

8367

In [101]:
pd.Series(labels).value_counts()

none         3646
offensive    2688
hate         2033
dtype: int64