In [25]:
import os
import json
import tweepy
import numpy as np
import pandas as pd
from statistics import mode

# Monolingual Dataset preparation

### Function to read Ousidhoum Data Files

In [2]:
def Ousidhoum_data(Directory, file_name):
    file_name = os.path.join(Directory, file_name)
    df_Ousidhoum = pd.read_csv(file_name)
    hateful_df = df_Ousidhoum[df_Ousidhoum['sentiment'].str.contains("hateful")]
    normal_df = df_Ousidhoum[df_Ousidhoum['sentiment']=='normal']
    df_Ousidhoum = pd.concat([hateful_df, normal_df], axis =0)
    df_Ousidhoum.drop(['HITId','directness', 'annotator_sentiment', 'target','group' ], axis =1, inplace= True)
    df_Ousidhoum.columns = ['tweet','label']
    df_Ousidhoum.label[df_Ousidhoum.label!='normal'] =1
    df_Ousidhoum.label[df_Ousidhoum.label=='normal'] =0
    print(df_Ousidhoum.label.value_counts())
    df_Ousidhoum=df_Ousidhoum.reset_index(drop= True)
    return df_Ousidhoum

# English 

### Gilbert et al

In [3]:
filename_df = pd.read_csv(r"C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\English\Gilbert et al\annotations_metadata.csv")
Directory = r"C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\English\Gilbert et al\all_files"

def Gilbert_dataprep(Directory, filename_df):
    data = []
    for file_name in filename_df['file_id']:
        file_name = file_name+".txt"
        file_name = os.path.join(Directory, file_name)
        with open (file_name,'r', encoding= 'utf-8') as infile:
             line = infile.readlines()
             data.append(line)
    df = pd.DataFrame(data,columns=['Tweets'])
    df = pd.concat([df,filename_df],axis =1)
    df.drop(['file_id', 'user_id','subforum_id', 'num_contexts'], axis =1, inplace= True)
    return df

df_Gilbert = Gilbert_dataprep(Directory, filename_df)
df_Gilbert.columns =['tweet', 'label']
df_Gilbert.label[df_Gilbert.label=='hate'] =1
df_Gilbert.label[df_Gilbert.label=='noHate'] =0
df_Gilbert = df_Gilbert[df_Gilbert['label'] != 'relation']
df_Gilbert = df_Gilbert[df_Gilbert['label'] != 'idk/skip']
print(df_Gilbert.label.value_counts())
df_Gilbert.head()

0    9507
1    1196
Name: label, dtype: int64


Unnamed: 0,tweet,label
0,"As of March 13th , 2014 , the booklet had been...",0
1,In order to help increase the booklets downloa...,0
2,( Simply copy and paste the following text int...,0
3,Click below for a FREE download of a colorfull...,1
4,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,0


###  Davidson et al

In [4]:
df_Davidson = pd.read_csv(r"C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\English\Davidson et al.csv")
df_Davidson.hate_speech[df_Davidson['hate_speech'] > 1] = 1
df_Davidson = df_Davidson[['tweet', 'hate_speech']]
df_Davidson.columns = ['tweet', 'label']
print(df_Davidson.label.value_counts())
df_Davidson.head()

0    19789
1     4993
Name: label, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,tweet,label
0,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,0
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,0
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,0
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,0


### Basile et al

In [5]:
train = pd.read_csv(r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\English\Basile et al\hateval2019_en_train.csv')
dev = pd.read_csv(r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\English\Basile et al\hateval2019_en_dev.csv')
test = pd.read_csv(r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\English\Basile et al\hateval2019_en_test.csv')

df_Basile = pd.concat([train,dev,test], axis = 0)
df_Basile.drop(['id','TR','AG'], axis = 1, inplace = True)
df_Basile.columns = ['tweet','label']
print(df_Basile.label.value_counts())
df_Basile.head()

0    7530
1    5470
Name: label, dtype: int64


Unnamed: 0,tweet,label
0,"Hurray, saving us $$$ in so many ways @potus @...",1
1,Why would young fighting age men be the vast m...,1
2,@KamalaHarris Illegals Dump their Kids at the ...,1
3,NY Times: 'Nearly All White' States Pose 'an A...,0
4,Orban in Brussels: European leaders are ignori...,0


### Ousidhoum et al

In [6]:
Directory_English = r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\English\Ousidhoum et al'
Ousidhoum_English = 'en_dataset.csv'
df_Ousidhoum_English = Ousidhoum_data(Directory_English, Ousidhoum_English)

1    1278
0     661
Name: label, dtype: int64


### Waseem et al

In [7]:
def Waseem_dataset(Directory, file_name):
    file_name = os.path.join(Directory, file_name)
    df = pd.read_json(file_name, lines = True)
    df_Waseem_English= pd.DataFrame()
    df_Waseem_English['tweet'] = df['text']
    df_Waseem_English['label'] = df['Annotation']
    return df_Waseem_English

In [8]:
Directory = r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\English\Waseem et al'
filename_racism = 'racism.json'
filename_sexism = 'sexism.json'
filename_neither = 'neither.json'
df_racism = Waseem_dataset(Directory, filename_racism)
df_sexism = Waseem_dataset(Directory, filename_sexism)
df_neither = Waseem_dataset(Directory, filename_neither)

In [9]:
df_Waseem=pd.concat([df_racism, df_sexism, df_neither], axis =0)
df_Waseem.label[df_Waseem.label!='none'] =1
df_Waseem.label[df_Waseem.label=='none'] =0
df_Waseem = df_Waseem.reset_index(drop = True)
print(df_Waseem.label.value_counts())
df_Waseem

0    11501
1     5406
Name: label, dtype: int64


Unnamed: 0,tweet,label
0,So Drasko just said he was impressed the girls...,1
1,Drasko they didn't cook half a bird you idiot ...,1
2,Hopefully someone cooks Drasko in the next ep ...,1
3,of course you were born in serbia...you're as ...,1
4,These girls are the equivalent of the irritati...,1
...,...,...
16902,RT @JakeM_1998: RT BillSpindle: It's all about...,0
16903,RT @ThinkAgain_DOS: Iraq: #ISIS sets off 21 ca...,0
16904,RT @ThePatriot143: DEAR STATE DEPARTMENT: WHER...,0
16905,"""@panelrific: Let's go 🐧🐧🐧🐧🐧🐧😃""",0


### Founta et al

In [10]:
# df = pd.read_csv(r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\English\Founta et al.csv')

# CONSUMER_KEY = "XXXXX"
# CONSUMER_SECRET = "XXXXX"
# OAUTH_TOKEN = "XXXXX"
# OAUTH_TOKEN_SECRET = "XXXXX"
# auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
# auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
# api = tweepy.API(auth)

# tweets = []
# tweet_ids = []
# for i in df.tweet_id:
#     try:
#         tweet = api.get_status(i)
#         tweets.append(tweet.text)
#         tweet_ids.append(i)
#     except tweepy.error.TweepError:
#         #print("tweet not found")
#         continue

In [11]:
# df1 = pd.DataFrame()
# df1['tweets'] = tweets
# df1['tweet_id'] = tweet_ids

# final_df = df1.merge(df, on='tweet_id', how='inner')
# final_df.to_csv(r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\English\df_Founta.csv')

In [16]:
df_Founta = pd.read_csv(r"C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\English\Founta et al\df_Founta.csv")
df_Founta.drop(['Unnamed: 0','tweet_id'], axis = 1, inplace = True)
df_Founta = df_Founta[(df_Founta['label'] == 'hateful') | (df_Founta['label'] == 'normal')]
df_Founta.label[df_Founta.label!='normal'] =1
df_Founta.label[df_Founta.label=='normal'] =0
print(df_Founta.label.value_counts())
df_Founta

0    36087
1     2008
Name: label, dtype: int64


Unnamed: 0,tweet,label
2,RT @MailOnline: The Nazi death gas so horrific...,0
3,I hate er chase because if the Bitch that work...,1
6,RT @nyctophil3: Pineapples do not belong on pi...,1
9,Niggas keep talking about women wearing weave ...,1
12,Carlos Correa had gyalchester as his walkup mu...,0
...,...,...
52426,RT @prozdkp: when you're mad at video games bu...,0
52429,"my spring break starts this saturday, i cant f...",0
52445,@Sushree_Metal Yaa dats a point ishu ko pata j...,0
52456,"If it didn't work out, then it didn't work out...",0


### Combining all the english processed datasets

In [17]:
df_English = pd.concat([df_Gilbert,df_Davidson, df_Basile,df_Ousidhoum_English, df_Waseem, df_Founta], axis =0)
df_English = df_English.reset_index(drop= True)
print(df_English.label.value_counts())
df_English

0    85075
1    20351
Name: label, dtype: int64


Unnamed: 0,tweet,label
0,"As of March 13th , 2014 , the booklet had been...",0
1,In order to help increase the booklets downloa...,0
2,( Simply copy and paste the following text int...,0
3,Click below for a FREE download of a colorfull...,1
4,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,0
...,...,...
105421,RT @prozdkp: when you're mad at video games bu...,0
105422,"my spring break starts this saturday, i cant f...",0
105423,@Sushree_Metal Yaa dats a point ishu ko pata j...,0
105424,"If it didn't work out, then it didn't work out...",0


# Arabic

### Ousidhoum et al

In [13]:
Directory_Arabic = r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\Arabic'
Ousidhoum_Arabic = 'Ousidhoum et al.csv'
df_Ousidhoum_Arabic = Ousidhoum_data(Directory_Arabic, Ousidhoum_Arabic)

0    915
1    755
Name: label, dtype: int64


### Mulki et al

In [14]:
df_Mulki = pd.read_csv(r"C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\Arabic\Mulki et al.csv", sep = '\t' ,error_bad_lines=False)
df_Mulki.columns = ['tweet','label']
df_Mulki_normal  = df_Mulki[df_Mulki['label']== 'normal']
df_Mulki_hate  = df_Mulki[df_Mulki['label'] == 'hate']
df_Mulki = pd.concat([df_Mulki_normal,df_Mulki_hate], axis = 0)
df_Mulki.label[df_Mulki.label =='normal'] =0
df_Mulki.label[df_Mulki.label =='hate'] =1
print(df_Mulki.label.value_counts())
df_Mulki = df_Mulki.reset_index(drop =True)
df_Mulki

0    3650
1     468
Name: label, dtype: int64


Unnamed: 0,tweet,label
0,صديقي انت ابن جامعه اللعبه اكبر من داعش اللعبه...,0
1,و مصلحة لبنان تبدأ باستخراج النفط و الغاز لوقف...,0
2,يا جبران باسيل يا معلم يا ريس يا استاذ بدك حضن...,0
3,نصيحة احكي مع الرئيس ميشال عون او هاجم جبران ب...,0
4,مش عم يناموا الليل ليقدمولنا حكومة,0
...,...,...
4113,جبران باسيل عار على العرب ككل ماهو الا فرخ مجو...,1
4114,لبنان شامخ بعروبتو بأخواتو العرب بس الفرس ما ج...,1
4115,لا تنسى ان هذا الوهاب عنصري وحاقد وخادم أمين ل...,1
4116,إخزاك الله كنا مخدوعين فيك او انك أفدغ مصحوك ع...,1


In [15]:
df_Arabic = pd.concat([df_Mulki, df_Ousidhoum_Arabic], axis =0)
df_Arabic

Unnamed: 0,tweet,label
0,صديقي انت ابن جامعه اللعبه اكبر من داعش اللعبه...,0
1,و مصلحة لبنان تبدأ باستخراج النفط و الغاز لوقف...,0
2,يا جبران باسيل يا معلم يا ريس يا استاذ بدك حضن...,0
3,نصيحة احكي مع الرئيس ميشال عون او هاجم جبران ب...,0
4,مش عم يناموا الليل ليقدمولنا حكومة,0
...,...,...
1665,@user لا أعتبر بايرة حتى أصل ال30,0
1666,اقليم كردستان: ازدياد معدل العنف والتحرش الجنس...,0
1667,اكبر ثلاث كاذبين بل عالم اولهم من قال صوت المر...,0
1668,@user ي بعد جبدي انت . انا أحبك أكثر ي واحشني ...,0


# Spanish

### Pereira et al

In [16]:
tweets = []
labels = []
with open(r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\Spanish\Pereira et al\Pereira et al.txt', 'r', encoding = 'utf-8') as inp:
    lines = inp.readlines()
    for line in lines:
        line = str(line)
        label = line.split(",")[-1].strip()
        labels.append(label)
        tweet = (line.split(",")[:-1])
        tweets.append(tweet)     
df_Pereira = pd.DataFrame()
df_Pereira['tweet'] = tweets
df_Pereira['label'] = labels
print(df_Pereira.label.value_counts())
df_Pereira.head()

0    4433
1    1567
Name: label, dtype: int64


Unnamed: 0,tweet,label
0,[Ismael es egocentrico porque se vuelve loca s...,0
1,[..ya tardaba en salir quien pronunciase nombr...,0
2,"[(Esto no es un discurso político y razonado, ...",0
3,"[Muy despreciados, siiii, pero todos vestidos ...",1
4,[marica explicame porque a veces no te entiend...,1


### Basile et al

In [17]:
train = pd.read_csv(r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\Spanish\Basile et al\hateval2019_es_train.csv')
dev = pd.read_csv(r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\Spanish\Basile et al\hateval2019_es_dev.csv')
test = pd.read_csv(r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\Spanish\Basile et al\hateval2019_es_test.csv')

df_Basile_Spanish = pd.concat([train,dev,test], axis = 0)
df_Basile_Spanish.drop(['id','TR','AG'], axis = 1, inplace = True)
df_Basile_Spanish.columns = ['tweet','label']
print(df_Basile_Spanish.label.value_counts())
df_Basile_Spanish.head()

0    3861
1    2739
Name: label, dtype: int64


Unnamed: 0,tweet,label
0,Easyjet quiere duplicar el número de mujeres p...,1
1,El gobierno debe crear un control estricto de ...,1
2,Yo veo a mujeres destruidas por acoso laboral ...,0
3,"— Yo soy respetuoso con los demás, sólamente l...",0
4,Antonio Caballero y como ser de mal gusto e ig...,0


In [18]:
df_Spanish = pd.concat([df_Basile_Spanish, df_Pereira], axis =0)
df_Spanish

Unnamed: 0,tweet,label
0,Easyjet quiere duplicar el número de mujeres p...,1
1,El gobierno debe crear un control estricto de ...,1
2,Yo veo a mujeres destruidas por acoso laboral ...,0
3,"— Yo soy respetuoso con los demás, sólamente l...",0
4,Antonio Caballero y como ser de mal gusto e ig...,0
...,...,...
5995,"[Sea independentista o constitucionalista, la...",0
5996,[@Bernithedude Llave de gobierno de qué si hay...,0
5997,[@13Pardis @guenhwyvarblack Los carlinos son u...,0
5998,[Aplausos en el partido de Colau cuando anunci...,0


# French

In [19]:
Directory_French = r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\French'
Ousidhoum_French = 'Ousidhoum et al.csv'
df_French = Ousidhoum_data(Directory_French, Ousidhoum_French)

0    821
1    399
Name: label, dtype: int64


## Writing all the final files to CSV

In [14]:
def df_csv(directory, fname, df):
    fname = os.path.join(directory, fname)
    df.to_csv(fname, index = False)

In [26]:
path = r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\Final_Data'

df_csv(path, 'Arabic.csv', df_Arabic)
df_csv(path, 'English.csv', df_English)
df_csv(path, 'French.csv', df_French)
df_csv(path, 'Spanish.csv', df_Spanish)

# Multilingual Dataset preparation

In [31]:
en = pd.read_csv(r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\Final_Data\English.csv')
fr = pd.read_csv(r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\Final_Data\French.csv')
ar = pd.read_csv(r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\Final_Data\Arabic.csv')

In [57]:
en_fr = pd.concat([en, fr], axis =0).reset_index(drop=True)
fr_ar = pd.concat([ar, fr], axis =0).reset_index(drop=True)
en_ar = pd.concat([en, ar], axis =0).reset_index(drop=True)

path = r'C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\Multilingual data'

df_csv(path, 'en_fr.csv', en_fr)
df_csv(path, 'fr_ar.csv', fr_ar)
df_csv(path, 'en_ar.csv', en_ar)