In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import numpy as np

In [2]:
basepath = 'base_path/'

In [3]:
# founta_data = pd.read_csv('founta_full_data.csv')
# waseem_data = pd.read_csv('waseem_full_data.csv')
# razavai_data = pd.read_csv('razavai_full_data.csv')

##### 
founta total: 59321
founta hate: 8424
founta no hate: 50887

waseem total: 14143
waseem hate: 494
waseem no hate: 13649

razavai total: 1496
razavai hate: 475
razavai no hate: 1021

In [11]:
# df_waseem = pd.DataFrame(waseem_data)
# df_founta = pd.DataFrame(founta_data)
# df_razavai = pd.DataFrame(razavai_data)

In [6]:
## splitting across each dataset
waseem_train, waseem_test_val = train_test_split(df_waseem, test_size=0.2)
waseem_test, waseem_val = train_test_split(waseem_test_val, test_size=0.5)

razavai_train, razavai_test_val = train_test_split(df_razavai, test_size=0.2)
razavai_test, razavai_val = train_test_split(razavai_test_val, test_size=0.5)

founta_train, founta_test_val = train_test_split(df_founta, test_size=0.2)
founta_test, founta_val = train_test_split(founta_test_val, test_size=0.5)

In [8]:
## joining all data for stratified sets
df_train = pd.concat([waseem_train,founta_train,razavai_train])
df_val = pd.concat([waseem_val,founta_val,razavai_val])
df_test = pd.concat([waseem_test,founta_test,razavai_test])

In [12]:
## weighting the labeling
df_train['label'].value_counts()
df_train['weight'] = np.where(df_train['label']==0, 0.5, 2.5)
print(df_train[df_train['label']==0]['weight'].sum())
print(df_train[df_train['label']==1]['weight'].sum())

In [10]:
df_train.to_csv('train.csv', index=False)
df_val.to_csv('val.csv', index=False)
df_test.to_csv('test.csv', index=False)

In [102]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(df_train['text'])
x_val = vectorizer.transform(df_val['text'])
x_test = vectorizer.transform(df_test['text'])

In [108]:
x_train.shape

(59783, 96571)

In [109]:
## with n_est =1 this is essentially a decision of tree, but gives us access to max_features, etc.
base_estimator = RandomForestClassifier(n_estimators=1, max_depth=1, max_features='sqrt')
model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=1000)

In [110]:
model.fit(x_train, df_train['label'], df_train['weight'])

AdaBoostClassifier(base_estimator=RandomForestClassifier(max_depth=1,
                                                         max_features='sqrt',
                                                         n_estimators=1),
                   n_estimators=1000)

In [111]:
df_val['y_pred'] = model.predict(x_val)
temp1 = df_val[df_val['source']=='founta']['label']
temp2 = df_val[df_val['source']=='founta']['y_pred']
print(classification_report(temp1, temp2))

              precision    recall  f1-score   support

           0       0.89      0.81      0.85      5065
           1       0.27      0.42      0.33       847

    accuracy                           0.76      5912
   macro avg       0.58      0.62      0.59      5912
weighted avg       0.80      0.76      0.78      5912



In [112]:
temp1 = df_val[df_val['source']=='waseem']['label']
temp2 = df_val[df_val['source']=='waseem']['y_pred']
print(classification_report(temp1, temp2))

              precision    recall  f1-score   support

           0       0.97      0.84      0.90      1359
           1       0.05      0.23      0.08        53

    accuracy                           0.81      1412
   macro avg       0.51      0.53      0.49      1412
weighted avg       0.93      0.81      0.86      1412



In [113]:
temp1 = df_val[df_val['source']=='razavai']['label']
temp2 = df_val[df_val['source']=='razavai']['y_pred']
print(classification_report(temp1, temp2))

              precision    recall  f1-score   support

           0       0.63      0.67      0.65        95
           1       0.37      0.33      0.35        55

    accuracy                           0.55       150
   macro avg       0.50      0.50      0.50       150
weighted avg       0.54      0.55      0.54       150



In [13]:
## we could do this if we wanted to capture instances of a within a specified length
df_train[df_train['char count']<100]

Unnamed: 0,text,label,char count,source,weight
14058,Productive weekend! :thumbs_up::face_blowing_a...,0,52,waseem,0.5
427,"@ShariaIsJustice ""Yeap,"" Islam is for ignorant...",1,53,waseem,2.5
5899,#mkr I'm out people. Going to @Mr_Fanta_Pants ...,0,90,waseem,0.5
6826,"If Sheri and Emilie go ""home,"" im not watching...",0,71,waseem,0.5
3660,"RT @LouisRITHPotter I'm not ""sexist,"" but some...",0,77,waseem,0.5
...,...,...,...,...,...
748,that decision of yours to stop writing can you...,0,65,razavai,0.5
854,"NOS = No One wanted it then , and it is old Sh...",0,58,razavai,0.5
456,and what is your name asshole \n,1,31,razavai,2.5
1155,would you be helpful and share with all of us ...,0,69,razavai,0.5


In [14]:
# print(df_razavai['char count'].mean())
# print(df_razavai['char count'].median())
# print(df_razavai['char count'].min())
# print(df_razavai['char count'].max())