This Notebook is used to compute the accuracy of Traditional Machine Learning approaches on the Asian Prejudice dataset and the R-8 dataset.

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec #Word2Vec is mostly used for huge datasets

In [2]:
import pandas as pd
  
# read text file into pandas DataFrame and
# create header
df = pd.read_csv("twitter_asian_prejudice_sentences_clean.txt", header=None)
df.columns = ['a']
# display DataFrame
print(df)

                                                       a
0        cnn doubt china female hashtag_eastasia hashtag
1      hashtag_eastasia happening behind live stream ...
2                                         afraid hashtag
3              rt mugisalty everybody wear masks hashtag
4      rt makes remember sad days 2003 china covered ...
...                                                  ...
19994  marcorubio everything tweet true except one th...
19995  gotta love bad chinese russian trolls try piss...
19996  nytimes guess china checks must pretty impress...
19997  hashtag_eastasia communist country led communi...
19998  zlj517 hey hashtag_eastasia_virus learn write ...

[19999 rows x 1 columns]


In [3]:
df_label = pd.read_csv("twitter_asian_prejudice_labels.txt", header=None)
df_label.columns = ['b']

df_label

Unnamed: 0,b
0,none_of_the_above
1,none_of_the_above
2,none_of_the_above
3,none_of_the_above
4,entity_directed_hostility
...,...
19995,entity_directed_hostility
19996,entity_directed_hostility
19997,entity_directed_hostility
19998,entity_directed_criticism


In [4]:
#from google.colab import files
#result.to_csv('data.csv', encoding = 'utf-8-sig') 
#files.download('data.csv')



In [5]:
result = pd.concat([df, df_label[:-1]], axis=1)
result = result[result.b != 'counter_speech']


In [6]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
vectorizer = CountVectorizer()
df1 = vectorizer.fit_transform(result['a'])

vectorizer1 = TfidfVectorizer()
df2 = vectorizer1.fit_transform(result['a'])

In [7]:
from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(df1,result['b'],test_size=0.2,shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(df2,result['b'],test_size=0.2,shuffle=True)

In [8]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix,roc_curve,auc 

lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')



lr_tfidf.fit(X_train, y_train)  #model

#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test)
y_prob = lr_tfidf.predict_proba(X_test)[:,1]
 

print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))

'''
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)  '''

                                   precision    recall  f1-score   support

discussion_of_eastasian_prejudice       0.14      0.04      0.06       179
        entity_directed_criticism       0.05      0.01      0.02       279
        entity_directed_hostility       0.23      0.13      0.16       799
                none_of_the_above       0.69      0.87      0.77      2720

                         accuracy                           0.62      3977
                        macro avg       0.28      0.26      0.25      3977
                     weighted avg       0.53      0.62      0.56      3977

Confusion Matrix: [[   7    2   15  155]
 [   3    3   35  238]
 [  11   17  100  671]
 [  30   36  288 2366]]


"\nfpr, tpr, thresholds = roc_curve(y_test, y_prob)\nroc_auc = auc(fpr, tpr)\nprint('AUC:', roc_auc)  "

In [9]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
#Predict y value for test dataset
y_predict = mnb.predict(X_test)

print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                   precision    recall  f1-score   support

discussion_of_eastasian_prejudice       0.00      0.00      0.00       179
        entity_directed_criticism       0.00      0.00      0.00       279
        entity_directed_hostility       0.47      0.01      0.02       799
                none_of_the_above       0.68      1.00      0.81      2720

                         accuracy                           0.68      3977
                        macro avg       0.29      0.25      0.21      3977
                     weighted avg       0.56      0.68      0.56      3977

Confusion Matrix: [[   0    0    0  179]
 [   0    0    0  279]
 [   0    0    8  791]
 [   0    0    9 2711]]


In [10]:
from sklearn.ensemble import RandomForestClassifier

mnb = RandomForestClassifier()
mnb.fit(X_train, y_train)
#Predict y value for test dataset
y_predict = mnb.predict(X_test)

print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))

                                   precision    recall  f1-score   support

discussion_of_eastasian_prejudice       0.00      0.00      0.00       179
        entity_directed_criticism       0.00      0.00      0.00       279
        entity_directed_hostility       0.28      0.02      0.03       799
                none_of_the_above       0.69      0.99      0.81      2720

                         accuracy                           0.68      3977
                        macro avg       0.24      0.25      0.21      3977
                     weighted avg       0.52      0.68      0.56      3977

Confusion Matrix: [[   0    1    2  176]
 [   0    0    2  277]
 [   5    2   13  779]
 [   3    6   30 2681]]


In [11]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)
#Predict y value for test dataset
y_predict = svc.predict(X_test)

print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))

                                   precision    recall  f1-score   support

discussion_of_eastasian_prejudice       0.00      0.00      0.00       179
        entity_directed_criticism       0.00      0.00      0.00       279
        entity_directed_hostility       0.53      0.01      0.02       799
                none_of_the_above       0.69      1.00      0.81      2720

                         accuracy                           0.68      3977
                        macro avg       0.30      0.25      0.21      3977
                     weighted avg       0.58      0.68      0.56      3977

Confusion Matrix: [[   0    0    1  178]
 [   0    0    1  278]
 [   0    1    8  790]
 [   0    0    5 2715]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
import pandas as pd
  
  
# read text file into pandas DataFrame and
# create header
df3 = pd.read_csv("r8_sentences_clean.txt", header=None)
df3.columns = ['a']
# display DataFrame
print(df3)

                                                      a
0     champion products approves stock split champio...
1     computer terminal systems completes sale compu...
2     inc year net shr cts vs dlrs net vs assets mln...
3     international inc nd qtr jan oper shr loss two...
4     brown forman inc th qtr net shr one dlr vs cts...
...                                                 ...
7669  balladur maintenance louvre accords french fin...
7670  philippine trade gap widens january august phi...
7671  iran soviet union swap crude refined products ...
7672  n z chase corp makes offer entregrowth chase c...
7673  japan india conference cuts gulf war risk char...

[7674 rows x 1 columns]


In [13]:
df4 = pd.read_csv("r8_labels.txt", sep="\t", header=None)
df4.columns = ['b','c','d']
# display DataFrame
print(df4)

         b      c         d
0        0  train      earn
1        1  train       acq
2        2  train      earn
3        3  train      earn
4        4  train      earn
...    ...    ...       ...
7669  7669   test  money-fx
7670  7670   test     trade
7671  7671   test     crude
7672  7672   test       acq
7673  7673   test      ship

[7674 rows x 3 columns]


In [14]:
result2 = pd.concat([df3, df4], axis=1)
result2 = result2.drop('b', axis=1)
result3 = result2.drop('c', axis=1)
vectorizer = CountVectorizer()

df5 = vectorizer.fit_transform(result3['a'])
#X_train, X_test, y_train, y_test = train_test_split(df1,result['b'],test_size=0.2,shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(df5,result3['d'],test_size=0.2,shuffle=True)


In [15]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')



lr_tfidf.fit(X_train, y_train)  #model

#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test)
y_prob = lr_tfidf.predict_proba(X_test)[:,1]
 

print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))


              precision    recall  f1-score   support

         acq       0.97      0.99      0.98       460
       crude       1.00      0.93      0.96        73
        earn       0.99      0.99      0.99       776
       grain       1.00      0.73      0.84        11
    interest       0.86      0.93      0.89        55
    money-fx       0.91      0.83      0.87        64
        ship       0.96      0.90      0.93        29
       trade       0.93      0.97      0.95        67

    accuracy                           0.97      1535
   macro avg       0.95      0.91      0.93      1535
weighted avg       0.97      0.97      0.97      1535

Confusion Matrix: [[456   0   3   0   0   1   0   0]
 [  4  68   0   0   0   0   1   0]
 [  9   0 767   0   0   0   0   0]
 [  0   0   0   8   1   1   0   1]
 [  0   0   0   0  51   3   0   1]
 [  1   0   2   0   7  53   0   1]
 [  1   0   0   0   0   0  26   2]
 [  1   0   1   0   0   0   0  65]]


In [16]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
#Predict y value for test dataset
y_predict = mnb.predict(X_test)

print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))

              precision    recall  f1-score   support

         acq       0.91      0.98      0.95       460
       crude       0.92      0.99      0.95        73
        earn       1.00      0.93      0.96       776
       grain       0.89      0.73      0.80        11
    interest       0.76      0.85      0.80        55
    money-fx       0.81      0.81      0.81        64
        ship       0.93      0.97      0.95        29
       trade       0.88      0.96      0.91        67

    accuracy                           0.94      1535
   macro avg       0.89      0.90      0.89      1535
weighted avg       0.95      0.94      0.94      1535

Confusion Matrix: [[452   1   1   0   0   1   1   4]
 [  1  72   0   0   0   0   0   0]
 [ 40   4 724   1   4   1   1   1]
 [  0   0   0   8   1   0   0   2]
 [  0   0   0   0  47   7   0   1]
 [  1   0   0   0  10  52   0   1]
 [  0   1   0   0   0   0  28   0]
 [  0   0   0   0   0   3   0  64]]


In [17]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
#Predict y value for test dataset
y_predict = rfc.predict(X_test)

print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))

              precision    recall  f1-score   support

         acq       0.89      0.99      0.93       460
       crude       0.98      0.84      0.90        73
        earn       0.99      0.97      0.98       776
       grain       1.00      0.27      0.43        11
    interest       0.84      0.84      0.84        55
    money-fx       0.91      0.77      0.83        64
        ship       1.00      0.52      0.68        29
       trade       0.89      0.94      0.91        67

    accuracy                           0.94      1535
   macro avg       0.94      0.77      0.81      1535
weighted avg       0.94      0.94      0.94      1535

Confusion Matrix: [[455   0   5   0   0   0   0   0]
 [ 12  61   0   0   0   0   0   0]
 [ 24   0 752   0   0   0   0   0]
 [  3   0   1   3   1   0   0   3]
 [  3   0   1   0  46   3   0   2]
 [  4   0   2   0   8  49   0   1]
 [ 11   1   0   0   0   0  15   2]
 [  2   0   0   0   0   2   0  63]]


In [18]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)
#Predict y value for test dataset
y_predict = svc.predict(X_test)

print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))

              precision    recall  f1-score   support

         acq       0.79      0.99      0.88       460
       crude       1.00      0.81      0.89        73
        earn       0.99      0.95      0.97       776
       grain       1.00      0.18      0.31        11
    interest       0.87      0.71      0.78        55
    money-fx       1.00      0.52      0.68        64
        ship       0.95      0.66      0.78        29
       trade       0.96      0.69      0.80        67

    accuracy                           0.91      1535
   macro avg       0.94      0.69      0.76      1535
weighted avg       0.92      0.91      0.90      1535

Confusion Matrix: [[457   0   3   0   0   0   0   0]
 [ 13  59   0   0   0   0   1   0]
 [ 38   0 738   0   0   0   0   0]
 [  7   0   1   2   0   0   0   1]
 [ 14   0   1   0  39   0   0   1]
 [ 24   0   1   0   6  33   0   0]
 [  8   0   2   0   0   0  19   0]
 [ 20   0   1   0   0   0   0  46]]
