In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import pandas as pd
import string
import re
import numpy as np
from collections import Counter
import unicodedata
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import spacy
from gensim.utils import simple_preprocess
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
import pickle
import json
path = "/content/drive/My Drive/cs 583/sentiment analysis/"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
%%capture
!pip install transformers
!pip install evaluate
!pip install datasets #restart

# Pre-process data: execute once

In [None]:
dfo = pd.read_csv(path+"data/Obama.csv")
dfo['candidate']="obama"
dfr = pd.read_csv(path+"data/Romney.csv")
dfr['candidate']='romney'
df = pd.concat([dfo,dfr])
df.fillna('', inplace = True)
print(len(df))
df = df.drop_duplicates()
print(len(df))
df.head(2)

14398
14391


Unnamed: 0,date,time,tweet,class,candidate
0,10/16/12,10:28:53-05:00,"Kirkpatrick, who wore a baseball cap embroider...",0,obama
1,12/10/16,10:09:00-05:00,Question: If <e>Romney</e> and <e>Obama</e> ha...,2,obama


In [None]:
def preprocess(example):
  new_text = [] 
  for t in example.split(" "):
      t = '@user' if t.startswith('@') and len(t) > 1 else t
      t = 'http' if t.startswith('http') else t
      new_text.append(t)
  new_text = " ".join(new_text)
  return new_text

In [None]:
print(df['class'].value_counts()) # 0:neutral, 1:positive, -1:negative, 2: mixed
df = df[df['class'].isin(['0', '1' , '-1'])]
print(len(df))
df['class'] = df['class'].astype(int)
df = df.rename(columns={'class': 'sentiment'})
# print(df.head)
df = df.drop(columns=['time', 'date'], axis=1)
print(df['sentiment'].value_counts())

df['tweet'] = df.tweet.apply(preprocess)
df = df[~df.tweet.isna()]
df = df[df.tweet!='']
print(len(df))
# df.to_csv(path+"data/tweets.csv")

-1            4856
0             3657
2             2895
1             2753
!!!!           169
                34
irrevelant      23
IR               3
irrelevant       1
Name: class, dtype: int64
11266
-1    4856
 0    3657
 1    2753
Name: sentiment, dtype: int64
11265


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['class'] = df['class'].astype(int)


In [None]:
df[df.candidate=="romney"]['sentiment'].value_counts()

-1    2892
 0    1680
 1    1075
Name: sentiment, dtype: int64

In [None]:
print(df.head())

## Split data for 10 fold Cross validation

In [None]:
df = pd.read_csv(path+"data/tweets.csv", index_col=None)
df.fillna('', inplace = True)
df = df.drop('Unnamed: 0', axis=1)
print(df.head())

                                               tweet  sentiment candidate
0  Kirkpatrick, who wore a baseball cap embroider...          0     obama
1  #<e>obama</e> debates that Cracker Ass Cracker...          1     obama
2  @user @user  Youre missing the point  Im afrai...          0     obama
3  I was raised as a Democrat  left the party yea...         -1     obama
4  The <e>Obama camp</e> can't afford to lower ex...          0     obama


In [None]:
df = df[df.tweet!= '']
print(df.head(2))
strtfdKFold = StratifiedKFold(n_splits=10, shuffle=True)
kfold = strtfdKFold.split(df.tweet, df.sentiment)

for i, (train, test) in enumerate(kfold):
  train_df = df.iloc[train]
  test_df = df.iloc[test]
  train_df.to_csv(f'{path}data/train/train_{i+1}.csv')
  test_df.to_csv(f'{path}data/test/test_{i+1}.csv')
  print(f'Iteration {i} done')

                                               tweet  sentiment candidate
0  Kirkpatrick, who wore a baseball cap embroider...          0     obama
1  #<e>obama</e> debates that Cracker Ass Cracker...          1     obama
Iteration 0 done
Iteration 1 done
Iteration 2 done
Iteration 3 done
Iteration 4 done
Iteration 5 done
Iteration 6 done
Iteration 7 done
Iteration 8 done
Iteration 9 done


# Vector-space (BOW models)

## Additional pre-processing for BoW models

In [None]:
#Keep only alphabets
def clean_tweet(text):
  temp = text.lower()
  temp = re.sub("'", "", temp) # to avoid removing contractions in english
  temp = re.sub("<[ea]>","", temp)
  temp = re.sub("</[ea]>","",temp)
  temp = re.sub("@[A-Za-z0-9_]+","", temp)
  temp = re.sub("#[A-Za-z0-9_]+","", temp)
  temp = re.sub(r'http\S+', '', temp)
  temp = re.sub('[()!?]', ' ', temp)
  temp = re.sub('\[.*?\]',' ', temp)
  temp = re.sub("[^a-z0-9]"," ", temp)
  temp = re.sub("[0-9]", "", temp)
  temp = re.sub("\</?[a-z]+\>", '', temp)
  temp = re.sub('\s+', ' ', temp)
  temp = temp.strip()
  return temp

# Models

## Word n-gram

### Logistic Regression

In [None]:
accuracy_list = []
precision_list = defaultdict(list)
recall_list = defaultdict(list)
f1_list = defaultdict(list)

for i in range(1, 11):
  # train
  df_train = pd.read_csv(f'{path}data/train/train_{i}.csv')
  df_train['tweet'] = df_train['tweet'].apply(lambda x: clean_tweet(x))
  X_train = df_train['tweet']
  y_train = df_train['sentiment']
  
  # test
  df_test = pd.read_csv(f'{path}data/test/test_{i}.csv')
  df_test['tweet'] = df_test['tweet'].apply(lambda x: clean_tweet(x))
  X_test = df_test['tweet']
  y_test = df_test['sentiment']
  
  # vectorizer
  vectorizer_word = TfidfVectorizer(max_features=40000, min_df=2, max_df=.5, analyzer='word', stop_words='english', ngram_range=(1, 5))
  vectorizer_word.fit(X_train)

  # transform & model fit
  tfidf_matrix_word_train = vectorizer_word.transform(X_train)
  lr_word = LogisticRegression(verbose=2, C=.7, n_jobs = -1, class_weight='balanced')
  lr_word.fit(tfidf_matrix_word_train, y_train)
  
  # save vectorizer 
  with open(f'{path}vectorizer/LR_Word/LR_Word_vectorizer_{i}.pickle', 'wb') as fin:
    pickle.dump(vectorizer_word, fin)

  # save model
  with open(f'{path}model/LR_Word/LR_Word_{i}.pickle', 'wb') as fin:
    pickle.dump(lr_word, fin)

  # predict
  tfidf_matrix_word_test = vectorizer_word.transform(X_test)
  df_test['pred'] = lr_word.predict(tfidf_matrix_word_test)

  # save output
  df_test.to_csv(f'{path}output/LR_Word/LR_Word_{i}.csv')
  
  # calculate metrics
  report = classification_report(y_test, df_test['pred'], output_dict=True)

  # save metrics
  with open(f'{path}classification_report/LR_Word/report_{i}.json', "w") as outfile:
    json.dump(report, outfile)

  for index in range(-1, 2):
    precision_list[index].append(report[str(index)]['precision'])
    recall_list[index].append(report[str(index)]['recall'])
    f1_list[index].append(report[str(index)]['f1-score'])
  accuracy_list.append(report['accuracy'])

# display metrics
for key, val_list in precision_list.items():
  print(f'Avg precision for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

for key, val_list in recall_list.items():
  print(f'Avg recall for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

for key, val_list in f1_list.items():
  print(f'Avg f1 for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

print(f'Avg Accuracy: {round(np.mean(accuracy_list), 4)}')


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

Avg precision for class -1 = 0.6515 +/- 0.0215
Avg precision for class 0 = 0.5276 +/- 0.0186
Avg precision for class 1 = 0.5498 +/- 0.0141
Avg recall for class -1 = 0.6293 +/- 0.0213
Avg recall for class 0 = 0.506 +/- 0.0245
Avg recall for class 1 = 0.6124 +/- 0.0261
Avg f1 for class -1 = 0.6401 +/- 0.0184
Avg f1 for class 0 = 0.5165 +/- 0.0207
Avg f1 for class 1 = 0.5793 +/- 0.018
Avg Accuracy: 0.5852


## Character n gram

### Logistic Regression

In [None]:
accuracy_list = []
precision_list = defaultdict(list)
recall_list = defaultdict(list)
f1_list = defaultdict(list)

for i in range(1, 11):
  # train
  df_train = pd.read_csv(f'{path}data/train/train_{i}.csv')
  df_train['tweet'] = df_train['tweet'].apply(lambda x: clean_tweet(x))
  X_train = df_train['tweet']
  y_train = df_train['sentiment']
  
  # test
  df_test = pd.read_csv(f'{path}data/test/test_{i}.csv')
  df_test['tweet'] = df_test['tweet'].apply(lambda x: clean_tweet(x))
  X_test = df_test['tweet']
  y_test = df_test['sentiment']

  # vectorizer
  vectorizer_char = TfidfVectorizer(max_features=40000,min_df=5, max_df=0.5, analyzer='char', ngram_range=(1, 11))
  vectorizer_char.fit(X_train)

  # transform & model fit
  tfidf_matrix_char_train = vectorizer_char.transform(X_train)
  lr_char = LogisticRegression(verbose=2, C=.7, n_jobs = -1, class_weight='balanced')
  lr_char.fit(tfidf_matrix_char_train, y_train)
  
  # save vectorizer 
  with open(f'{path}vectorizer/LR_Char/LR_Char_vectorizer_{i}.pickle', 'wb') as fin:
    pickle.dump(vectorizer_char, fin)

  # save model
  with open(f'{path}model/LR_Char/LR_Char_{i}.pickle', 'wb') as fin:
    pickle.dump(lr_char, fin)

  # predict
  tfidf_matrix_char_test = vectorizer_char.transform(X_test)
  df_test['pred'] = lr_char.predict(tfidf_matrix_char_test)

  # save output
  df_test.to_csv(f'{path}output/LR_Char/LR_Char_{i}.csv')
  
  # calculate metrics
  report = classification_report(y_test, df_test['pred'], output_dict=True)

  # save metrics
  with open(f'{path}classification_report/LR_Char/report_{i}.json', "w") as outfile:
    json.dump(report, outfile)

  for index in range(-1, 2):
    precision_list[index].append(report[str(index)]['precision'])
    recall_list[index].append(report[str(index)]['recall'])
    f1_list[index].append(report[str(index)]['f1-score'])
  accuracy_list.append(report['accuracy'])

# display metrics
for key, val_list in precision_list.items():
  print(f'Avg precision for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

for key, val_list in recall_list.items():
  print(f'Avg recall for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

for key, val_list in f1_list.items():
  print(f'Avg f1 for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

print(f'Avg Accuracy: {round(np.mean(accuracy_list), 4)}')


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    7.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

Avg precision for class -1 = 0.6513 +/- 0.018
Avg precision for class 0 = 0.5389 +/- 0.0191
Avg precision for class 1 = 0.5534 +/- 0.0161
Avg recall for class -1 = 0.6417 +/- 0.0254
Avg recall for class 0 = 0.5134 +/- 0.0224
Avg recall for class 1 = 0.6026 +/- 0.0288
Avg f1 for class -1 = 0.6463 +/- 0.0204
Avg f1 for class 0 = 0.5257 +/- 0.0189
Avg f1 for class 1 = 0.5768 +/- 0.0207
Avg Accuracy: 0.5905


## Meta-models
include obama/romney info

### Single model

In [None]:
from scipy import sparse 
from scipy.sparse import csr_matrix

#### Obama

In [None]:
accuracy_list = []
precision_list = defaultdict(list)
recall_list = defaultdict(list)
f1_list = defaultdict(list)

for i in range(1, 11):
  # train
  df_train = pd.read_csv(f'{path}data/train/train_{i}.csv')
  df_train['tweet'] = df_train['tweet'].apply(lambda x: clean_tweet(x))
  X_train = df_train['tweet']
  candidate_train_df = pd.DataFrame({'candidate' : df_train['candidate'] == "obama"})
  candidate_train = csr_matrix(candidate_train_df)
  y_train = df_train['sentiment']

  # test
  df_test = pd.read_csv(f'{path}data/test/test_{i}.csv')
  df_test['tweet'] = df_test['tweet'].apply(lambda x: clean_tweet(x))
  X_test = df_test['tweet']
  candidate_test_df = pd.DataFrame({'candidate' : df_test['candidate'] == "obama"})
  candidate_test = csr_matrix(candidate_test_df)
  y_test = df_test['sentiment']

  # vectorizer
  vectorizer_char = TfidfVectorizer(max_features=40000, min_df=5, max_df=0.5, analyzer='char', ngram_range=(1, 11))
  vectorizer_char.fit(X_train)

  # transform, new X_train & model fit
  tfidf_matrix_train = vectorizer_char.transform(X_train)
  X_train_new = sparse.hstack((tfidf_matrix_train, candidate_train)).tocsr()
  lr_char = LogisticRegression(verbose=2, C=.7, n_jobs = -1, class_weight='balanced')
  lr_char.fit(X_train_new, y_train)

  # save vectorizer 
  with open(f'{path}vectorizer/LR_Meta_Single_Char/LR_Meta_Single_Char_vectorizer_{i}.pickle', 'wb') as fin:
    pickle.dump(vectorizer_char, fin)

  # save model
  with open(f'{path}model/LR_Meta_Single_Char/LR_Meta_Single_Char_{i}.pickle', 'wb') as fin:
    pickle.dump(lr_char, fin)

  # predict
  tfidf_matrix_test = vectorizer_char.transform(X_test)
  X_test_new = sparse.hstack((tfidf_matrix_test, candidate_test)).tocsr()
  df_test['pred'] = lr_char.predict(X_test_new)

  # save output
  df_test.to_csv(f'{path}output/LR_Meta_Single_Char/LR_Meta_Single_Char_{i}.csv')
  
  # calculate metrics
  report = classification_report(y_test, df_test['pred'], output_dict=True)

  # save metrics
  with open(f'{path}classification_report/LR_Meta_Single_Char/report_{i}.json', "w") as outfile:
    json.dump(report, outfile)

  for index in range(-1, 2):
    precision_list[index].append(report[str(index)]['precision'])
    recall_list[index].append(report[str(index)]['recall'])
    f1_list[index].append(report[str(index)]['f1-score'])
  accuracy_list.append(report['accuracy'])

# display metrics
for key, val_list in precision_list.items():
  print(f'Avg precision for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

for key, val_list in recall_list.items():
  print(f'Avg recall for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

for key, val_list in f1_list.items():
  print(f'Avg f1 for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

print(f'Avg Accuracy: {round(np.mean(accuracy_list), 4)}')


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    7.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

Avg precision for class -1 = 0.6527 +/- 0.0185
Avg precision for class 0 = 0.5396 +/- 0.019
Avg precision for class 1 = 0.5547 +/- 0.015
Avg recall for class -1 = 0.6394 +/- 0.0239
Avg recall for class 0 = 0.517 +/- 0.0251
Avg recall for class 1 = 0.6055 +/- 0.0279
Avg f1 for class -1 = 0.6459 +/- 0.0199
Avg f1 for class 0 = 0.5279 +/- 0.0204
Avg f1 for class 1 = 0.5788 +/- 0.0192
Avg Accuracy: 0.5914


### Two models

In [None]:
def predict(row):
  if row['candidate']=='obama':
    return lr_o.predict(vectorizer_char_o.transform([row['tweet']]))[0]
  else:
    return lr_r.predict(vectorizer_char_r.transform([row['tweet']]))[0]

In [None]:
accuracy_list = []
precision_list = defaultdict(list)
recall_list = defaultdict(list)
f1_list = defaultdict(list)

for i in range(1, 11):
  # train - obama
  df_train= pd.read_csv(f'{path}data/train/train_{i}.csv')
  df_train['tweet'] = df_train['tweet'].apply(lambda x: clean_tweet(x))
  df_o = df_train[df_train.candidate=="obama"]
  X_train_o = df_o['tweet']
  y_train_o = df_o['sentiment']

  # train - romney
  df_r = df_train[df_train.candidate=="romney"]
  X_train_r = df_r['tweet']
  y_train_r = df_r['sentiment']

  # vectorizer - obama & romney
  vectorizer_char_o = TfidfVectorizer(max_features=40000,min_df=5, max_df=0.5, analyzer='char', ngram_range=(1, 11))
  vectorizer_char_r = TfidfVectorizer(max_features=40000,min_df=5, max_df=0.5, analyzer='char', ngram_range=(1, 11))
  vectorizer_char_o.fit(X_train_o)
  vectorizer_char_r.fit(X_train_r)  

  # transform & model fit - obama
  tfidf_matrix_train_o = vectorizer_char_o.transform(X_train_o)
  lr_o = LogisticRegression(verbose=2, C=.7, n_jobs = -1, class_weight='balanced')
  lr_o.fit(tfidf_matrix_train_o, y_train_o)

  # transform & model fit - romney
  tfidf_matrix_train_r = vectorizer_char_r.transform(X_train_r)
  lr_r = LogisticRegression(verbose=2, C=.7, n_jobs = -1, class_weight='balanced')
  lr_r.fit(tfidf_matrix_train_r, y_train_r)

  # save vectorizer 
  with open(f'{path}vectorizer/LR_Meta_TwoModel_Char/Obama/Obama_{i}.pickle', 'wb') as fin:
    pickle.dump(vectorizer_char_o, fin)

  with open(f'{path}vectorizer/LR_Meta_TwoModel_Char/Romney/Romney_{i}.pickle', 'wb') as fin:
    pickle.dump(vectorizer_char_r, fin)

  # save model
  with open(f'{path}model/LR_Meta_TwoModel_Char/Obama/Obama_{i}.pickle', 'wb') as fin:
    pickle.dump(lr_o, fin)

  with open(f'{path}model/LR_Meta_TwoModel_Char/Romney/Romney_{i}.pickle', 'wb') as fin:
    pickle.dump(lr_r, fin)

  # predict
  df_test = pd.read_csv(f'{path}data/test/test_{i}.csv')
  df_test['tweet'] = df_test['tweet'].apply(lambda x: clean_tweet(x))
  df_test['pred'] = df_test.apply(predict, axis = 1)

  # save output
  df_test.to_csv(f'{path}output/LR_Meta_TwoModel_Char/LR_Meta_TwoModel_Char_{i}.csv')

  # calculate metrics
  report = classification_report(df_test['sentiment'], df_test['pred'], output_dict=True)

  # save metrics
  with open(f'{path}classification_report/LR_Meta_TwoModel_Char/report_{i}.json', "w") as outfile:
    json.dump(report, outfile)

  for index in range(-1, 2):
    precision_list[index].append(report[str(index)]['precision'])
    recall_list[index].append(report[str(index)]['recall'])
    f1_list[index].append(report[str(index)]['f1-score'])
  accuracy_list.append(report['accuracy'])

# display metrics
for key, val_list in precision_list.items():
  print(f'Avg precision for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

for key, val_list in recall_list.items():
  print(f'Avg recall for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

for key, val_list in f1_list.items():
  print(f'Avg f1 for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

print(f'Avg Accuracy: {round(np.mean(accuracy_list), 4)}')


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    4.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

Avg precision for class -1 = 0.6472 +/- 0.0128
Avg precision for class 0 = 0.5339 +/- 0.021
Avg precision for class 1 = 0.5735 +/- 0.016
Avg recall for class -1 = 0.6487 +/- 0.0204
Avg recall for class 0 = 0.5074 +/- 0.0232
Avg recall for class 1 = 0.6088 +/- 0.0229
Avg f1 for class -1 = 0.6479 +/- 0.0153
Avg f1 for class 0 = 0.5201 +/- 0.0199
Avg f1 for class 1 = 0.5906 +/- 0.0186
Avg Accuracy: 0.5931


# Transformer-based Language Models

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from datasets import load_metric
from scipy.special import softmax
import csv
import urllib.request

## Roberta twitter sentiment classifier

### Predict w/o fine-tuning

In [None]:
def predict(text):
  encoded_input = tokenizer(text, return_tensors='pt')
  output = model(**encoded_input)
  scores = output[0][0].detach().numpy()
  scores = softmax(scores)
  
  ranking = np.argsort(scores)
  ranking = ranking[::-1]
  l= labels[ranking[0]]
  if l == "positive": return 1
  elif l == "negative":return -1
  else: return 0

# download label mapping
mapping_link = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]#labels are negative, neutral, positive

accuracy_list = []
precision_list = defaultdict(list)
recall_list = defaultdict(list)
f1_list = defaultdict(list)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

for i in range(1, 11):
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
  model = AutoModelForSequenceClassification.from_pretrained(MODEL)
  df_test = pd.read_csv(f'{path}data/test/test_{i}.csv')
  x_test = df_test.tweet
  y_test = df_test.sentiment

  df_test['pred'] = x_test.apply(predict)

  # save output
  df_test.to_csv(f'{path}output/Roberta_Not_Finetuned/Roberta_Not_Finetuned_{i}.csv')

  # calculate metrics
  report = classification_report(y_test, df_test['pred'], output_dict=True)

  # save metrics
  with open(f'{path}classification_report/Roberta_Not_Finetuned/report_{i}.json', "w") as outfile:
    json.dump(report, outfile)

  for index in range(-1, 2):
    precision_list[index].append(report[str(index)]['precision'])
    recall_list[index].append(report[str(index)]['recall'])
    f1_list[index].append(report[str(index)]['f1-score'])
  accuracy_list.append(report['accuracy'])

# display metrics
for key, val_list in precision_list.items():
  print(f'Avg precision for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

for key, val_list in recall_list.items():
  print(f'Avg recall for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

for key, val_list in f1_list.items():
  print(f'Avg f1 for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

print(f'Avg Accuracy = {round(np.mean(accuracy_list), 4)}')


Avg precision for class -1 = 0.6803 +/- 0.0147

Avg precision for class 0 = 0.4467 +/- 0.0095

Avg precision for class 1 = 0.6356 +/- 0.0331

Avg recall for class -1 = 0.6534 +/- 0.0092

Avg recall for class 0 = 0.6455 +/- 0.0211

Avg recall for class 1 = 0.3033 +/- 0.0227

Avg f1 for class -1 = 0.6664 +/- 0.0061

Avg f1 for class 0 = 0.5279 +/- 0.0116

Avg f1 for class 1 = 0.4104 +/- 0.0259

Avg Accuracy: 0.5653

### Finetune Roberta

In [None]:
def transform_labels(label):
    label = label['sentiment']
    if label == 1:
        num = 2
    elif label == -1:
        num = 0
    elif label == 0:
        num = 1
    return {'labels': num}

def tokenize_data(example):
    return tokenizer(example['tweet'], padding='max_length', max_length = 512 )

In [None]:
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Training the models
for i in range(5, 7):
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
  dataset = load_dataset('csv', data_files={'train': f'{path}data/train/train_{i}.csv', 'test': f'{path}data/test/test_{i}.csv'}, encoding = "ISO-8859-1")
  dataset = dataset.map(tokenize_data, batched=True)
  model = AutoModelForSequenceClassification.from_pretrained(MODEL)
  remove_columns = ['Unnamed: 0', 'tweet', 'sentiment']
  dataset = dataset.map(transform_labels, remove_columns=remove_columns)
  train_dataset = dataset['train']
  eval_dataset = dataset['test']


  def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    pred.append(predictions)
    y.append(labels)
    return metric.compute(predictions=predictions, references=labels, average='macro')

  # save training args
  training_args = TrainingArguments(f'{path}model/Roberta_Finetuned/Roberta_Checkpoint', num_train_epochs=2)
  trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics)
  history = trainer.train()

  # save model
  trainer.save_model(f'{path}model/Roberta_Finetuned/Roberta_Finetuned_{i}')

  metric = load_metric("f1", average='macro')
  pred =[]
  y=[]

  # test
  trainer.evaluate()

  # save output
  df = pd.DataFrame({"pred":pred[0], "y":y[0]})
  df['pred'] = df['pred'].replace({0:-1, 1:0, 2:1})
  df['y'] = df['y'].replace({0:-1, 1:0, 2:1})
  df.to_csv(f'{path}output/Roberta_Finetuned/Roberta_Finetuned_{i}.csv')

  # calculate metrics
  report = classification_report(df.y, df.pred, output_dict=True)

  # save metrics
  with open(f'{path}classification_report/Roberta_Finetuned/report_{i}.json', "w") as outfile:
    json.dump(report, outfile)

  for index in range(-1, 2):
    precision_list[index].append(report[str(index)]['precision'])
    recall_list[index].append(report[str(index)]['recall'])
    f1_list[index].append(report[str(index)]['f1-score'])
  accuracy_list.append(report['accuracy'])
  

In [32]:
accuracy_list = []
precision_list = defaultdict(list)
recall_list = defaultdict(list)
f1_list = defaultdict(list)
macro_f1 = []

In [33]:
import json
 
# Opening JSON file
for i in range(1,11):
  with open(f'{path}classification_report/Roberta_Finetuned/report_{i}.json') as json_file:
    report = json.load(json_file)
  
  for index in range(-1, 2):
    precision_list[index].append(report[str(index)]['precision'])
    recall_list[index].append(report[str(index)]['recall'])
    f1_list[index].append(report[str(index)]['f1-score'])
  accuracy_list.append(report['accuracy'])
  macro_f1.append(report['macro avg']['f1-score'])

In [34]:
# display metrics
f1 = []
for key, val_list in precision_list.items():
  print(f'Avg precision for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

for key, val_list in recall_list.items():
  print(f'Avg recall for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

for key, val_list in f1_list.items():
  f1.append(round(np.std(val_list), 4))
  print(f'Avg f1 for class {key} = {round(np.mean(val_list), 4)} +/- {round(np.std(val_list), 4)}')

print(f'Macro f1 : {round(np.mean(macro_f1), 4)} +/- {round(np.std(macro_f1), 4)}')
print(f'Avg Accuracy: {round(np.mean(accuracy_list), 4)} +/- {round(np.std(accuracy_list), 4)}')

Avg precision for class -1 = 0.7349 +/- 0.0151
Avg precision for class 0 = 0.666 +/- 0.0209
Avg precision for class 1 = 0.7206 +/- 0.0169
Avg recall for class -1 = 0.8013 +/- 0.0166
Avg recall for class 0 = 0.5919 +/- 0.0262
Avg recall for class 1 = 0.7109 +/- 0.0257
Avg f1 for class -1 = 0.7666 +/- 0.0135
Avg f1 for class 0 = 0.6263 +/- 0.0177
Avg f1 for class 1 = 0.7154 +/- 0.0163
Macro f1 : 0.7028 +/- 0.0116
Avg Accuracy: 0.7112 +/- 0.0115
