# SETUP

In [30]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import nltk
from nltk.stem import WordNetLemmatizer
import statistics
import re


# Hyper-params

In [31]:
TRAIN_VAL_SIZE = 0.2
VAL_TEST_SIZE = 0.5

# Helper Functions

In [32]:
def get_metrics(true_labels, predictions):
  f1 = f1_score(true_labels, predictions, average="macro")
  precision = precision_score(true_labels, predictions, average="macro")
  recall = recall_score(true_labels, predictions, average="macro")
  accuracy = accuracy_score(true_labels,predictions)
  return f1, precision, recall, accuracy

In [33]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rayenebech/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rayenebech/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Prepare the data

In [34]:
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [6]:
bbc_df = pd.read_csv("../datasets/bbc/bbc-text.csv")

In [7]:
def prepare_data(train_X, test_X):
    train_Y = train_X.pop("category")
    test_Y = test_X.pop("category")
    return train_X, train_Y, test_X, test_Y


In [16]:
def clean_text(text):
    text=re.sub('<br \/>','',text) 
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text) 
    text = re.sub('\[[^]]*\]', '', text)
    return text

In [26]:
movies_df = pd.read_csv("../datasets/movies/sampled.csv")

In [35]:
twitter_df = pd.read_csv("../datasets/twitter_sampled.csv")
twitter_df

Unnamed: 0,label,text
0,4,why and she screaming ahaha this song is funny
1,0,the_trini_bajan work as usual
2,0,desi_f pack me in your luggage I wanna go
3,4,elm8 Thanks I enjoy talking to you too
4,4,watchin the season finale of The Office lets h...
...,...,...
4795,0,So sleepy this morning
4796,0,bakespace do you archive your newsletters some...
4797,4,santyadh hope that will soon change though bo...
4798,0,I think I should do my homework


# Train the model on BBC News

In [36]:
def train(train_X, train_Y, test_X, test_Y):
    tf_idf = TfidfVectorizer(analyzer="char", ngram_range =(2,6))
    classifier_tfidf = LogisticRegression()
    # Train
    model_tfidf = Pipeline([("vectorizer", tf_idf), ("classifier", classifier_tfidf)])
    start_time = datetime.now()
    model_tfidf.fit(train_X["text"], train_Y)
    end_time = datetime.now()
    training_time_tfidf = (end_time - start_time).total_seconds()

    # Eval    
    predicted_test_tfidf = model_tfidf.predict(test_X["text"])
    test_f1, test_precision, test_recall, test_accuracy = get_metrics(predicted_test_tfidf, test_Y)
    print('Testing: Accuracy: {:.3%}, Recall: {:.3%}, Precision: {:.3%}, f1: {:.3%}'.format(test_accuracy,test_recall, test_precision, test_f1))
    print('Training time: {:.2f}s'.format(training_time_tfidf))
    return test_precision, test_recall, test_f1

In [9]:
precisions = []
recalls = []
f1s = []
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(bbc_df, bbc_df["category"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    precision, recall, f1 = train(train_X, train_Y, test_X, test_Y)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

Testing: Accuracy: 96.861%, Recall: 96.907%, Precision: 96.720%, f1: 96.802%
Training time: 66.43s
Testing: Accuracy: 98.206%, Recall: 98.222%, Precision: 98.160%, f1: 98.138%
Training time: 80.97s
Testing: Accuracy: 98.206%, Recall: 98.281%, Precision: 98.135%, f1: 98.197%
Training time: 59.63s
Testing: Accuracy: 97.309%, Recall: 97.278%, Precision: 97.124%, f1: 97.173%
Training time: 61.58s
Testing: Accuracy: 95.964%, Recall: 96.084%, Precision: 95.716%, f1: 95.875%
Training time: 79.47s


In [10]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))

Precision values: [0.9671996630820161, 0.9816020671834625, 0.9813492063492063, 0.9712446468260423, 0.9571557754484583]
Precision avg: 0.9717 (+/- 0.0206)
Recall values: [0.9690676691729323, 0.9822222222222223, 0.9828059916333048, 0.9727815494246232, 0.9608412055780476]
Recall avg: 0.9735 (+/- 0.0185)
F1 values: [0.968021087823068, 0.9813801505907396, 0.9819736232031321, 0.9717303061575506, 0.9587532855118232]
F1 avg: 0.9724 (+/- 0.0194)


# Train with movies dataset

In [28]:

precisions = []
recalls = []
f1s = []
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(movies_df, movies_df["sentiment"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    precision, recall, f1 = train(train_X, train_Y, test_X, test_Y)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

Testing: Accuracy: 85.600%, Recall: 85.653%, Precision: 85.608%, f1: 85.596%
Training time: 30.57s
Testing: Accuracy: 85.800%, Recall: 85.800%, Precision: 85.846%, f1: 85.795%
Training time: 31.71s
Testing: Accuracy: 84.000%, Recall: 84.278%, Precision: 83.964%, f1: 83.957%
Training time: 38.96s
Testing: Accuracy: 82.000%, Recall: 82.109%, Precision: 82.200%, f1: 81.995%
Training time: 30.05s
Testing: Accuracy: 85.800%, Recall: 86.344%, Precision: 85.653%, f1: 85.704%
Training time: 29.20s


In [29]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.8560776972431559, 0.8584645700828273, 0.8396377368151562, 0.8219991971095946, 0.8565253586065573]
Precision avg: 0.8465 (+/- 0.0313)
Recall values: [0.8565340681523255, 0.858, 0.8427750145339448, 0.821091735920962, 0.8634434924520604]
Recall avg: 0.8484 (+/- 0.0341)
F1 values: [0.8559631265603994, 0.8579539770885767, 0.8395661869695656, 0.8199539082004994, 0.8570387284098276]
F1 avg: 0.8461 (+/- 0.0329)


# Train on Twitter Data 

In [37]:
precisions = []
recalls = []
f1s = []
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(twitter_df, twitter_df["label"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    precision, recall, f1 = train(train_X, train_Y, test_X, test_Y)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

Testing: Accuracy: 72.083%, Recall: 71.988%, Precision: 71.988%, f1: 71.988%
Training time: 1.82s
Testing: Accuracy: 74.375%, Recall: 74.319%, Precision: 74.178%, f1: 74.222%
Training time: 2.76s
Testing: Accuracy: 75.625%, Recall: 75.622%, Precision: 75.619%, f1: 75.620%
Training time: 1.83s
Testing: Accuracy: 74.375%, Recall: 74.599%, Precision: 74.333%, f1: 74.294%
Training time: 2.01s
Testing: Accuracy: 72.917%, Recall: 72.988%, Precision: 72.876%, f1: 72.870%
Training time: 1.79s


In [38]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.7198801477248973, 0.7417775764755069, 0.7561858623743294, 0.7433328703382179, 0.7287597020367766]
Precision avg: 0.7380 (+/- 0.0281)
Recall values: [0.7198801477248973, 0.7431883092989434, 0.7562170047235344, 0.7459850811759543, 0.7298773359389995]
Recall avg: 0.7390 (+/- 0.0285)
F1 values: [0.7198801477248973, 0.7422183023258859, 0.7561981497801182, 0.7429366354480975, 0.7286956521739131]
F1 avg: 0.7380 (+/- 0.0281)
