# SETUP

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from nltk.stem import WordNetLemmatizer
import statistics
import nltk
import re

# Hyper-params

In [19]:
TRAIN_VAL_SIZE = 0.2
VAL_TEST_SIZE = 0.5

# Helper Functions

In [20]:
def get_metrics(true_labels, predictions):
  f1 = f1_score(true_labels, predictions, average="macro")
  precision = precision_score(true_labels, predictions, average="macro")
  recall = recall_score(true_labels, predictions, average="macro")
  accuracy = accuracy_score(true_labels,predictions)
  return f1, precision, recall, accuracy

In [21]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rayenebech/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rayenebech/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Prepare the data

In [22]:
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [3]:
bbc_df = pd.read_csv("../datasets/bbc/bbc-text.csv")

In [7]:
bbc_df["category"].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [8]:
movies_df = pd.read_csv("../datasets/movies/sampled.csv")

In [10]:
movies_df["sentiment"].value_counts()

negative    2520
positive    2480
Name: sentiment, dtype: int64

In [11]:
twitter_df = pd.read_csv("../datasets/twitter_sampled.csv")
twitter_df

Unnamed: 0,label,text
0,4,why and she screaming ahaha this song is funny
1,0,the_trini_bajan work as usual
2,0,desi_f pack me in your luggage I wanna go
3,4,elm8 Thanks I enjoy talking to you too
4,4,watchin the season finale of The Office lets h...
...,...,...
4795,0,So sleepy this morning
4796,0,bakespace do you archive your newsletters some...
4797,4,santyadh hope that will soon change though bo...
4798,0,I think I should do my homework


In [12]:
twitter_df["label"].value_counts()

0    2409
4    2391
Name: label, dtype: int64

# Train the model with 5 Kfolds 

In [23]:
def train(train_X, train_Y, test_X, test_Y):
    tf_idf = TfidfVectorizer()
    classifier_tfidf = LogisticRegression()

    # Train
    model_tfidf = Pipeline([("vectorizer", tf_idf), ("classifier", classifier_tfidf)])
    start_time = datetime.now()
    model_tfidf.fit(train_X["text"], train_Y)
    end_time = datetime.now()
    training_time_tfidf = (end_time - start_time).total_seconds()

    # Eval    
    predicted_test_tfidf = model_tfidf.predict(test_X["text"])
    test_f1, test_precision, test_recall, test_accuracy = get_metrics(predicted_test_tfidf, test_Y)
    print('Testing: Accuracy: {:.3%}, Recall: {:.3%}, Precision: {:.3%}, f1: {:.3%}'.format(test_accuracy,test_recall, test_precision, test_f1))
    print('Training time: {:.2f}s'.format(training_time_tfidf))
    return test_precision, test_recall, test_f1

In [14]:
precisions = []
recalls = []
f1s = []
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(bbc_df, bbc_df["category"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    precision, recall, f1 = train(train_X, train_Y, test_X, test_Y)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

Testing: Accuracy: 97.309%, Recall: 97.418%, Precision: 97.249%, f1: 97.310%
Training time: 1.18s
Testing: Accuracy: 98.206%, Recall: 98.429%, Precision: 97.936%, f1: 98.151%
Training time: 1.16s
Testing: Accuracy: 95.964%, Recall: 96.249%, Precision: 96.250%, f1: 96.189%
Training time: 1.18s
Testing: Accuracy: 97.309%, Recall: 97.384%, Precision: 97.292%, f1: 97.308%
Training time: 1.23s
Testing: Accuracy: 96.413%, Recall: 96.828%, Precision: 96.551%, f1: 96.684%
Training time: 1.24s


In [17]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))

Precision values: [0.9724888668975755, 0.9793615494978478, 0.9625042999656003, 0.972916611603013, 0.9655129958960329]
Precision avg: 0.9706 (+/- 0.0133)
Recall values: [0.9741835758693721, 0.9842857142857142, 0.9624888445049754, 0.9738412698412698, 0.9682805947388486]
Recall avg: 0.9726 (+/- 0.0162)
F1 values: [0.9731015774730946, 0.981510338217171, 0.9618899664187301, 0.9730829815634836, 0.9668400262764593]
F1 avg: 0.9713 (+/- 0.0148)


# Train with Movies datasets

In [15]:
precisions = []
recalls = []
f1s = []
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(movies_df, movies_df["sentiment"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    precision, recall, f1 = train(train_X, train_Y, test_X, test_Y)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

Testing: Accuracy: 83.600%, Recall: 83.635%, Precision: 83.532%, f1: 83.562%
Training time: 1.13s
Testing: Accuracy: 83.000%, Recall: 83.133%, Precision: 82.870%, f1: 82.926%
Training time: 1.33s
Testing: Accuracy: 83.600%, Recall: 83.535%, Precision: 83.565%, f1: 83.548%
Training time: 1.21s
Testing: Accuracy: 85.800%, Recall: 85.883%, Precision: 85.750%, f1: 85.775%
Training time: 1.38s
Testing: Accuracy: 86.800%, Recall: 86.876%, Precision: 86.860%, f1: 86.800%
Training time: 1.33s


In [16]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))

Precision values: [0.8353193670318406, 0.8286979925984075, 0.8356483340024087, 0.8575030012004802, 0.8685963114754098]
Precision avg: 0.8452 (+/- 0.0341)
Recall values: [0.8363537047747573, 0.8313254957690072, 0.8353467776868653, 0.8588297667245035, 0.8687616118905759]
Recall avg: 0.8461 (+/- 0.0332)
F1 values: [0.8356212714093271, 0.8292562401822339, 0.8354840780688239, 0.8577490693583482, 0.8679978879662074]
F1 avg: 0.8452 (+/- 0.0334)


# Train with Twitter

In [24]:
precisions = []
recalls = []
f1s = []
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(twitter_df, twitter_df["label"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    precision, recall, f1 = train(train_X, train_Y, test_X, test_Y)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

Testing: Accuracy: 74.583%, Recall: 74.607%, Precision: 74.600%, f1: 74.583%
Training time: 0.08s
Testing: Accuracy: 73.750%, Recall: 73.728%, Precision: 73.728%, f1: 73.728%
Training time: 0.19s
Testing: Accuracy: 72.292%, Recall: 72.535%, Precision: 72.388%, f1: 72.265%
Training time: 0.12s
Testing: Accuracy: 71.042%, Recall: 71.100%, Precision: 71.176%, f1: 71.026%
Training time: 0.10s
Testing: Accuracy: 71.667%, Recall: 71.667%, Precision: 71.664%, f1: 71.665%
Training time: 0.09s


In [25]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))

Precision values: [0.7460019794759598, 0.7372765025803201, 0.723881893182805, 0.7117647058823529, 0.7166443861872602]
Precision avg: 0.7271 (+/- 0.0286)
Recall values: [0.7460703430308293, 0.7372765025803201, 0.7253496503496504, 0.7109961100305641, 0.7166744803875605]
Recall avg: 0.7273 (+/- 0.0289)
F1 values: [0.7458289206409834, 0.7372765025803201, 0.7226458129683936, 0.7102645052306116, 0.7166469893742621]
F1 avg: 0.7265 (+/- 0.0294)
