In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
from datetime import datetime
import statistics
import re

# Hyper-params

In [2]:
TRAIN_VAL_SIZE = 0.2
VAL_TEST_SIZE = 0.5

# Helper Functions

In [4]:
def get_metrics(true_labels, predictions):
  f1 = f1_score(true_labels, predictions, average="macro")
  precision = precision_score(true_labels, predictions, average="macro")
  recall = recall_score(true_labels, predictions, average="macro")
  accuracy = accuracy_score(true_labels,predictions)
  return f1, precision, recall, accuracy

# Prepare datasets

In [5]:
bbc_df = pd.read_csv("../datasets/bbc/bbc-text.csv")

In [5]:
def clean_text(text):
    text=re.sub('<br \/>','',text) 
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text) 
    text = re.sub('\[[^]]*\]', '', text)
    return text

In [11]:
movies_df = pd.read_csv("../datasets/movies/sampled.csv")

In [16]:
twitter_df  = pd.read_csv("../datasets/twitter_sampled.csv")
twitter_df

Unnamed: 0,label,text
0,4,why and she screaming ahaha this song is funny
1,0,the_trini_bajan work as usual
2,0,desi_f pack me in your luggage I wanna go
3,4,elm8 Thanks I enjoy talking to you too
4,4,watchin the season finale of The Office lets h...
...,...,...
4795,0,So sleepy this morning
4796,0,bakespace do you archive your newsletters some...
4797,4,santyadh hope that will soon change though bo...
4798,0,I think I should do my homework


# Train with BBC dataset

In [9]:
def train(train_X, train_Y, test_X, test_Y): 
    tf_idf = TfidfVectorizer(use_idf=True, norm='l2')
    svd = TruncatedSVD(n_components=20)
    lr_model = LogisticRegression()

    model_tfidf = Pipeline([("tfidf", tf_idf), ("svd", svd), ("classifier", lr_model)])
    start_time = datetime.now()
    model_tfidf.fit(train_X["text"], train_Y)
    end_time = datetime.now()
    training_time_tfidf = (end_time - start_time).total_seconds()

    # Eval    
    predicted_test_tfidf = model_tfidf.predict(test_X["text"])
    test_f1, test_precision, test_recall, test_accuracy = get_metrics(predicted_test_tfidf, test_Y)
    print('Testing: Accuracy: {:.3%}, Recall: {:.3%}, Precision: {:.3%}, f1: {:.3%}'.format(test_accuracy,test_recall, test_precision, test_f1))
    print('Training time: {:.2f}s'.format(training_time_tfidf))
    return test_precision, test_recall, test_f1

In [10]:
precisions = []
recalls = []
f1s = []
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(bbc_df, bbc_df["category"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    precision, recall, f1 = train(train_X, train_Y, test_X, test_Y)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

Testing: Accuracy: 94.619%, Recall: 94.816%, Precision: 94.193%, f1: 94.436%
Training time: 0.48s
Testing: Accuracy: 94.170%, Recall: 93.838%, Precision: 94.492%, f1: 93.967%
Training time: 0.54s
Testing: Accuracy: 94.619%, Recall: 95.057%, Precision: 93.828%, f1: 94.266%
Training time: 0.68s
Testing: Accuracy: 93.722%, Recall: 94.032%, Precision: 93.139%, f1: 93.375%
Training time: 0.64s
Testing: Accuracy: 93.722%, Recall: 94.356%, Precision: 93.241%, f1: 93.565%
Training time: 0.84s


In [11]:
precisions = []
recalls = []
f1s = []
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(bbc_df, bbc_df["category"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    precision, recall, f1 = train(train_X, train_Y, test_X, test_Y)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

Testing: Accuracy: 93.722%, Recall: 93.523%, Precision: 93.490%, f1: 93.468%
Training time: 0.60s
Testing: Accuracy: 91.480%, Recall: 91.370%, Precision: 91.158%, f1: 91.201%
Training time: 0.61s
Testing: Accuracy: 94.619%, Recall: 95.072%, Precision: 94.287%, f1: 94.449%
Training time: 0.61s
Testing: Accuracy: 94.619%, Recall: 93.568%, Precision: 93.648%, f1: 93.580%
Training time: 0.61s
Testing: Accuracy: 95.067%, Recall: 94.926%, Precision: 94.961%, f1: 94.907%
Training time: 0.55s


In [8]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.9022239561167069, 0.9299206577398067, 0.9503376623376625, 0.9392664157431774, 0.9410347690179943]
Precision avg: 0.9326 (+/- 0.0369)
Recall values: [0.9091627738553116, 0.9399535373016569, 0.9515602982102255, 0.9411410323546361, 0.9536453823953824]
Recall avg: 0.9391 (+/- 0.0356)
F1 values: [0.9044188461124602, 0.9340752642126209, 0.9504470837804171, 0.9401607885459471, 0.9459413039684405]
F1 avg: 0.9350 (+/- 0.0363)


# Train with movies dataset

In [12]:
precisions = []
recalls = []
f1s = []
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(movies_df, movies_df["sentiment"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    precision, recall, f1 = train(train_X, train_Y, test_X, test_Y)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

Testing: Accuracy: 79.000%, Recall: 79.100%, Precision: 79.127%, f1: 78.999%
Training time: 5.84s
Testing: Accuracy: 76.400%, Recall: 76.466%, Precision: 76.454%, f1: 76.400%
Training time: 5.87s
Testing: Accuracy: 79.600%, Recall: 79.588%, Precision: 79.588%, f1: 79.588%
Training time: 6.45s
Testing: Accuracy: 77.800%, Recall: 77.835%, Precision: 77.807%, f1: 77.796%
Training time: 6.68s
Testing: Accuracy: 78.600%, Recall: 78.746%, Precision: 78.667%, f1: 78.593%
Training time: 6.19s


In [13]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.7912678946440308, 0.7645363729508197, 0.7958824282786885, 0.7780684490951855, 0.7866746698679472]
Precision avg: 0.7833 (+/- 0.0248)
Recall values: [0.7910019860336985, 0.7646550067268884, 0.7958824282786885, 0.778349029406112, 0.7874614791987673]
Recall avg: 0.7835 (+/- 0.0246)
F1 values: [0.7899924397278302, 0.763996223939583, 0.7958824282786885, 0.7779564794699761, 0.785930641527855]
F1 avg: 0.7828 (+/- 0.0247)


# Train on Twitter Data

In [21]:
precisions = []
recalls = []
f1s = []
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(twitter_df, twitter_df["label"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    precision, recall, f1 = train(train_X, train_Y, test_X, test_Y)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

Testing: Accuracy: 58.750%, Recall: 58.682%, Precision: 58.662%, f1: 58.663%
Training time: 3.49s
Testing: Accuracy: 60.833%, Recall: 60.962%, Precision: 60.912%, f1: 60.809%
Training time: 3.55s
Testing: Accuracy: 61.458%, Recall: 61.630%, Precision: 61.565%, f1: 61.430%
Training time: 3.79s
Testing: Accuracy: 65.000%, Recall: 65.305%, Precision: 65.271%, f1: 64.998%
Training time: 3.01s
Testing: Accuracy: 58.333%, Recall: 58.333%, Precision: 58.313%, f1: 58.298%
Training time: 3.09s


In [22]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.5866240266963292, 0.6091185410334347, 0.6156451949134876, 0.6527071284231064, 0.5831307729703452]
Precision avg: 0.6094 (+/- 0.0559)
Recall values: [0.5868232178942234, 0.6096211896494566, 0.6163018221841752, 0.6530483972344437, 0.5833333333333334]
Recall avg: 0.6098 (+/- 0.0560)
F1 values: [0.5866316393813392, 0.6080883885761934, 0.6143004200129435, 0.6499756927564414, 0.5829785755243176]
F1 avg: 0.6084 (+/- 0.0537)
