# Import Libraries

In [2]:
# evaluate a lda model on the dataset
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from datetime import datetime
import statistics
import nltk
import re

# Hyper-Params

In [3]:
TRAIN_VAL_SIZE = 0.2
VAL_TEST_SIZE = 0.5

# Functions

In [16]:
def get_metrics(true_labels, predictions):
  f1 = f1_score(true_labels, predictions, average="macro")
  precision = precision_score(true_labels, predictions, average="macro")
  recall = recall_score(true_labels, predictions, average="macro")
  accuracy = accuracy_score(true_labels,predictions)
  return f1, precision, recall, accuracy

# Dataset

In [4]:
bbc_df = pd.read_csv("../datasets/bbc/bbc-text.csv")
bbc_df["category"] = pd.Categorical(bbc_df['category']).codes

stopwords = nltk.corpus.stopwords.words('english')

In [5]:
def clean_text(text):
    text=re.sub('<br \/>','',text) 
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text) 
    text = re.sub('\[[^]]*\]', '', text)
    return text

In [6]:
movies_df = pd.read_csv("../datasets/movies/sampled.csv")
movies_df

Unnamed: 0,text,sentiment
0,A female vampire kills young women and paints ...,negative
1,Personally I think this show looks pretty chea...,negative
2,I grew up watching Inspector Gadget It was and...,negative
3,This movie is awful Im SORRY I bought this to ...,negative
4,This is a great example of a good dumb movie N...,positive
...,...,...
4995,After watching this on the MST3K episode I hav...,negative
4996,Upon completing this infernal piece of trash a...,negative
4997,Maybe Im biased because the F16 is my favorite...,positive
4998,The Best Movie of the 90s The Welsh Trainspott...,negative


In [13]:
twitter_df = pd.read_csv("../datasets/twitter_sampled.csv")
twitter_df

Unnamed: 0,label,text
0,4,why and she screaming ahaha this song is funny
1,0,the_trini_bajan work as usual
2,0,desi_f pack me in your luggage I wanna go
3,4,elm8 Thanks I enjoy talking to you too
4,4,watchin the season finale of The Office lets h...
...,...,...
4795,0,So sleepy this morning
4796,0,bakespace do you archive your newsletters some...
4797,4,santyadh hope that will soon change though bo...
4798,0,I think I should do my homework


# Training the model on BBC News

In [14]:
def train(train_X, train_Y, test_X, test_Y):
    model = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=0.5)
    start_time = datetime.now()
    vectorizer = CountVectorizer(stop_words = "english")
    train_X = vectorizer.fit_transform(train_X["text"]).toarray()
    test_X = vectorizer.transform(test_X["text"]).toarray()
    model.fit(train_X, train_Y)
    end_time = datetime.now()
    training_time = (end_time - start_time).total_seconds()
    end_time = datetime.now()
    predicitons = model.predict(test_X)
    test_f1, test_precision, test_recall, test_accuracy = get_metrics(predicitons, test_Y)
    print('Testing: Accuracy: {:.3%}, Recall: {:.3%}, Precision: {:.3%}, f1: {:.3%}'.format(test_accuracy,test_recall, test_precision, test_f1))
    print('Training time: {:.2f}s'.format(training_time))
    return test_precision, test_recall, test_f1


In [6]:
#shrinkage with lsqr

f1s = []
recalls = []
precisions = []
for i in range(2):
    train_X, val_X, train_Y, val_Y = train_test_split(bbc_df, bbc_df["category"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    test_precision, test_recall, test_f1 = train(train_X, train_Y, test_X, test_Y)
    f1s.append(test_f1)
    recalls.append(test_recall)
    precisions.append(test_precision)


Testing: Accuracy: 97.309%, Recall: 97.598%, Precision: 97.248%, f1: 97.350%
Training time: 2937.12s
Testing: Accuracy: 95.964%, Recall: 95.988%, Precision: 96.038%, f1: 95.972%
Training time: 2645.75s


In [7]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.9724811715019793, 0.9603782505910166]
Precision avg: 0.9664 (+/- 0.0171)
Recall values: [0.9759805015361905, 0.9598814229249012]
Recall avg: 0.9679 (+/- 0.0228)
F1 values: [0.973501859831126, 0.9597191429767171]
F1 avg: 0.9666 (+/- 0.0195)


In [202]:
#shrinkage with lsqr
"""
In normal mode, LDA assumes that the class covariance matrices are equal and estimated using the sample covariance of the entire dataset. In shrinkage mode, LDA uses a shrinkage estimator to regularize the covariance matrix and improve the stability of the model."""
test_precision, test_recall, test_f1 = train(train_X, train_Y, test_X, test_Y)

Testing: Accuracy: 94.619%, Recall: 94.721%, Precision: 94.802%, f1: 94.681%
Training time: 2197.96s


In [192]:
# stopwords n_components = 4
test_precision, test_recall, test_f1 = train(train_X, train_Y, test_X, test_Y)

Testing: Accuracy: 48.430%, Recall: 50.661%, Precision: 48.413%, f1: 49.117%
Training time: 22.66s


# Train with movies data

In [9]:
def train(train_X, train_Y, test_X, test_Y):
    model = LinearDiscriminantAnalysis()
    start_time = datetime.now()
    vectorizer = CountVectorizer(stop_words = "english")
    train_X = vectorizer.fit_transform(train_X["text"]).toarray()
    test_X = vectorizer.transform(test_X["text"]).toarray()
    model.fit(train_X, train_Y)
    end_time = datetime.now()
    training_time = (end_time - start_time).total_seconds()
    end_time = datetime.now()
    predicitons = model.predict(test_X)
    test_f1, test_precision, test_recall, test_accuracy = get_metrics(predicitons, test_Y)
    print('Testing: Accuracy: {:.3%}, Recall: {:.3%}, Precision: {:.3%}, f1: {:.3%}'.format(test_accuracy,test_recall, test_precision, test_f1))
    print('Training time: {:.2f}s'.format(training_time))
    return test_precision, test_recall, test_f1

In [10]:
#with n_components = 4
f1s = []
recalls = []
precisions = []
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(movies_df, movies_df["sentiment"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    test_precision, test_recall, test_f1 = train(train_X, train_Y, test_X, test_Y)
    f1s.append(test_f1)
    recalls.append(test_recall)
    precisions.append(test_precision)


Testing: Accuracy: 70.200%, Recall: 70.168%, Precision: 70.183%, f1: 70.173%
Training time: 878.13s
Testing: Accuracy: 71.400%, Recall: 71.360%, Precision: 71.389%, f1: 71.367%
Training time: 818.03s
Testing: Accuracy: 58.000%, Recall: 57.928%, Precision: 57.917%, f1: 57.919%
Training time: 877.37s
Testing: Accuracy: 62.800%, Recall: 62.859%, Precision: 62.785%, f1: 62.740%
Training time: 894.63s
Testing: Accuracy: 71.000%, Recall: 70.978%, Precision: 70.960%, f1: 70.966%
Training time: 650.78s


In [11]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.7018295711241769, 0.7138860834575739, 0.5791665999134878, 0.6278500456007297, 0.7096043297945589]
Precision avg: 0.6665 (+/- 0.1203)
Recall values: [0.7016841700819672, 0.7135950307377049, 0.5792783687089891, 0.6285907401149037, 0.7097756410256411]
Recall avg: 0.6666 (+/- 0.1199)
F1 values: [0.7017315584025622, 0.7136690013655786, 0.5791853027461165, 0.627403846153846, 0.7096643720140483]
F1 avg: 0.6663 (+/- 0.1203)


# Train the model on Twitter

In [15]:
def train(train_X, train_Y, test_X, test_Y):
    model = LinearDiscriminantAnalysis()
    start_time = datetime.now()
    vectorizer = CountVectorizer(stop_words = "english")
    train_X = vectorizer.fit_transform(train_X["text"]).toarray()
    test_X = vectorizer.transform(test_X["text"]).toarray()
    model.fit(train_X, train_Y)
    end_time = datetime.now()
    training_time = (end_time - start_time).total_seconds()
    end_time = datetime.now()
    predicitons = model.predict(test_X)
    test_f1, test_precision, test_recall, test_accuracy = get_metrics(predicitons, test_Y)
    print('Testing: Accuracy: {:.3%}, Recall: {:.3%}, Precision: {:.3%}, f1: {:.3%}'.format(test_accuracy,test_recall, test_precision, test_f1))
    print('Training time: {:.2f}s'.format(training_time))
    return test_precision, test_recall, test_f1

In [17]:
f1s = []
recalls = []
precisions = []
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(twitter_df, twitter_df["label"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    test_precision, test_recall, test_f1 = train(train_X, train_Y, test_X, test_Y)
    f1s.append(test_f1)
    recalls.append(test_recall)
    precisions.append(test_precision)


Testing: Accuracy: 50.000%, Recall: 50.000%, Precision: 50.000%, f1: 49.685%
Training time: 122.15s
Testing: Accuracy: 55.208%, Recall: 55.449%, Precision: 55.037%, f1: 54.263%
Training time: 136.60s
Testing: Accuracy: 56.458%, Recall: 57.219%, Precision: 56.848%, f1: 56.037%
Training time: 108.91s
Testing: Accuracy: 58.542%, Recall: 59.000%, Precision: 58.728%, f1: 58.294%
Training time: 125.71s
Testing: Accuracy: 53.542%, Recall: 53.273%, Precision: 53.174%, f1: 52.968%
Training time: 103.78s


In [18]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.5, 0.5503724540292754, 0.5684788654060067, 0.5872774641771602, 0.5317391304347826]
Precision avg: 0.5476 (+/- 0.0673)
Recall values: [0.5, 0.5544850123957629, 0.5721942281264316, 0.5899957016978293, 0.5327348388369715]
Recall avg: 0.5499 (+/- 0.0701)
F1 values: [0.49684655567008507, 0.5426322577214046, 0.5603682879692888, 0.5829385541695229, 0.5296826324415289]
F1 avg: 0.5425 (+/- 0.0648)
