# Importing Libraries

In [1]:
import fasttext
import pandas as pd
from sklearn.model_selection import train_test_split
import csv
import time
import statistics
import re

# Hyper-params

In [2]:
TRAIN_VAL_SIZE = 0.2
VAL_TEST_SIZE = 0.5

# Helper Functions

In [3]:
def compute_f1(precision, recall):
    return (precision * recall *2) / (precision + recall)

# Data Preprocessing

In [4]:
bbc_df = pd.read_csv("../datasets/bbc/bbc-text.csv")
bbc_df = bbc_df[["text", "category"]]


In [59]:
def clean_text(text):
    text=re.sub('<br \/>','',text) 
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text) 
    text = re.sub('\[[^]]*\]', '', text)
    return text

In [57]:
twitter_df = pd.read_csv("../datasets/twitter.csv", encoding='latin-1', names = ["label","id", "date", "flag", "user", "text"])
twitter_df = twitter_df.sample(frac=0.003)

In [60]:
twitter_df = twitter_df[["label", "text"]]
twitter_df["text"] = twitter_df["text"].apply(lambda x: clean_text(x))
twitter_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_df["text"] = twitter_df["text"].apply(lambda x: clean_text(x))


Unnamed: 0,label,text
1203885,4,why and she screaming ahaha this song is funny
484270,0,the_trini_bajan work as usual
543432,0,desi_f pack me in your luggage I wanna go
1502254,4,elm8 Thanks I enjoy talking to you too
943392,4,watchin the season finale of The Office lets h...
...,...,...
5733,0,So sleepy this morning
392307,0,bakespace do you archive your newsletters some...
1267790,4,santyadh hope that will soon change though bo...
84846,0,I think I should do my homework


In [61]:
twitter_df.to_csv("../datasets/twitter_sampled.csv", index = False)

In [65]:
twitter_df = pd.read_csv("../datasets/twitter_sampled.csv")
twitter_df["label"] = twitter_df["label"].apply(lambda x: "negative" if x==0 else "positive")

In [66]:
twitter_df

Unnamed: 0,label,text
0,positive,why and she screaming ahaha this song is funny
1,negative,the_trini_bajan work as usual
2,negative,desi_f pack me in your luggage I wanna go
3,positive,elm8 Thanks I enjoy talking to you too
4,positive,watchin the season finale of The Office lets h...
...,...,...
4795,negative,So sleepy this morning
4796,negative,bakespace do you archive your newsletters some...
4797,positive,santyadh hope that will soon change though bo...
4798,negative,I think I should do my homework


In [33]:
movies_df = pd.read_csv("../datasets/movies/sampled.csv")
movies_df

Unnamed: 0,text,sentiment
0,A female vampire kills young women and paints ...,negative
1,Personally I think this show looks pretty chea...,negative
2,I grew up watching Inspector Gadget It was and...,negative
3,This movie is awful Im SORRY I bought this to ...,negative
4,This is a great example of a good dumb movie N...,positive
...,...,...
4995,After watching this on the MST3K episode I hav...,negative
4996,Upon completing this infernal piece of trash a...,negative
4997,Maybe Im biased because the F16 is my favorite...,positive
4998,The Best Movie of the 90s The Welsh Trainspott...,negative


In [67]:
def prepare_twitter_data(train_X, test_X):
    train_X["label"] = train_X["label"].apply(lambda x: "__label__" + x)
    test_X["label"] = test_X["label"].apply(lambda x: "__label__" + x)
    # Saving the CSV file as a text file to train/test the classifier
    train_X[['label', 'text']].to_csv('../datasets/twitter/fasttext_train.txt', 
                                            index = False, 
                                            sep = ' ',
                                            header = None, 
                                            quoting = csv.QUOTE_NONE, 
                                            quotechar = "", 
                                            escapechar = " ")

    test_X[['label', 'text']].to_csv('../datasets/twitter/fasttext_test.txt', 
                                            index = False, 
                                            sep = ' ',
                                            header = None, 
                                            quoting = csv.QUOTE_NONE, 
                                            quotechar = "", 
                                            escapechar = " ")

In [36]:
def prepare_movies_data(train_X, test_X):
    train_X["sentiment"] = train_X["sentiment"].apply(lambda x: "__label__" + x)
    test_X["sentiment"] = test_X["sentiment"].apply(lambda x: "__label__" + x)
    # Saving the CSV file as a text file to train/test the classifier
    train_X[['sentiment', 'text']].to_csv('../datasets/movies/fasttext_train.txt', 
                                            index = False, 
                                            sep = ' ',
                                            header = None, 
                                            quoting = csv.QUOTE_NONE, 
                                            quotechar = "", 
                                            escapechar = " ")

    test_X[['sentiment', 'text']].to_csv('../datasets/movies/fasttext_test.txt', 
                                            index = False, 
                                            sep = ' ',
                                            header = None, 
                                            quoting = csv.QUOTE_NONE, 
                                            quotechar = "", 
                                            escapechar = " ")

In [5]:
def prepare_data(train_X, test_X):
    train_X["category"] = train_X["category"].apply(lambda x: "__label__" + x)
    test_X["category"] = test_X["category"].apply(lambda x: "__label__" + x)
    # Saving the CSV file as a text file to train/test the classifier
    train_X[['category', 'text']].to_csv('../datasets/bbc/fasttext_train.txt', 
                                            index = False, 
                                            sep = ' ',
                                            header = None, 
                                            quoting = csv.QUOTE_NONE, 
                                            quotechar = "", 
                                            escapechar = " ")

    test_X[['category', 'text']].to_csv('../datasets/bbc/fasttext_test.txt', 
                                            index = False, 
                                            sep = ' ',
                                            header = None, 
                                            quoting = csv.QUOTE_NONE, 
                                            quotechar = "", 
                                            escapechar = " ")

# Model Training BBC News Categorization

In [6]:
def train(train_path, test_path):
    model = fasttext.train_supervised(train_path, wordNgrams = 1, epoch = 20, lr=0.4)
    n, precision, recall = model.test(test_path)   
    f1 = compute_f1(precision, recall)
    return precision, recall, f1

In [7]:
f1s = []
precisions = []
recalls = []
train_path = "../datasets/bbc/fasttext_train.txt"
test_path = "../datasets/bbc/fasttext_test.txt"
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(bbc_df, bbc_df["category"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    prepare_data(train_X, test_X)
    time.sleep(1)
    precision, recall, f1 = train(train_path, test_path)
    f1s.append(f1)
    precisions.append(precision)
    recalls.append(recall)

Read 0M words
Number of words:  39005
Number of labels: 5
Progress: 100.0% words/sec/thread: 2371220 lr:  0.000000 avg.loss:  0.268089 ETA:   0h 0m 0s
Read 0M words
Number of words:  39736
Number of labels: 5
Progress: 100.0% words/sec/thread: 2146255 lr:  0.000000 avg.loss:  0.334762 ETA:   0h 0m 0s
Read 0M words
Number of words:  38895
Number of labels: 5
Progress: 100.0% words/sec/thread: 2119361 lr:  0.000000 avg.loss:  0.231247 ETA:   0h 0m 0s
Read 0M words
Number of words:  39540
Number of labels: 5
Progress: 100.0% words/sec/thread: 2160571 lr:  0.000000 avg.loss:  0.244408 ETA:   0h 0m 0s
Read 0M words
Number of words:  39414
Number of labels: 5
Progress: 100.0% words/sec/thread: 2166931 lr:  0.000000 avg.loss:  0.313807 ETA:   0h 0m 0s


In [8]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.9730941704035875, 0.9551569506726457, 0.9596412556053812, 0.9596412556053812, 0.9417040358744395]
Precision avg: 0.9578 (+/- 0.0225)
Recall values: [0.9730941704035875, 0.9551569506726457, 0.9596412556053812, 0.9596412556053812, 0.9417040358744395]
Recall avg: 0.9578 (+/- 0.0225)
F1 values: [0.9730941704035875, 0.9551569506726457, 0.9596412556053812, 0.9596412556053812, 0.9417040358744395]
F1 avg: 0.9578 (+/- 0.0225)


# Model Training Movies Sentiment Analysis

In [37]:
f1s = []
precisions = []
recalls = []
train_path = "../datasets/movies/fasttext_train.txt"
test_path = "../datasets/movies/fasttext_test.txt"
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(movies_df, movies_df["sentiment"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    prepare_movies_data(train_X, test_X)
    time.sleep(1)
    precision, recall, f1 = train(train_path, test_path)
    f1s.append(f1)
    precisions.append(precision)
    recalls.append(recall)

Read 0M words
Number of words:  55087
Number of labels: 2
Progress: 100.0% words/sec/thread: 4132050 lr:  0.000000 avg.loss:  0.152790 ETA:   0h 0m 0s
Read 0M words
Number of words:  55509
Number of labels: 2
Progress: 100.0% words/sec/thread: 4238521 lr:  0.000000 avg.loss:  0.168775 ETA:   0h 0m 0s
Read 0M words
Number of words:  55109
Number of labels: 2
Progress: 100.0% words/sec/thread: 4165218 lr:  0.000000 avg.loss:  0.189808 ETA:   0h 0m 0s
Read 0M words
Number of words:  55037
Number of labels: 2
Progress: 100.0% words/sec/thread: 4157227 lr:  0.000000 avg.loss:  0.184182 ETA:   0h 0m 0s
Read 0M words
Number of words:  55222
Number of labels: 2
Progress: 100.0% words/sec/thread: 3558873 lr:  0.000000 avg.loss:  0.178894 ETA:   0h 0m 0s


In [38]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.852, 0.83, 0.838, 0.834, 0.794]
Precision avg: 0.8296 (+/- 0.0431)
Recall values: [0.852, 0.83, 0.838, 0.834, 0.794]
Recall avg: 0.8296 (+/- 0.0431)
F1 values: [0.852, 0.83, 0.838, 0.834, 0.7940000000000002]
F1 avg: 0.8296 (+/- 0.0431)


# Model Training Twitter Analysis

In [69]:
f1s = []
precisions = []
recalls = []
train_path = "../datasets/twitter/fasttext_train.txt"
test_path = "../datasets/twitter/fasttext_test.txt"
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(twitter_df, twitter_df["label"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    prepare_twitter_data(train_X, test_X)
    time.sleep(1)
    precision, recall, f1 = train(train_path, test_path)
    f1s.append(f1)
    precisions.append(precision)
    recalls.append(recall)

Read 0M words
Number of words:  11339
Number of labels: 2
Progress: 100.0% words/sec/thread: 1575134 lr:  0.000000 avg.loss:  0.130671 ETA:   0h 0m 0s
Read 0M words
Number of words:  11313
Number of labels: 2
Progress: 100.0% words/sec/thread: 1542816 lr:  0.000000 avg.loss:  0.158548 ETA:   0h 0m 0s
Read 0M words
Number of words:  11389
Number of labels: 2
Progress: 100.0% words/sec/thread: 1562870 lr:  0.000000 avg.loss:  0.155193 ETA:   0h 0m 0s
Read 0M words
Number of words:  11309
Number of labels: 2
Progress: 100.0% words/sec/thread: 1594767 lr:  0.000000 avg.loss:  0.091563 ETA:   0h 0m 0s
Read 0M words
Number of words:  11375
Number of labels: 2
Progress: 100.0% words/sec/thread:  793853 lr:  0.000000 avg.loss:  0.115710 ETA:   0h 0m 0s


In [70]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.7333333333333333, 0.6770833333333334, 0.6979166666666666, 0.6958333333333333, 0.69375]
Precision avg: 0.6996 (+/- 0.0412)
Recall values: [0.7333333333333333, 0.6770833333333334, 0.6979166666666666, 0.6958333333333333, 0.69375]
Recall avg: 0.6996 (+/- 0.0412)
F1 values: [0.7333333333333333, 0.6770833333333334, 0.6979166666666666, 0.6958333333333333, 0.69375]
F1 avg: 0.6996 (+/- 0.0412)
