In [118]:
import re
# NOTE: this alias shadows the built-in `str` for the rest of the notebook.
import string as str
import timeit

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from bs4 import BeautifulSoup

In [137]:
# Load the two raw tweet files: the training split and the dev split
# (the latter is used as the validation set below).
df_train, df_validation = (
    pd.read_csv(csv_path) for csv_path in ("tweet_training.csv", "tweet_dev.csv")
)

df_train

Unnamed: 0.1,Unnamed: 0,sentiment,text
0,0,0,pauln - Got my arse kicked at Crystal Palace -...
1,1,4,"@IHeartLost yeah, it is pretty cool. I prefer ..."
2,2,4,I'm happy... Not for any real reason.. Just ha...
3,3,0,slept very well.take a shower now.have to do m...
4,4,4,"@GOttaviani yesss, it's cool, my favorite tune..."
...,...,...,...
79995,79995,4,raidys bbq tonight free food &amp; morrellis ...
79996,79996,4,Don't you just luv follow fridays--** thank yo...
79997,79997,4,@megodbike ahh I see! you need to go recruit s...
79998,79998,0,is not of housewife material : self-cooked ric...


In [138]:
#Strips emoticons
def handle_emojis(tweet):
    """Remove common ASCII emoticons from ``tweet`` and return the result.

    Happy forms removed: ``:)``, ``: )``, ``:-)``, ``(:``, ``( :``, ``(-:``,
    ``:')``, ``:D`` / ``: D`` (either letter case) and ``=)``.
    Sad/other forms removed: ``:(``, ``: (``, ``:-(``, ``):``, ``) :``,
    ``)-:`` and ``:p``.

    Parameters
    ----------
    tweet : text to process.

    Returns
    -------
    The text with every matched emoticon deleted. Surrounding whitespace is
    left in place.
    """
    # `[dD]` is needed because clean() lower-cases the tweet before calling
    # this function, so an upper-case-only `D` would never match. The `\b`
    # stops ": done" / ":dance" from being eaten. Unlike the earlier pattern,
    # `:D` and `=)` no longer require an adjacent space, so they are also
    # removed at the very start or end of the text.
    tweet = re.sub(r"(:\s?\)|:-\)|\(\s?:|\(-:|:'\))|:\s?[dD]\b|=\)", '', tweet)
    # Sad -- :-(, : (, :(, ):, )-: , :p
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)|:p', '', tweet)
    return tweet

In [139]:
# Patterns used by clean() to delete or normalise noisy parts of a tweet.
# Matches http(s):// links, or scheme-less "host.tld/path" style links.
url = re.compile(r"(?:(http[s]?://\S+)|((//)?(\w+\.)?\w+\.\w+/\S+))")
# Matches @username when the "@" is not preceded by a word character
# (so the "@" inside something like an e-mail address is not matched).
user_mention = re.compile(r"(?:(?<!\w)@\w+\b)")
# Matches stand-alone runs of digits (digits glued to letters are not matched).
number = re.compile(r"(?:\b\d+\b)")
# Matches an ASCII letter followed by one or more repeats of the same letter ...
repeated_char = '([a-zA-Z])\\1+'
# ... and this replacement keeps exactly two copies of that letter.
length_repeated_char = '\\1\\1'

def clean(raw):
  """Normalise one raw tweet and return the cleaned text.

  Steps, in order: convert HTML to plain text with BeautifulSoup,
  lower-case, delete URLs, delete @mentions, squeeze runs of a repeated
  letter down to two, unwrap #hashtags, delete stand-alone numbers, and
  finally strip emoticons with handle_emojis().

  Parameters
  ----------
  raw : the tweet text as read from the CSV.

  Returns
  -------
  The cleaned, lower-cased text.
  """
  #convert HTML encoding to text
  new_row = BeautifulSoup(raw, 'html.parser').get_text()

  #Change all text to lower case
  new_row = new_row.lower()

  #Remove URLs (deleted outright, not replaced by a placeholder token)
  new_row = re.sub(url, '', new_row)

  #Remove @username mentions (deleted outright as well)
  new_row = re.sub(user_mention, '', new_row)

  #Squeeze repeated letters down to two, e.g. "yesss" -> "yess"
  new_row = re.sub(repeated_char, length_repeated_char, new_row)

  #Replace #hashtag with the bare word, padded by spaces
  new_row = re.sub(r'#(\S+)', r' \1 ', new_row)

  #Remove numbers
  new_row = re.sub(number, '', new_row)

  # The old `.decode("utf-8-sig")` step is gone: on Python 3 the text is a
  # str with no .decode(), so that call always raised and the bare `except:`
  # silently fell back to the unchanged text. Dropping it keeps the output
  # identical while no longer hiding unrelated errors.

  #Remove emoticons
  return handle_emojis(new_row)

In [140]:
# Accumulators for the cleaned text of each split (two independent lists).
clean_train, clean_validation = [], []

In [141]:
# Clean every tweet of each split.
# The lists are rebuilt from scratch, so re-running this cell cannot append
# duplicates, and iterating the column itself removes the hard-coded row
# counts (80000 / 16000) that broke whenever the files had a different size.
clean_train = [clean(tweet) for tweet in df_train['text']]
clean_validation = [clean(tweet) for tweet in df_validation['text']]

In [142]:
# Pair each cleaned text with its sentiment label and write both splits to disk.
clean_df_train = pd.DataFrame({'text': clean_train}).assign(target=df_train['sentiment'])
clean_df_validation = pd.DataFrame({'text': clean_validation}).assign(target=df_validation['sentiment'])

# The row index is written as the first CSV column.
clean_df_train.to_csv('tweet_clean_train.csv', encoding='utf-8')
clean_df_validation.to_csv("tweet_clean_validation.csv", encoding="utf-8")

In [143]:
# Using columns 1,2 because col 0 is the number of the row, could be changed if the row number is removed from the csv file.
# The files were written with encoding='utf-8', so they are read back with the
# same codec; reading them as latin-1 turned every non-ASCII character into
# several garbage characters.
traindf = pd.read_csv('tweet_clean_train.csv', usecols = [1,2], encoding='utf-8')
validationdf = pd.read_csv("tweet_clean_validation.csv", usecols = [1,2], encoding = "utf-8")

traindf

Unnamed: 0,text,target
0,pauln - got my arse kicked at crystal palace -...,0
1,"yeah, it is pretty cool. i prefer it when eve...",4
2,i'm happy... not for any real reason.. just ha...,4
3,slept very well.take a shower now.have to do m...,0
4,"yess, it's cool, my favorite tune in . intere...",4
...,...,...
79995,raidys bbq tonight free food & morrellis ice ...,4
79996,don't you just luv follow fridays--** thank yo...,4
79997,ahh i see! you need to go recruit some young ...,4
79998,is not of housewife material : self-cooked ric...,0


In [144]:
# Inspect the re-loaded training frame for null entries (none were found),
# then trim surrounding whitespace and discard rows whose text is empty.
traindf.info()
traindf['text'] = traindf['text'].str.strip()
blank_rows = traindf.loc[traindf['text'] == ''].index
traindf.drop(index=blank_rows, inplace=True)
traindf

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 2 columns):
text      80000 non-null object
target    80000 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.2+ MB


Unnamed: 0,text,target
0,pauln - got my arse kicked at crystal palace -...,0
1,"yeah, it is pretty cool. i prefer it when ever...",4
2,i'm happy... not for any real reason.. just ha...,4
3,slept very well.take a shower now.have to do m...,0
4,"yess, it's cool, my favorite tune in . interes...",4
...,...,...
79995,raidys bbq tonight free food & morrellis ice ...,4
79996,don't you just luv follow fridays--** thank yo...,4
79997,ahh i see! you need to go recruit some young l...,4
79998,is not of housewife material : self-cooked ric...,0


In [146]:
# Pull text (features) and target (labels) out of each frame as plain arrays.
X_train, y_train = traindf["text"].values, traindf["target"].values
X_validation, y_validation = validationdf["text"].values, validationdf["target"].values

In [156]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import model_selection
from sklearn.metrics import accuracy_score

In [157]:
# Default bag-of-words and TF-IDF vectorisers, each fitted on the training text.
cvec, tfidf = CountVectorizer(), TfidfVectorizer()

X_train_cvec = cvec.fit_transform(X_train, y_train)
X_train_tf = tfidf.fit_transform(X_train, y_train)

In [160]:
#=====================Generic 5-fold Cross Validation Score==================
# This cross validation will only operate on the training set.
# `timeit` comes from the import cell at the top of the notebook; it was
# previously used here without ever being imported.
# Start the Timer
start = timeit.default_timer()

mnb = MultinomialNB()

# Same classifier, two feature representations.
results_cv = model_selection.cross_val_score(mnb, X_train_cvec, y_train, cv = 5)
results_tf = model_selection.cross_val_score(mnb, X_train_tf, y_train, cv = 5)
for label, fold_scores in (
    ("Cross validation with countvectorizer: ", results_cv),
    ("Cross validation with TfidfVectorizer: ", results_tf),
):
    print(label, fold_scores.mean())

# Timer stops
stop = timeit.default_timer()
print("Time Execution: {}".format(stop - start))

Cross validation with countvectorizer:  0.7649350030210773
Cross validation with TfidfVectorizer:  0.7615534912606388
Time Execution: 0.493951937998645


In [161]:
# function for evaluating the accuracy of a given classifier and vectorizer
def nfeature_accuracy_checker(vectorizer=None, n_features=None, stop_words=None, ngram_range=(1, 1), classifier=None):
    """Fit vectorizer+classifier once per vocabulary size and report accuracy.

    For each ``n`` in ``n_features`` the vectorizer is reconfigured in place
    (``stop_words``, ``max_features=n``, ``ngram_range``), a Pipeline is fitted
    on the notebook-level ``X_train`` / ``y_train`` and scored on
    ``X_validation`` / ``y_validation``. The accuracy is printed per size.

    Parameters
    ----------
    vectorizer : estimator exposing ``set_params`` (e.g. CountVectorizer).
    n_features : iterable of ``max_features`` values to try.
    stop_words : value forwarded to the vectorizer's ``stop_words``.
    ngram_range : value forwarded to the vectorizer's ``ngram_range``.
    classifier : estimator used as the final pipeline step.

    Returns
    -------
    list of ``(n, accuracy, train_test_time)`` tuples, one per entry of
    ``n_features``. (Previously the list was created but never filled or
    returned, so callers always received None.)
    """
    result = []
    print (classifier)
    print ("\n")
    for n in n_features:
        vectorizer.set_params(stop_words=stop_words, max_features=n, ngram_range=ngram_range)
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier)
        ])
        t0 = time()
        sentiment_fit = checker_pipeline.fit(X_train, y_train)
        # The notebook only ever defines X_validation / y_validation; the
        # X_test / y_test names used here before do not exist on a fresh kernel.
        y_pred = sentiment_fit.predict(X_validation)
        train_test_time = time() - t0
        accuracy = accuracy_score(y_validation, y_pred)
        print("accuracy with", n, "features:", accuracy)
        result.append((n, accuracy, train_test_time))
    print("\n")
    return result

In [162]:
from sklearn.pipeline import Pipeline
from time import time

In [163]:
# Fresh, unfitted estimators for the feature-count experiments.
cvec, tfidf, mnb = CountVectorizer(), TfidfVectorizer(), MultinomialNB()
# Vocabulary sizes to try: 10000, 20000, ..., 50000.
n_features = np.arange(10000, 50001, 10000)

In [164]:
# Stop-word settings to compare: index 0 keeps stop words (None),
# index 1 passes "english" as the vectorizer's stop_words value.
stop_word_extractor = [None, "english"]
# Vectorizers to compare: index 0 is the count vectorizer, index 1 the TF-IDF one.
vectorizer = [cvec, tfidf]

In [165]:
# unigram case
# Outer loop walks the stop-word settings, inner loop the vectorizers; a
# banner is printed before each combination is evaluated.
for i, stop_setting in enumerate(stop_word_extractor):
    print(
        "==============================INCLUDING STOP WORDS============================"
        if i == 0 else
        "==============================NOT INCLUDING STOP WORDS============================"
    )
    for j, vec in enumerate(vectorizer):
        print(
            "\t=================COUNTVECTORIZER==============="
            if j == 0 else
            "\t=================TFIDF============="
        )
        feature_result_unigram = nfeature_accuracy_checker(
            vectorizer=vec,
            n_features=n_features,
            stop_words=stop_setting,
            classifier=mnb,
        )

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 10000 features: 0.7713125
accuracy with 20000 features: 0.772125
accuracy with 30000 features: 0.771125
accuracy with 40000 features: 0.77
accuracy with 50000 features: 0.7705


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 10000 features: 0.7678125
accuracy with 20000 features: 0.766875
accuracy with 30000 features: 0.7675
accuracy with 40000 features: 0.767
accuracy with 50000 features: 0.767375


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 10000 features: 0.7489375
accuracy with 20000 features: 0.749125
accuracy with 30000 features: 0.75
accuracy with 40000 features: 0.748875
accuracy with 50000 features: 0.7485625


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 10000 features: 0.747875
accuracy with 20000 features: 0.7483125
accuracy with 30000 features: 0.7505625
accuracy with 40000 features: 0.749625
accuracy with 5000

In [None]:
# bigram case
# Unigrams + bigrams yield a much larger vocabulary, so the sizes tried here
# run from 30000 to 130000 in steps of 10000.

n_features = np.arange(30000, 130001, 10000)

for i, stop_setting in enumerate(stop_word_extractor):
    print(
        "==============================INCLUDING STOP WORDS============================"
        if i == 0 else
        "==============================NOT INCLUDING STOP WORDS============================"
    )
    for j, vec in enumerate(vectorizer):
        print(
            "\t=================COUNTVECTORIZER==============="
            if j == 0 else
            "\t=================TFIDF============="
        )
        feature_result_unigram = nfeature_accuracy_checker(
            vectorizer=vec,
            n_features=n_features,
            stop_words=stop_setting,
            ngram_range=(1, 2),
            classifier=mnb,
        )

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 30000 features: 0.7878125
accuracy with 40000 features: 0.7888125
accuracy with 50000 features: 0.7888125
accuracy with 60000 features: 0.7885
accuracy with 70000 features: 0.788875
accuracy with 80000 features: 0.789125


In [None]:
# trigram case
# Unigrams through trigrams: the vocabulary sizes tried here run from 10000
# to 150000 in steps of 10000.

n_features = np.arange(10000, 150001, 10000)

for i, stop_setting in enumerate(stop_word_extractor):
    print(
        "==============================INCLUDING STOP WORDS============================"
        if i == 0 else
        "==============================NOT INCLUDING STOP WORDS============================"
    )
    for j, vec in enumerate(vectorizer):
        print(
            "\t=================COUNTVECTORIZER==============="
            if j == 0 else
            "\t=================TFIDF============="
        )
        feature_result_unigram = nfeature_accuracy_checker(
            vectorizer=vec,
            n_features=n_features,
            stop_words=stop_setting,
            ngram_range=(1, 3),
            classifier=mnb,
        )

In [96]:
# Classifier instance plus the TF-IDF representation used from here on:
# unigrams and bigrams, vocabulary capped at 50000 terms.
mnbGV = MultinomialNB()

tvecGV = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=50000,
)
X_tfidf = tvecGV.fit_transform(X_train)


In [97]:
# Sanity check: (number of training documents, number of TF-IDF features).
X_tfidf.shape

(80000, 50000)

In [112]:
#-----------------------------GRID SEARCH CROSS VALIDATION------------------------------
# Tunes MultinomialNB's smoothing parameter `alpha` with 5-fold cross
# validation on the TF-IDF training matrix.
# `timeit` comes from the import cell at the top of the notebook.
from sklearn.model_selection import GridSearchCV

# Timer begins
start = timeit.default_timer()

tuned_parameters = [{'alpha' : [2.05, 2.1, 2.15, 2.2]}]
n_folds = 5

# refit = False: only the CV scores are needed here; the final model is
# trained separately further down.
grid_search = GridSearchCV(estimator = mnb, param_grid = tuned_parameters, cv = n_folds, refit = False, n_jobs = -1)

# X_tfidf was built from X_train, so the matching labels are y_train.
# (A bare `y` was used before, which no cell in the notebook defines.)
grid_search.fit(X_tfidf, y_train)

scores = grid_search.cv_results_['mean_test_score']
scores_std = grid_search.cv_results_['std_test_score']
print('scores:',scores)
print('scores_std',scores_std)

# Optimal hyperparameter
bestAlpha = grid_search.best_params_['alpha']
print(grid_search.best_params_)

# Timer stops
stop = timeit.default_timer()
print("Time Execution: {}".format(stop - start))
# #-----------------------------END OF GRID SEARCH-----------------------------------------

scores: [0.78135   0.7816875 0.78145   0.7812875]
scores_std [0.00353832 0.00354304 0.00358129 0.00377339]
{'alpha': 2.1}
Time Execution: 0.8598041959994589


In [113]:
# Final classifier uses the alpha selected by the grid search (2.1 in the
# recorded run) instead of a hand-copied constant.
mnbFinal = MultinomialNB(alpha = bestAlpha)
# Vectorise the held-out split with the already-fitted TF-IDF vocabulary.
# The notebook defines X_validation; `X_test` was never assigned anywhere.
X_final = tvecGV.transform(X_validation)

In [115]:
# Train on the full training matrix; y_train holds the labels for X_tfidf
# (the previously used name `y` is not defined by any cell).
mnbFinal.fit(X_tfidf, y_train)

MultinomialNB(alpha=2.1, class_prior=None, fit_prior=True)

In [116]:
# Predict sentiment labels for the vectorised held-out tweets.
y_pred = mnbFinal.predict(X_final)

In [117]:
# Held-out accuracy. accuracy_score expects (y_true, y_pred); the true labels
# are y_validation (`y_test` is not defined by any cell).
print(accuracy_score(y_validation, y_pred))

0.7915
