In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import string as str
# import seaborn as sns
from bs4 import BeautifulSoup

In [4]:
df_train = pd.read_csv("tweet_training.csv")
df_validation = pd.read_csv("tweet_dev.csv")

df_train

Unnamed: 0.1,Unnamed: 0,sentiment,text
0,0,0,pauln - Got my arse kicked at Crystal Palace -...
1,1,4,"@IHeartLost yeah, it is pretty cool. I prefer ..."
2,2,4,I'm happy... Not for any real reason.. Just ha...
3,3,0,slept very well.take a shower now.have to do m...
4,4,4,"@GOttaviani yesss, it's cool, my favorite tune..."
...,...,...,...
79995,79995,4,raidys bbq tonight free food &amp; morrellis ...
79996,79996,4,Don't you just luv follow fridays--** thank yo...
79997,79997,4,@megodbike ahh I see! you need to go recruit s...
79998,79998,0,is not of housewife material : self-cooked ric...


In [5]:
#Strips emojis
def handle_emojis(tweet):
    # Smile -- :), : ), :-), (:, ( :, (-:, :'),:D, : D, =)
    
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))|:\s?D | =\)', '', tweet)
    # Sad -- :-(, : (, :(, ):, )-: , :p
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)|:p', '', tweet)
    return tweet

In [6]:
#clean data with below components removed or replaced
url = re.compile(r"(?:(http[s]?://\S+)|((//)?(\w+\.)?\w+\.\w+/\S+))")
user_mention = re.compile(r"(?:(?<!\w)@\w+\b)")
number = re.compile(r"(?:\b\d+\b)")
repeated_char = '([a-zA-Z])\\1+'
length_repeated_char = '\\1\\1'

def clean(raw):
  #convert HTML encoding to text
  new_row = BeautifulSoup(raw, 'html.parser').get_text()
  
  #Change all text to lower case
  new_row = new_row.lower()
  
  #Replaces any url with class URL
  new_row = re.sub(url, '', new_row)
  
  #replace any @username with class USERNAME
  new_row = re.sub(user_mention, '', new_row)
  
  #Strips repeated chars
  new_row = re.sub(repeated_char, length_repeated_char, new_row)
  
  #Replaces #hashtag with hashtag
  new_row = re.sub(r'#(\S+)', r' \1 ', new_row)
  
  #Remove numbers
  new_row = re.sub(number, '', new_row)
  
  #decode text with 'utf-8-sig'
  try:
    temp_row = new_row.decode("utf-8-sig").replace(u"\ufffd", "?")     
  except:
    temp_row = new_row
  
  #Removes emojis
  new_row = handle_emojis(temp_row);
  
  return new_row

In [7]:
clean_train = []
clean_validation = []

In [8]:
for i in range(0, 80000):
    clean_train.append(clean(df_train['text'][i]))

for i in range(0, 16000):
    clean_validation.append(clean(df_validation['text'][i]))

In [9]:
#save to csv file
clean_df_train = pd.DataFrame(clean_train, columns=['text'])
clean_df_validation = pd.DataFrame(clean_validation, columns = ['text'])

clean_df_train['target'] = df_train.sentiment
clean_df_validation['target'] = df_validation.sentiment

clean_df_train.to_csv('tweet_clean_train.csv',encoding='utf-8')
clean_df_validation.to_csv("tweet_clean_validation.csv", encoding = "utf-8")

In [10]:
# Using columns 1,2 because col 0 is the number of the row, could be changed if the row number is removed from the csv file.
traindf = pd.read_csv('tweet_clean_train.csv', usecols = [1,2], encoding='latin-1')
validationdf = pd.read_csv("tweet_clean_validation.csv", usecols = [1,2], encoding = "latin-1")

traindf

Unnamed: 0,text,target
0,pauln - got my arse kicked at crystal palace -...,0
1,"yeah, it is pretty cool. i prefer it when eve...",4
2,i'm happy... not for any real reason.. just ha...,4
3,slept very well.take a shower now.have to do m...,0
4,"yess, it's cool, my favorite tune in . intere...",4
...,...,...
79995,raidys bbq tonight free food & morrellis ice ...,4
79996,don't you just luv follow fridays--** thank yo...,4
79997,ahh i see! you need to go recruit some young ...,4
79998,is not of housewife material : self-cooked ric...,0


In [11]:
# Checking read cleaned file info if there are any null entries (found none) or white space
traindf.info()
traindf['text'] = traindf['text'].str.strip()
traindf.drop(traindf[traindf.text == ''].index, inplace=True)
traindf

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 2 columns):
text      80000 non-null object
target    80000 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.2+ MB


Unnamed: 0,text,target
0,pauln - got my arse kicked at crystal palace -...,0
1,"yeah, it is pretty cool. i prefer it when ever...",4
2,i'm happy... not for any real reason.. just ha...,4
3,slept very well.take a shower now.have to do m...,0
4,"yess, it's cool, my favorite tune in . interes...",4
...,...,...
79995,raidys bbq tonight free food & morrellis ice ...,4
79996,don't you just luv follow fridays--** thank yo...,4
79997,ahh i see! you need to go recruit some young l...,4
79998,is not of housewife material : self-cooked ric...,0


In [12]:
X_train = traindf["text"].values
y_train = traindf["target"].values

X_validation = validationdf["text"].values
y_validation = validationdf["target"].values

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import model_selection
from sklearn.metrics import accuracy_score

In [14]:
cvec = CountVectorizer()
tfidf = TfidfVectorizer()

X_train_cvec = cvec.fit_transform(X_train, y_train)
X_train_tf = tfidf.fit_transform(X_train, y_train)

In [15]:
#=====================Generic 5-fold Cross Validation Score==================
# This cross validation will only operate on the training set
# Start the Timer
start = timeit.default_timer()

mnb = MultinomialNB()
results_cv = model_selection.cross_val_score(mnb, X_train_cvec, y_train, cv = 5)
print("Cross validation with countvectorizer: ", results_cv.mean())

results_tf = model_selection.cross_val_score(mnb, X_train_tf, y_train, cv = 5)
print("Cross validation with TfidfVectorizer: ", results_tf.mean())

# Timer stops
stop = timeit.default_timer()
print("Time Execution: {}".format(stop - start))

NameError: name 'timeit' is not defined

In [316]:
# function for evaluating the accuracy of a given classifier and vectorizer
def nfeature_accuracy_checker(vectorizer=None, n_features=None, stop_words=None, ngram_range=(1, 1), classifier=None):
    result = []
    print (classifier)
    print ("\n")
    for n in n_features:
        vectorizer.set_params(stop_words=stop_words, max_features=n, ngram_range=ngram_range)
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier)
        ])
        t0 = time()
        sentiment_fit = checker_pipeline.fit(X_train, y_train)
        y_pred = sentiment_fit.predict(X_validation)
        train_test_time = time() - t0
        accuracy = accuracy_score(y_pred, y_validation)
        print("accuracy with", n, "features:", accuracy)
    print("\n")

In [317]:
from sklearn.pipeline import Pipeline
from time import time

In [221]:
cvec = CountVectorizer()
tfidf = TfidfVectorizer()
mnb = MultinomialNB()
n_features = np.arange(10000,50001,10000)

In [222]:
stop_word_extractor = [None, "english"]
vectorizer = [cvec, tfidf]

In [223]:
# unigram case
for i in range(len(stop_word_extractor)):
    if i == 0:
        print("==============================INCLUDING STOP WORDS============================")
    else:
        print("==============================NOT INCLUDING STOP WORDS============================")
    for j in range(len(vectorizer)):
        if j == 0:
            print("\t=================COUNTVECTORIZER===============")
        else:
            print("\t=================TFIDF=============")
        feature_result_unigram = nfeature_accuracy_checker(vectorizer=vectorizer[j], n_features=n_features, stop_words=stop_word_extractor[i], classifier=mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 10000 features: 0.7729375
accuracy with 20000 features: 0.773125
accuracy with 30000 features: 0.7725625
accuracy with 40000 features: 0.7720625
accuracy with 50000 features: 0.7723125


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 10000 features: 0.7689375
accuracy with 20000 features: 0.7686875
accuracy with 30000 features: 0.7693125
accuracy with 40000 features: 0.7684375
accuracy with 50000 features: 0.7685625


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 10000 features: 0.7536875
accuracy with 20000 features: 0.7541875
accuracy with 30000 features: 0.7549375
accuracy with 40000 features: 0.7545625
accuracy with 50000 features: 0.7545625


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 10000 features: 0.7511875
accuracy with 20000 features: 0.75225
accuracy with 30000 features: 0.754125
accuracy with 40000 features: 0.75

In [224]:
# bigram case
# in here, we modify the number of features from the range 30000 to 130000, since 
# much more features will be created

n_features = np.arange(30000,130001,10000)

for i in range(len(stop_word_extractor)):
    if i == 0:
        print("==============================INCLUDING STOP WORDS============================")
    else:
        print("==============================NOT INCLUDING STOP WORDS============================")
    for j in range(len(vectorizer)):
        if j == 0:
            print("\t=================COUNTVECTORIZER===============")
        else:
            print("\t=================TFIDF=============")
        feature_result_unigram = nfeature_accuracy_checker(vectorizer=vectorizer[j], n_features=n_features, stop_words=stop_word_extractor[i], ngram_range = (1, 2), classifier=mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 30000 features: 0.791
accuracy with 40000 features: 0.791375
accuracy with 50000 features: 0.79175
accuracy with 60000 features: 0.79075
accuracy with 70000 features: 0.7908125
accuracy with 80000 features: 0.7911875
accuracy with 90000 features: 0.7914375
accuracy with 100000 features: 0.79175
accuracy with 110000 features: 0.7919375
accuracy with 120000 features: 0.792625
accuracy with 130000 features: 0.79325


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 30000 features: 0.7908125
accuracy with 40000 features: 0.792375
accuracy with 50000 features: 0.7930625
accuracy with 60000 features: 0.793
accuracy with 70000 features: 0.792625
accuracy with 80000 features: 0.7941875
accuracy with 90000 features: 0.7948125
accuracy with 100000 features: 0.795125
accuracy with 110000 features: 0.7945625
accuracy with 120000 features: 0.794125
accuracy with 130000 features: 0.7948125


Multinom

In [225]:
# trigram case
# We furthur extent the number of features to the range of 120000

n_features = np.arange(60000,160001,10000)

for i in range(len(stop_word_extractor)):
    if i == 0:
        print("==============================INCLUDING STOP WORDS============================")
    else:
        print("==============================NOT INCLUDING STOP WORDS============================")
    for j in range(len(vectorizer)):
        if j == 0:
            print("\t=================COUNTVECTORIZER===============")
        else:
            print("\t=================TFIDF=============")
        feature_result_unigram = nfeature_accuracy_checker(vectorizer=vectorizer[j], n_features=n_features, stop_words=stop_word_extractor[i], ngram_range = (1, 3), classifier=mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 60000 features: 0.7889375
accuracy with 70000 features: 0.79
accuracy with 80000 features: 0.7894375
accuracy with 90000 features: 0.7888125
accuracy with 100000 features: 0.788875
accuracy with 110000 features: 0.7881875
accuracy with 120000 features: 0.7885
accuracy with 130000 features: 0.788375
accuracy with 140000 features: 0.78825
accuracy with 150000 features: 0.789375
accuracy with 160000 features: 0.7894375


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


accuracy with 60000 features: 0.7885625
accuracy with 70000 features: 0.7905625
accuracy with 80000 features: 0.78975
accuracy with 90000 features: 0.79
accuracy with 100000 features: 0.7905625
accuracy with 110000 features: 0.79
accuracy with 120000 features: 0.7896875
accuracy with 130000 features: 0.7905625
accuracy with 140000 features: 0.7916875
accuracy with 150000 features: 0.7914375
accuracy with 160000 features: 0.7928125


Multi

In [348]:
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')
analyzer = CountVectorizer().build_analyzer()

def stem_analyzer(document):
    return (stemmer.stem(w) for w in analyzer(document))

In [352]:
# Lemmatizing through the analyzer
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
analyzer = CountVectorizer().build_analyzer()

def lemm_analyzer(document):
    return (lemmatizer.lemmatize(w) for w in analyzer(document))

In [356]:
mnbGV = MultinomialNB()


tvecGV = TfidfVectorizer(ngram_range = (1, 2), max_features = 100000)
# tvecGV = TfidfVectorizer(ngram_range = (1, 2), max_features = 100000, stop_words = "english")
# tvecGV = TfidfVectorizer(ngram_range = (1, 2), max_features = 100000, analyzer = lemm_analyzer)
X_tfidf = tvecGV.fit_transform(X_train)

In [357]:
X_tfidf.shape

(79846, 100000)

In [358]:
#-----------------------------GRID SEARCH CROSS VALIDATION------------------------------
from sklearn.model_selection import GridSearchCV

# Timer begins
start = timeit.default_timer()

tuned_parameters = [{'alpha' : [2.05, 2.06, 2.07, 2.1, 2.15, 2.2]}]
n_folds = 5

grid_search = GridSearchCV(estimator = mnb, param_grid = tuned_parameters, cv = n_folds, refit = False, n_jobs = -1)

grid_search.fit(X_tfidf, y_train)

scores = grid_search.cv_results_['mean_test_score']
scores_std = grid_search.cv_results_['std_test_score']
print('scores:',scores)
print('scores_std',scores_std)

# Optimal hyperparameter
bestAlpha = grid_search.best_params_['alpha']
print(grid_search.best_params_)

# Timer stops
stop = timeit.default_timer()
print("Time Execution: {}".format(stop - start))
# #-----------------------------END OF GRID SEARCH-----------------------------------------

scores: [0.78605065 0.7860256  0.78606317 0.78588783 0.78587531 0.78591288]
scores_std [0.00287893 0.00293372 0.00299004 0.00300964 0.00306567 0.00304846]
{'alpha': 2.07}
Time Execution: 1.3531304229982197


In [1]:
mnbFinal = MultinomialNB(alpha = 2.2)
X_final = tvecGV.transform(X_validation)

NameError: name 'MultinomialNB' is not defined

In [2]:
mnbFinal.fit(X_tfidf, y_train)

NameError: name 'mnbFinal' is not defined

In [335]:
y_pred = mnbFinal.predict(X_final)
y_pred.shape

(16000,)

In [336]:
print(accuracy_score(y_pred, y_validation))

0.795875


In [362]:
#====================Bagging Classifier with NB Model================
from sklearn.ensemble import BaggingClassifier
from sklearn import model_selection

# Timer begins
start = timeit.default_timer()

# Re-create the multinomialNB model, this time using the optimized parameter
# that was found through the GridSearchCV
mnb = MultinomialNB(alpha = 2.1)

bg = BaggingClassifier(mnb, max_samples = 0.6, max_features = 0.5, n_estimators = 200)
results = model_selection.cross_val_score(bg, X_tfidf, y_train, cv = 5)
print(results.mean())

# Timer stops
stop = timeit.default_timer()
print("Time Execution bagging classifier: {}".format(stop - start))
#==============================================================

0.7838839991054993
Time Execution bagging classifier: 53.636927380997804


In [363]:
#====================TESTING SECTION========================

In [364]:
df_test_pre = pd.read_csv('testdata.manual.2009.06.14.csv', header = None)

clean_test = []

for i in range(0, 498):
    clean_test.append(clean(df_test_pre[5][i]))
    

In [365]:
#save to csv file
tweet_test = pd.DataFrame(clean_test, columns=['text'])
tweet_test['target'] = df_test_pre[0]   # sentiment

tweet_test.to_csv('tweet_clean_test.csv',encoding='utf-8')

In [366]:
df_test = pd.read_csv("tweet_clean_test.csv")

In [367]:
# cols = ['sentiment','id','date','query_string','user','text']


# drop rows with neutral sentiment 
df_test.drop(df_test[df_test.target == 2].index, inplace=True)
# drop rows with retweet text 
# df_test.drop(df_test[df_test[5].str.contains(' RT ')].index, inplace=True)

df_test

Unnamed: 0.1,Unnamed: 0,text,target
0,0,i loovvee my kindle2. not that the dx is cool...,4
1,1,reading my kindle2... love it... lee childs i...,4
2,2,"ok, first assesment of the kindle2 ...it fuc...",4
3,3,you'll love your kindle2. i've had mine for a...,4
4,4,fair enough. but i have the kindle2 and i th...,4
...,...,...,...
492,492,"after using latex a lot, any other typeset mat...",4
494,494,"on that note, i hate word. i hate pages. i hat...",0
495,495,ahh... back in a *real* text editing environme...,4
496,496,"trouble in iran, i see. hmm. iran. iran so far...",0


In [368]:
X_test = df_test['text']
y_test = df_test['target']

In [369]:
X_final_test = tvecGV.transform(X_test)

In [370]:
bg.fit(X_tfidf, y_train)

BaggingClassifier(base_estimator=MultinomialNB(alpha=2.1, class_prior=None,
                                               fit_prior=True),
                  bootstrap=True, bootstrap_features=False, max_features=0.5,
                  max_samples=0.6, n_estimators=200, n_jobs=None,
                  oob_score=False, random_state=None, verbose=0,
                  warm_start=False)

In [371]:
y_pred_final = bg.predict(X_final_test)

In [372]:
accuracy_score(y_pred_final, y_test)

0.8217270194986073