In [213]:
import pandas as pd
import zipfile
import nltk
from nltk.probability import FreqDist
import warnings
warnings.simplefilter('default', ImportWarning)

In [78]:
# Loading the Dataset
zf = zipfile.ZipFile('Sentiment140.csv.zip')
df = pd.read_csv(zf.open('Sentiment140.csv'))

In [79]:
# getting 20000 random rows to develop the analysis
df = df.sample(n=20000)
df.reset_index(drop=True,inplace=True)

In [80]:
# Dataframe sample
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,2012883900,Tue Jun 02 21:28:37 PDT 2009,NO_QUERY,peterdarlington,@dubdotdash I don't have aircon Lovely office...
1,0,2052369572,Sat Jun 06 00:30:08 PDT 2009,NO_QUERY,Preetha_87,Yesterday wasn't a great day..my best friend's...
2,4,1967982555,Fri May 29 20:33:23 PDT 2009,NO_QUERY,LMRB,@hma4983 It's the same house We came back &am...
3,4,1967192097,Fri May 29 19:08:12 PDT 2009,NO_QUERY,gabeezy,@reginaislegit is the new david sides!!! http...
4,4,1970340485,Sat May 30 03:27:56 PDT 2009,NO_QUERY,SharpShooter67,Driva! On the road again...


In [81]:
# open the file with the functions
%run Challenge_2.ipynb

In [82]:
# Prepare Textual Data for Sentiment Analysis
def text_proc(x):
    text = clean_up(x)
    text = tokenize(text)
    text = stem_and_lemmatize(text)
    text = remove_stopwords(text)
    return text

In [83]:
# creting the new column with the text processed
df['text_processed'] = df['text'].apply(lambda x: text_proc(x))

In [86]:
# Dataframe sample
df.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
0,0,2012883900,Tue Jun 02 21:28:37 PDT 2009,NO_QUERY,peterdarlington,@dubdotdash I don't have aircon Lovely office...,"[dubdotdash, aircon, love, offic, love, view, ..."
1,0,2052369572,Sat Jun 06 00:30:08 PDT 2009,NO_QUERY,Preetha_87,Yesterday wasn't a great day..my best friend's...,"[yesterday, great, day, best, friend, parent, ..."
2,4,1967982555,Fri May 29 20:33:23 PDT 2009,NO_QUERY,LMRB,@hma4983 It's the same house We came back &am...,"[hma, hous, came, back, amp, outbid, buyer, go..."
3,4,1967192097,Fri May 29 19:08:12 PDT 2009,NO_QUERY,gabeezy,@reginaislegit is the new david sides!!! http...,"[reginaislegit, new, david, side, p]"
4,4,1970340485,Sat May 30 03:27:56 PDT 2009,NO_QUERY,SharpShooter67,Driva! On the road again...,"[driva, road]"


In [87]:
# Creating Bag of Words
words_lst = []

for i in df['text_processed']:
    words_lst += i

In [88]:
# Sample of words list
print(words_lst[:100])

['dubdotdash', 'aircon', 'love', 'offic', 'love', 'view', 'veri', 'veri', 'chilli', 'yesterday', 'great', 'day', 'best', 'friend', 'parent', 'met', 'accid', 'thank', 'go', 'seriou', 'feel', 'freakin', 'bad', 'hma', 'hous', 'came', 'back', 'amp', 'outbid', 'buyer', 'go', 'ani', 'higher', 'though', 'bid', 'u', 'reginaislegit', 'new', 'david', 'side', 'p', 'driva', 'road', 'chrispenn', 'tweet', 'alert', 'keep', 'go', 'spam', 'box', 'time', 'pas', 'hurt', 'even', 'merz', 'soundcheck', 'parti', 'goer', 'get', 'one', 'mask', 'aheartofstar', 'damn', 'girl', 'fuck', 'suck', 'whi', 'aint', 'text', 'noth', 'hope', 'feel', 'better', 'way', 'way', 'way', 'alaska', 'mayn', 'new', 'moon', 'trailer', 'look', 'soooo', 'good', 'trinawright', 'well', 'yeah', 'hand', 'fuse', 'togeth', 'render', 'key', 'useless', 'key', 'liter', 'pocket', 'finish', 'veri', 'last', 'high']


In [89]:
# Lenght of words list
print(len(words_lst))

157531


In [90]:
# Most frequent words
fdist = FreqDist(words_lst)

top_5000 = fdist.most_common(5000)

In [91]:
# Sample of 5000 most frequent words
top_5000[:10]

[('go', 1721),
 ('get', 1432),
 ('day', 1417),
 ('wa', 1264),
 ('thi', 1214),
 ('good', 1153),
 ('love', 1072),
 ('work', 1067),
 ('like', 1038),
 ('quot', 912)]

In [188]:
# Building Features matrix
col = [i[0] for i in top_5000]

feat = df[['text_processed','target']]

feat['is_positive'] = feat['target'].apply(lambda x: True if x==4 else False)

feat.drop('target', axis=1, inplace=True)

for i in col:
    feat[i] = feat['text_processed'].apply(lambda x: True if i in x else False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['is_positive'] = feat['target'].apply(lambda x: True if x==4 else False)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat[i] = feat['text_processed'].apply(lambda x: True if i in x else False)


In [189]:
#Features matrix
feat.head()

Unnamed: 0,text_processed,is_positive,go,get,day,wa,thi,good,love,work,...,greec,rele,joejona,sixth,fate,xlc,borderlin,ufo,alexi,zabriel
0,"[dubdotdash, aircon, love, offic, love, view, ...",False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,"[yesterday, great, day, best, friend, parent, ...",False,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,"[hma, hous, came, back, amp, outbid, buyer, go...",True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,"[reginaislegit, new, david, side, p]",True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,"[driva, road]",True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [190]:
# Features structure for nltk.NaiveBayesClassifier.train
feature_lst = []

for i in range(len(feat)):
    is_pos = feat.loc[i,'is_positive']
    dic = {}
    for e in col:
        dic[e] = feat.loc[i,e]
    feature_lst.append((dic,is_pos))

In [204]:
len(feature_lst)

20000

In [206]:
# split in train and test sets
train_set, test_set = feature_lst[:16000], feature_lst[-4000:]

In [212]:
# training the model
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [216]:
#Most Informative Features
classifier.show_most_informative_features(10)

Most Informative Features
                    sigh = True            False : True   =     22.8 : 1.0
            followfriday = True             True : False  =     19.9 : 1.0
                     thx = True             True : False  =     19.2 : 1.0
                 congrat = True             True : False  =     16.2 : 1.0
                     vip = True             True : False  =     16.0 : 1.0
                     sad = True            False : True   =     15.1 : 1.0
                   badli = True            False : True   =     11.9 : 1.0
                  welcom = True             True : False  =     11.4 : 1.0
                     bum = True            False : True   =     11.3 : 1.0
                    grrr = True            False : True   =     10.6 : 1.0


In [214]:
# Testing Naive Bayes Model
print(nltk.classify.accuracy(classifier, test_set))

0.731
