# Sentiment Analysis

This is to train a classifier on a robust and tagged twitter corpus( obtained from a kaggle challenge ) and allow the model to be used for tagging the scraped tweets  as either 0(negative) or 1(positive).

## Loading Data

In [28]:
'''
Get the dataset.
'''
import os
import numpy as np
import pandas as pd
import pprint
from sklearn.model_selection import train_test_split

data = open("kaggleTweets.csv", "rb")
df = pd.read_csv(data, error_bad_lines=False, usecols=['Sentiment', 'SentimentText'], encoding='utf-8')
df = df[:100000]
df.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


## Data Pre-processing

In [1]:
import re

def preprocessor(text):
    try:
        text = re.sub('<[^>]*>', ' ', text)    # removes HTML from tweets
    except:
        text = text
    try:
        text = re.sub('(http|https)://[^ ]+ ', '', text)    # removes all the hyperlinks
    except:
        text = text
    try:
        text = re.sub('\s\s+', '', text)    # removes all the extra whitespaces
    except:
        text = text
    try:
        emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P|[^T_T])', text)    #find all emoticons
    except:
        text = text
    try:
        text = re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '')  # appends emmoticons at the end.
    except:
        text = text
    
    return text

In [32]:
preprocessor(df.loc[14000, "SentimentText"])    #example

u' quot then jesus said have dinner with me quot and the man responded quot first send me your resume and photo ;a;f'

In [33]:
df['SentimentText'] = df['SentimentText'].apply(preprocessor)

In [34]:
df.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my apl friend
1,0,i missed the new moon trailer
2,1,omg its already 7 30 o:3:O
3,0,omgaga im soooim gunna cry i ve been at this ...
4,0,i think mi bf is cheating on me t_t


In [2]:
# Processing into tokens
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
#     return [porter.stem(word) for word in text.split()]
    for word in text.split():
        try:
            return porter.stem(word)
        except Exception:
            return word

In [36]:
# exemplary run
tokenizer_porter(df.loc[14000, "SentimentText"])

[u'quot',
 u'then',
 u'jesu',
 u'said',
 u'have',
 u'dinner',
 u'with',
 u'me',
 u'quot',
 u'and',
 u'the',
 u'man',
 u'respond',
 u'quot',
 u'first',
 u'send',
 u'me',
 u'your',
 u'resum',
 u'and',
 u'photo',
 u';a;f']

In [37]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/pc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
# storing all the stopwords in an array.
from nltk.corpus import stopwords
stop = stopwords.words('english')

## Training a supervised learning classifier

In [39]:
# Prepare dataset to be operated upon by GridSearchCV
X_train = df.loc[:50000, "SentimentText"].values
y_train = df.loc[:50000, "Sentiment"].values
X_test = df.loc[50000:, "SentimentText"].values
y_test = df.loc[50000:, "Sentiment"].values

In [46]:
# Using GridSearchCV to find best parameters to use for classifier(SGDClassifier)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [47]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   54.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  5.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x7fe3fb854230>, <function tokenizer_porter at 0x7fe3fb854488>], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0], 'vect__stop_words': [[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves...e3fb854488>], 'vect__use_idf': [False], 'clf__C': [1.0, 10.0, 100.0], 'clf__penalty': ['l1', 'l2']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
    

In [48]:
print('Best parameter set: %s '% gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f'%gs_lr_tfidf.best_score_)

Best parameter set: {'vect__ngram_range': (1, 1), 'vect__tokenizer': <function tokenizer at 0x7fe3fb854230>, 'clf__penalty': 'l2', 'clf__C': 1.0, 'vect__stop_words': None} 
CV Accuracy: 0.735


In [49]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Test Accuracy: 0.760


In [51]:
'''
serialize the classifier as a pickle file
'''

import pickle
import os

dest = os.path.join('tweetclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
    
# we serialize our stopwords so that we do not have to install NLTK on our servers    
pickle.dump(stop,
           open(os.path.join(dest, 'stopwords.pkl'), 'wb')
           )

pickle.dump(clf,
            open(os.path.join(dest, 'classifier.pkl'), 'wb')
           )

In [3]:
import os

import pickle
import re

clf = pickle.load(open(os.path.join('tweetclassifier' , 'pkl_objects', 'classifier.pkl'), 'rb'))

In [4]:
import numpy as np
label = {0:'negative', 1:'positive'}

# examplery
example = ['Drop the beat and go wild.']
print('Prediction: {}\nProbability: {:.2f}'.format(label[clf.predict(example)[0]], clf.predict_proba(example).max()*100))

Prediction: positive
Probability: 59.83


## Loading classifier and the dataset to be tagged

In [5]:
import os
import numpy as np
import pandas as pd
import pickle
import re

clf = pickle.load(open(os.path.join('tweetclassifier' , 'pkl_objects', 'classifier.pkl'), 'rb'))

data = open("twitterData.csv", "rb")
df = pd.read_csv(data, error_bad_lines=False, usecols=['Tweets', 'sentiment'], encoding='utf-8')
print(df.head())

                                              Tweets sentiment
0  is taking this day to relax and catch up on sl...       NaN
1  Went to chiro today...very sore now, but know ...       NaN
2  Updated the iPhone 2G to OS 3.0. Feels zippier...       NaN
3  I am SOOO hyped for this weekend. Going to Va ...       NaN
4  Was feeling down earlier but two wonderful fri...       NaN


In [6]:
'''
apply processor to all tweets to get clean data such that prediction can be made on it.
'''
df['Tweets'] = df['Tweets'].apply(preprocessor)

In [7]:
'''
exemplary testing
'''
df['Tweets'][:5]

0    is taking this day to relax and catch up on sl...
1    went to chiro today very sore now but know i w...
2    updated the iphone 2g to os 3 0 feels zippier ...
3    i am sooo hyped for this weekend going to va f...
4    was feeling down earlier but two wonderful fri...
Name: Tweets, dtype: object

In [8]:
df.head()

Unnamed: 0,Tweets,sentiment
0,is taking this day to relax and catch up on sl...,
1,went to chiro today very sore now but know i w...,
2,updated the iphone 2g to os 3 0 feels zippier ...,
3,i am sooo hyped for this weekend going to va f...,
4,was feeling down earlier but two wonderful fri...,


## Apply classifier on tweets and predict sentiment

In [9]:
'''
Tag the tweets using classifier 
and print the cases that raise exception.
'''

for i in range(len(df['Tweets'])):
    try:
        df['sentiment'][i] = clf.predict([df['Tweets'][i]])[0]
    except:
        print("{}\n".format(i))
        pass

5092

20802

27992

48738

78397

78524

80309

101148

107325

109572

126462

137404

140912

142019

155004

164791

166199

168287

169541

169796

176450

178049

186489

186990

187645

194024

199346

202838

204161

211643

215697

220241

226685

230714

230789

233830

240563

241102

244554

244741

245161

248908

249226

251753

256517

256728

261710

268424

274864

275269

282820

284716

284767

289213

291719

298847

306356

309581

309647

310244

310261

312130

313926

314150

314405

317075

318068

326584

329149

336283

352413

354793

355887

357052

357728



In [49]:
'''
for tweets on following indices,
classifier raised exeptions.
'''
faulty = [5092,20802,27992,48738,78397,78524,80309,101148,107325,109572,126462,137404,140912,142019,155004,164791,166199,168287,169541,169796,176450,178049,186489,186990,187645,194024,199346,202838,204161,211643,215697,220241,226685,230714,230789,233830,240563,241102,244554,244741,245161,248908,249226,251753,256517,256728,261710,268424,274864,275269,282820,284716,284767,289213,291719,298847,306356,309581,309647,310244,310261,312130,313926,314150,314405,317075,318068,326584,329149,336283,352413,354793,355887,357052,357728]

In [50]:
'''
make all sentiment None for all faulty tweets.
'''
for i in faulty:
    df['sentiment'][i] = None

In [53]:
df.head()

Unnamed: 0,Tweets,sentiment
0,is taking this day to relax and catch up on sl...,1
1,went to chiro today very sore now but know i w...,0
2,updated the iphone 2g to os 3 0 feels zippier ...,0
3,i am sooo hyped for this weekend going to va f...,1
4,was feeling down earlier but two wonderful fri...,1


In [52]:
# exemplary check
print("{} --------> {}".format(df.loc[5092,'Tweets'], df.loc[5092,'sentiment']))

nan --------> None


## Creating processed and tagged dataset

In [54]:
'''
reload the raw dataset file to get a fresh dataframe
'''
datafile = open("twitterData.csv", "rb")
new_df = pd.read_csv(datafile, error_bad_lines=False, usecols=['Date', 'Name', 'Tweets', 'sentiment'], encoding='utf-8')

In [55]:
'''
update the old columns with
-> new processed tweets,
and
-> corresponding sentiment.
'''
new_df['Tweets'] = df['Tweets']
new_df['sentiment'] = df['sentiment']

In [57]:
'''
write the dataframe to a new csv file.
'''
new_df.to_csv('TaggedData.csv', columns=['Date', 'Name', 'Tweets', 'sentiment'], encoding='utf-8')