### Sentiment Analysis - Train the model

In [1]:
import warnings
warnings.filterwarnings(action = 'ignore')

In [2]:
import pandas as pd
import re
import pickle

In [3]:
cleaned_tweets = pd.read_csv('tweets2.csv')

### Calculate Sentiment scores

In [4]:
from textblob import TextBlob

In [5]:
def get_tweet_sentiment(tweet):
        analysis = TextBlob(tweet)
        if analysis.sentiment.polarity > 0:
            return 'positive'
        elif analysis.sentiment.polarity == 0:
            return 'neutral'
        else:
            return 'negative'

In [6]:
cleaned_tweets['sentiment'] = cleaned_tweets.apply(lambda x: get_tweet_sentiment(x['tweet']),axis = 1)
cleaned_tweets.groupby(cleaned_tweets['sentiment']).count()

Unnamed: 0_level_0,tweet,sentiment
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,1714,1714
neutral,7057,7057
positive,9106,9106


### Term Frequency Analysis

In [7]:
text = cleaned_tweets['tweet']

### Remove Stop words and do frequency analysis

In [8]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
stopwords = ['the','to','a','is','and','for','i','you','in','on','with','of','this','we','have','your','my','are','have',\
             'from','-',':','rt','via','can','when','by','but','+'] + list(ENGLISH_STOP_WORDS)

In [9]:
words = []
for line in text:
    word = line.split()
    words+=word

In [10]:
for term in words:
    if term in stopwords:
        words.remove(term)

In [11]:
from collections import Counter
count = Counter()
allterms = [term for term in words]
count.update(allterms)
print(count.most_common(40)) 

[('oneplus', 8899), ('the', 4536), ('6t', 4343), ('new', 1625), ('purple', 1608), ('thunder', 1484), ('a', 1362), ('#oneplus6t', 1342), ('6', 1265), ('phone', 1193), ('you', 1031), ('win', 1005), ('video', 953), ('this', 943), ('giveaway', 822), ('to', 807), ('best', 783), ('color', 767), ('backpack', 762), ('black', 695), ('free', 685), ('vs', 623), ('i', 615), (':', 595), ('my', 568), ('check', 567), ('and', 557), ('android', 554), ('is', 543), ('with', 541), ('are', 533), ('have', 524), ('your', 523), ('liked', 505), ('we', 499), ('just', 497), ('explorer', 485), ('hands', 476), ('galaxy', 473), ('3', 470)]


In [12]:
regex = re.compile(u'[^\+]')
list1 = [x for x in allterms if regex.match(x)]

In [13]:
regex = re.compile(u'[^\#]')
list2 = [x for x in list1 if regex.match(x)]

In [14]:
regex = re.compile(u'[^0-9]')
list3 = [x for x in list2 if regex.match(x)]

In [15]:
regex = re.compile(u'[^\|]')
list4 = [x for x in list3 if regex.match(x)]

In [16]:
regex = re.compile(u'[^*\+\#*]')
okay_items = [x for x in list4 if regex.match(x)]

### Split the data into train and test at a ratio of 70:30

In [17]:
text, y = cleaned_tweets.tweet, cleaned_tweets.sentiment

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
text_train, text_test, y_train, y_test = train_test_split(text, y, test_size=0.33, random_state=42)

### Explore train and test data

In [20]:
import numpy as np
np.unique(y_train)

array(['negative', 'neutral', 'positive'], dtype=object)

In [21]:
y_train.groupby(y_train,axis=0).count()

sentiment
negative    1176
neutral     4720
positive    6081
Name: sentiment, dtype: int64

In [22]:
np.unique(y_test)

array(['negative', 'neutral', 'positive'], dtype=object)

In [23]:
y_test.groupby(y_test,axis=0).count()

sentiment
negative     538
neutral     2337
positive    3025
Name: sentiment, dtype: int64

### Feature Extraction

#### Rescaling the Data with tf-idf

One of the approaches to extract features from test is to
rescale features by how informative we expect them to be. One of the most common
ways to do this is using the term frequency–inverse document frequency (tf-idf)
method. Let us create a function for the same.

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
def tfidf_extractor(corpus, ngram_range=(1,1)):    
    vectorizer = TfidfVectorizer(min_df=1, 
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    #Productionize features
    pickle.dump(vectorizer.vocabulary_,open("feature.pkl","wb"))
    return vectorizer, features

In [26]:
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(text_train)  
tfidf_test_features = tfidf_vectorizer.transform(text_test) 

In [27]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [28]:
selector = SelectKBest(f_classif, k=min(400,tfidf_train_features.shape[1]))

In [29]:
selector.fit(tfidf_train_features, y_train)
tfidf_train_features = selector.transform(tfidf_train_features)
tfidf_test_features = selector.transform(tfidf_test_features)

In [30]:
test_selector = selector.transform

In [31]:
#Productionize Feature selector

In [32]:
pickle.dump(test_selector,open("selector.pkl","wb"))

In [33]:
tfidf_train_features

<11977x400 sparse matrix of type '<class 'numpy.float64'>'
	with 78551 stored elements in Compressed Sparse Row format>

In [34]:
from sklearn import metrics

In [35]:
def get_metrics(true_labels, predicted_labels):
    print ('Accuracy:', np.round(metrics.accuracy_score(true_labels,predicted_labels),2))
    print ('Precision:', np.round(metrics.precision_score(true_labels,predicted_labels,average='weighted'),2))
    print ('Recall:', np.round(metrics.recall_score(true_labels, predicted_labels,average='weighted'),2))
    print ('F1 Score:', np.round(metrics.f1_score(true_labels, predicted_labels,average='weighted'),2)) 

In [36]:
def train_predict_evaluate_model(name, classifier,train_features, train_labels, test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    #Productionize model
    modelname = name + ".pkl"
    pickle.dump(classifier,open(modelname,"wb"))
    # evaluate model prediction performance   
    get_metrics(true_labels=test_labels, 
                predicted_labels=predictions)
    return predictions

### Multinomial Naive Bayes with tfidf features   

In [37]:
from sklearn.naive_bayes import MultinomialNB

In [38]:
mnb_best = MultinomialNB(alpha = 0.001,fit_prior = True)

In [39]:
mnb_tfidf_predictions = train_predict_evaluate_model(name = "NaiveBayes", classifier=mnb_best,
                                           train_features=tfidf_train_features,
                                           train_labels=y_train,
                                           test_features=tfidf_test_features,
                                           test_labels=y_test)

Accuracy: 0.73
Precision: 0.76
Recall: 0.73
F1 Score: 0.72


### Confusion Matrix - Naive Bayes

In [40]:
cm = metrics.confusion_matrix(y_test, mnb_tfidf_predictions)
pd.DataFrame(cm, index=range(0,3), columns=range(0,3))

Unnamed: 0,0,1,2
0,260,49,229
1,32,1240,1065
2,33,165,2827


### Support Vector Machine

In [41]:
from sklearn.linear_model import SGDClassifier

In [42]:
svm_best = SGDClassifier(loss='hinge', max_iter=100)

In [43]:
svm_tfidf_predictions = train_predict_evaluate_model(name = "SVM", classifier=svm_best,
                                           train_features=tfidf_train_features,
                                           train_labels=y_train,
                                           test_features=tfidf_test_features,
                                           test_labels=y_test)

Accuracy: 0.89
Precision: 0.9
Recall: 0.89
F1 Score: 0.89


### Confusion Matrix - SVM

In [44]:
cm = metrics.confusion_matrix(y_test, svm_tfidf_predictions)
pd.DataFrame(cm, index=range(0,3), columns=range(0,3))

Unnamed: 0,0,1,2
0,313,175,50
1,9,2286,42
2,20,361,2644


### Incorrect Predictions

In [45]:
num = 0
score = ['positive','negative','neutral']
for document, label, predicted_label in zip(text_test, y_test, svm_tfidf_predictions):
    for i in score:
        if label == i and predicted_label != i:
            print ("Actual Label:", label)
            print ("Predicted Label:", predicted_label)
            print ("Review:", re.sub('\n', ' ', document))
    num += 1
    if num == 5:
        break

Actual Label: positive
Predicted Label: neutral
Review: rt :    my favorite #oneplus6t is #thunderpurple ++retweeted the tweet+followed all thr...
Actual Label: positive
Predicted Label: neutral
Review: my favourite #oneplus6t is #mirrorblack  coz it is bold and elegant+subsc...


### Correct Predictions

In [46]:
num = 0
score = ['positive','negative','neutral']
for document, label, predicted_label in zip(text_test, y_test, svm_tfidf_predictions):
    for i in score:
        if label == i and predicted_label == i:
            print ("Actual Label:", label)
            print ("Predicted Label:", predicted_label)
            print ("Review:", re.sub('\n', ' ', document))
    num += 1
    if num == 5:
        break

Actual Label: negative
Predicted Label: negative
Review: how an anime game's root detection led to the discovery of a security vulnerability in phones from lg, oneplus, hua...
Actual Label: positive
Predicted Label: positive
Review: black friday 2018 phone deals: free oneplus 6t now, lg v40 with free $500 tv, moto g6 for $200 - cnet
Actual Label: neutral
Predicted Label: neutral
Review: it is a oneplus 6 - it's in a carbon fibre style case
