# Sentiment Analysis using Naive Bayes

In this assignment, we will attempt to label tweets with sentiments (positive, neutral and negative) using a Naive Bayes classifier. Naive Bayes is a very basic approach to this problem, but it sometimes gives surprisingly good accuracy.

**Fill in the Blanks**

## Importing required libraries

In [None]:
import pandas as pd
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

## Reading dataset

In [None]:
# Load the tweets and discard the first column of the CSV
data = pd.read_csv('tweets.csv')
data = data.drop(columns=data.columns[0])
data.head()

Unnamed: 0,tweets,labels
0,Obama has called the GOP budget social Darwini...,1
1,"In his teen years, Obama has been known to use...",0
2,IPA Congratulates President Barack Obama for L...,0
3,RT @Professor_Why: #WhatsRomneyHiding - his co...,0
4,RT @wardollarshome: Obama has approved more ta...,1


## Text processing for the tweets

In [None]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 11.6 MB/s eta 0:00:01
[?25hCollecting click
  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 2.1 MB/s  eta 0:00:01
Collecting regex
  Downloading regex-2020.10.15-cp37-cp37m-manylinux2010_x86_64.whl (662 kB)
[K     |████████████████████████████████| 662 kB 52.3 MB/s eta 0:00:01
[?25hCollecting tqdm
  Downloading tqdm-4.50.2-py2.py3-none-any.whl (70 kB)
[K     |████████████████████████████████| 70 kB 20.1 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Created wheel for nltk: filename=nltk-3.5-py3-none-any.whl size=1434676 sha256=b8b146d9540fdbcbc71187c476e269ab9854bd95fa2d6dcb3b4a9c6649e02d47
  Stored in directory: /home/jovyan/.cache/pip/wheels/45/6c/46/a1865e7ba706b3817f5d1b2ff7ce8996aabdd0d03d47ba0266
Successfully built nltk
Installing collected 

In [None]:
import nltk

# Stop-word lists used to filter the tokens
nltk.download('stopwords')
# Recent NLTK releases load the tokenizer models from 'punkt_tab'; on older
# releases this resource does not exist and the call just reports that and
# returns False.
nltk.download('punkt_tab')
# Tokenizer models for older NLTK releases (kept last so the cell output is unchanged there)
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords 
import numpy as np

# Tokens dropped from every tweet: English stop words, punctuation marks and the
# placeholders that processTweet substitutes for links and user mentions.
# Stored under its own name so that `stopwords` keeps referring to the NLTK
# corpus reader and this cell can be re-run without an AttributeError.
STOP_WORDS = set(stopwords.words('english') + list(punctuation) + ['AT_USER','URL'])


def processTweet(tweet):
    """Clean a single tweet and split it into tokens.

    Parameters
    ----------
    tweet : str or float
        Raw tweet text. A float is treated as missing text
        (presumably NaN from an empty CSV cell - confirm against the data).

    Returns
    -------
    list of str
        Lower-cased tokens with links, user mentions, stop words and
        punctuation removed; an empty list when `tweet` is a float.
    """
    if isinstance(tweet, float):
        return []

    # convert passed tweet to lower case
    tweet = tweet.lower()
    # Raw strings keep the backslashes for the regex engine instead of relying
    # on invalid string escapes such as '\s' and '\.'.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # replace URLs with a placeholder
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # replace usernames with a placeholder
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
    # use word_tokenize imported above to tokenize the tweet
    words = word_tokenize(tweet)
    # The placeholders are in STOP_WORDS, so links and mentions vanish here
    return [word for word in words if word not in STOP_WORDS]
    

## Process all tweets

In [None]:
# Raw text of the tweet stored under index label 0
data.loc[0, 'tweets']

'Obama has called the GOP budget social Darwinism. Nice try, but they believe in social creationism.'

In [None]:
# Clean every tweet with processTweet and join its tokens back into one string
processed = [' '.join(processTweet(tweet)) for tweet in data['tweets']]

In [None]:
# Store the cleaned tweet text next to the raw tweets as a new column
data['processed'] = processed

In [None]:
# Display the cleaned tweets
data.loc[:, 'processed']

0       obama called gop budget social darwinism nice ...
1            teen years obama known use marijuana cocaine
2       ipa congratulates president barack obama leade...
3       rt whatsromneyhiding connection supporters cri...
4       rt obama approved targeted assassinations mode...
                              ...                        
1375    trending idiot .. look tweets lol making fun o...
1376          rt kimkardashiansnextboyfriend barack obama
1377    rt gas 1.92 obama took office ... guess promis...
1378    haha know im smart mean got ta listen obama cu...
1379    obama dictator training passes training course...
Name: processed, Length: 1380, dtype: object

## Create pipeline and define parameters for GridSearch

In [None]:
# Bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Grid explored by GridSearchCV; keys follow the '<step>__<parameter>' convention
tuned_parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (2, 2)],
    tfidf__use_idf=(True, False),
    tfidf__norm=('l1', 'l2'),
    clf__alpha=[1, 0.1, 0.01],
)

## Split data into test and train

In [None]:
# Hold out 20% of the rows for testing; the sampled 80% form the training set
data_copy = data.copy()
train_set = data_copy.sample(frac=0.80, random_state=0)
test_set = data_copy.drop(index=train_set.index)

x_train, y_train = train_set['processed'], train_set['labels']
x_test, y_test = test_set['processed'], test_set['labels']

# Shapes of the full, training and test frames, one per line
print(data_copy.shape, train_set.shape, test_set.shape, sep='\n')

## Perform classification (using GridSearch)

In [None]:
# Run a 10-fold cross-validated grid search over tuned_parameters with the pipeline defined above
clf = GridSearchCV(estimator=text_clf, param_grid=tuned_parameters, cv=10)
clf.fit(x_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf', MultinomialNB())]),
             param_grid={'clf__alpha': [1, 0.1, 0.01],
                         'tfidf__norm': ('l1', 'l2'),
                         'tfidf__use_idf': (True, False),
                         'vect__ngram_range': [(1, 1), (1, 2), (2, 2)]})

## Classification report 

In [None]:
# Tabulate the cross-validation results recorded by the grid search
cv_df = pd.DataFrame(data=clf.cv_results_)
cv_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__alpha,param_tfidf__norm,param_tfidf__use_idf,param_vect__ngram_range,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01883,0.003021,0.002279,0.000212,1.0,l1,True,"(1, 1)","{'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf_...",0.684685,...,0.675676,0.681818,0.681818,0.690909,0.690909,0.690909,0.690909,0.683898,0.006393,31
1,0.034422,0.004197,0.003234,0.000334,1.0,l1,True,"(1, 2)","{'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf_...",0.684685,...,0.675676,0.681818,0.681818,0.690909,0.690909,0.690909,0.690909,0.683898,0.006393,31
2,0.023197,0.000281,0.002583,9.7e-05,1.0,l1,True,"(2, 2)","{'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf_...",0.684685,...,0.675676,0.681818,0.681818,0.690909,0.690909,0.690909,0.690909,0.683898,0.006393,31
3,0.016511,0.001179,0.001974,0.00028,1.0,l1,False,"(1, 1)","{'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf_...",0.684685,...,0.675676,0.681818,0.681818,0.690909,0.690909,0.690909,0.690909,0.683898,0.006393,31
4,0.034694,0.005023,0.003004,0.00037,1.0,l1,False,"(1, 2)","{'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf_...",0.684685,...,0.675676,0.681818,0.681818,0.690909,0.690909,0.690909,0.690909,0.683898,0.006393,31
5,0.024245,0.003889,0.002315,8.7e-05,1.0,l1,False,"(2, 2)","{'clf__alpha': 1, 'tfidf__norm': 'l1', 'tfidf_...",0.684685,...,0.675676,0.681818,0.681818,0.690909,0.690909,0.690909,0.690909,0.683898,0.006393,31
6,0.016537,0.000312,0.002161,9.7e-05,1.0,l2,True,"(1, 1)","{'clf__alpha': 1, 'tfidf__norm': 'l2', 'tfidf_...",0.792793,...,0.81982,0.854545,0.781818,0.827273,0.790909,0.781818,0.818182,0.804373,0.023846,22
7,0.036254,0.004397,0.00364,0.000712,1.0,l2,True,"(1, 2)","{'clf__alpha': 1, 'tfidf__norm': 'l2', 'tfidf_...",0.810811,...,0.828829,0.845455,0.781818,0.827273,0.8,0.772727,0.845455,0.809795,0.02491,21
8,0.023513,0.000721,0.002572,9.5e-05,1.0,l2,True,"(2, 2)","{'clf__alpha': 1, 'tfidf__norm': 'l2', 'tfidf_...",0.810811,...,0.828829,0.845455,0.809091,0.845455,0.827273,0.754545,0.854545,0.818862,0.027405,19
9,0.017355,0.002539,0.002103,0.000347,1.0,l2,False,"(1, 1)","{'clf__alpha': 1, 'tfidf__norm': 'l2', 'tfidf_...",0.792793,...,0.81982,0.845455,0.754545,0.818182,0.781818,0.763636,0.763636,0.788943,0.029295,27


In [None]:
# Predict labels for the held-out tweets with the fitted grid search
preds = clf.predict(X=x_test)

In [None]:
# Score of the fitted grid search on the held-out set
clf.score(X=x_test, y=y_test)

0.8152173913043478

In [None]:
# Per-class precision, recall and F1 on the test set
print(classification_report(y_true=y_test, y_pred=preds, digits=4))

              precision    recall  f1-score   support

           0     0.8565    0.9323    0.8928       192
           1     0.6825    0.6515    0.6667        66
           2     0.7500    0.1667    0.2727        18

    accuracy                         0.8152       276
   macro avg     0.7630    0.5835    0.6107       276
weighted avg     0.8079    0.8152    0.7983       276



## Important:

In [None]:
# Number of tweets per sentiment label
counts = data['labels'].value_counts()
print(counts)

0    947
1    352
2     81
Name: labels, dtype: int64


We can see above that the class distribution is highly imbalanced; this does not give the classifier a good sample of the minority classes. For your learning, try using [SMOTE](https://imbalanced-learn.readthedocs.io/en/stable/api.html) to oversample the minority classes, then evaluate the performance with Naive Bayes and compare it with the results above.

In [None]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
[K     |████████████████████████████████| 167 kB 21.0 MB/s eta 0:00:01
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.7.0 imblearn-0.0


In [None]:
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
# NOTE: this rebinds `Pipeline` to imblearn's implementation, which accepts a
# sampler as an intermediate step.
from imblearn.pipeline import Pipeline

# Same steps as text_clf plus an oversampling step placed before the classifier.
# random_state is fixed so the synthetic samples, and therefore the scores,
# are reproducible across runs.
text_clf_sm = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('sampling', SMOTE(random_state=0)),
                     ('clf', MultinomialNB())])

# NOTE: overwrites the grid used for text_clf; the 'sampling__*' key is only
# valid for text_clf_sm.
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'sampling__sampling_strategy': ['all','minority','auto'],
    'clf__alpha': [1, 1e-1, 1e-2]
}


In [None]:
# Run a 10-fold cross-validated grid search over tuned_parameters with the SMOTE pipeline defined above
clf = GridSearchCV(estimator=text_clf_sm, param_grid=tuned_parameters, cv=10)
clf.fit(x_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('sampling', SMOTE()),
                                       ('clf', MultinomialNB())]),
             param_grid={'clf__alpha': [1, 0.1, 0.01],
                         'sampling__sampling_strategy': ['all', 'minority',
                                                         'auto'],
                         'tfidf__norm': ('l1', 'l2'),
                         'tfidf__use_idf': (True, False),
                         'vect__ngram_range': [(1, 1), (1, 2), (2, 2)]})

In [None]:
# Re-predict with the grid search fitted on the SMOTE pipeline; otherwise
# `preds` still holds the earlier model's predictions and the report is stale.
preds = clf.predict(x_test)
print(classification_report(y_test, preds, digits=4))

              precision    recall  f1-score   support

           0     0.8830    0.8646    0.8737       192
           1     0.6301    0.6970    0.6619        66
           2     0.3333    0.2778    0.3030        18

    accuracy                         0.7862       276
   macro avg     0.6155    0.6131    0.6129       276
weighted avg     0.7867    0.7862    0.7858       276



In [None]:
# Parameter combination selected by the grid search
clf.best_params_

{'clf__alpha': 0.01,
 'sampling__sampling_strategy': 'minority',
 'tfidf__norm': 'l2',
 'tfidf__use_idf': False,
 'vect__ngram_range': (1, 2)}

In [None]:
# pd.set_option('display.max_rows', 200)
# pd.set_option('display.max_columns',50)
# # show the cross-validation results of every parameter combination tried by GridSearch
# cv_df=pd.DataFrame(clf.cv_results_)
# cv_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__alpha,param_sampling__sampling_strategy,param_tfidf__norm,param_tfidf__use_idf,param_vect__ngram_range,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.026138,0.001556,0.002288,0.000219,1.0,all,l1,True,"(1, 1)","{'clf__alpha': 1, 'sampling__sampling_strategy...",0.630631,0.693694,0.720721,0.72973,0.718182,0.7,0.709091,0.609091,0.627273,0.772727,0.691114,0.04963,102
1,0.044274,0.003392,0.003333,0.000321,1.0,all,l1,True,"(1, 2)","{'clf__alpha': 1, 'sampling__sampling_strategy...",0.63964,0.72973,0.747748,0.738739,0.745455,0.690909,0.727273,0.654545,0.654545,0.754545,0.708313,0.041972,93
2,0.033554,0.004397,0.002688,0.000247,1.0,all,l1,True,"(2, 2)","{'clf__alpha': 1, 'sampling__sampling_strategy...",0.765766,0.720721,0.801802,0.765766,0.827273,0.709091,0.809091,0.745455,0.654545,0.790909,0.759042,0.050162,73
3,0.023793,0.000414,0.00184,6.6e-05,1.0,all,l1,False,"(1, 1)","{'clf__alpha': 1, 'sampling__sampling_strategy...",0.657658,0.693694,0.756757,0.711712,0.7,0.654545,0.745455,0.645455,0.663636,0.745455,0.697437,0.039524,99
4,0.042747,0.004109,0.002949,0.000357,1.0,all,l1,False,"(1, 2)","{'clf__alpha': 1, 'sampling__sampling_strategy...",0.693694,0.693694,0.72973,0.756757,0.754545,0.663636,0.754545,0.718182,0.672727,0.754545,0.719206,0.034378,87
5,0.03374,0.003464,0.002541,0.000494,1.0,all,l1,False,"(2, 2)","{'clf__alpha': 1, 'sampling__sampling_strategy...",0.72973,0.711712,0.774775,0.765766,0.8,0.718182,0.8,0.727273,0.7,0.754545,0.748198,0.034288,78
6,0.027157,0.002908,0.002311,0.000158,1.0,all,l2,True,"(1, 1)","{'clf__alpha': 1, 'sampling__sampling_strategy...",0.630631,0.801802,0.738739,0.765766,0.718182,0.7,0.736364,0.636364,0.663636,0.781818,0.71733,0.056318,89
7,0.047779,0.004738,0.003617,0.000562,1.0,all,l2,True,"(1, 2)","{'clf__alpha': 1, 'sampling__sampling_strategy...",0.648649,0.792793,0.738739,0.756757,0.709091,0.681818,0.790909,0.654545,0.681818,0.754545,0.720966,0.050619,86
8,0.035004,0.006281,0.002866,0.000493,1.0,all,l2,True,"(2, 2)","{'clf__alpha': 1, 'sampling__sampling_strategy...",0.72973,0.810811,0.792793,0.747748,0.818182,0.718182,0.827273,0.736364,0.690909,0.790909,0.76629,0.045028,65
9,0.025082,0.0024,0.002042,0.000332,1.0,all,l2,False,"(1, 1)","{'clf__alpha': 1, 'sampling__sampling_strategy...",0.666667,0.72973,0.711712,0.738739,0.7,0.672727,0.718182,0.609091,0.645455,0.745455,0.693776,0.041983,101
