In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['train.tsv', 'test.tsv', 'sampleSubmission.csv']


In [2]:
data = pd.read_csv('../input/train.tsv',sep='\t')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
PhraseId      156060 non-null int64
SentenceId    156060 non-null int64
Phrase        156060 non-null object
Sentiment     156060 non-null int64
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [4]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


 The sentiment labels are:

0 - negative; 
1 - somewhat negative; 
2 - neutral; 
3 - somewhat positive; 
4 - positive

In [5]:
print(data.iloc[0]['Phrase'],'Sentiment - ',data.iloc[0]['Sentiment'])

A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . Sentiment -  1


The part of the sentence above which says 'but none of which amounts to much of a story' corresponds to a neagtive sentiment which is correctly indicated in the sentiment label '1'. Hence if we remove this phrase from the sentence as is done below, we get a neutral sentiment as the label!

In [6]:
print(data.iloc[1]['Phrase'],'Sentiment - ',data.iloc[1]['Sentiment'])

A series of escapades demonstrating the adage that what is good for the goose Sentiment -  2


Addition of a word like 'for' has shifted the sentiment from negative to neutral as shown below. Hence it is not recommended to use stopwords filtering here as we are not analysing full messages here, but phrases from the same sentence. A stopword like 'but' or 'not' can really alter the sentiment and hence filtering out them will be counterproductive.

In [7]:
print(data.iloc[32]['Phrase'],'Sentiment - ',data.iloc[32]['Sentiment'])
print('\n')
print(data.iloc[33]['Phrase'],'Sentiment - ',data.iloc[33]['Sentiment'])

for the gander , some of which occasionally amuses but none of which amounts to much of a story Sentiment -  2


the gander , some of which occasionally amuses but none of which amounts to much of a story Sentiment -  1


Lets clean the phrases by removing punctuation marks and splitting them into a list

In [8]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
from nltk.stem import WordNetLemmatizer
lm = WordNetLemmatizer()

def own_analyser(phrase):
    phrase = phrase.split()
    for i in range(0,len(phrase)):
        k = phrase.pop(0)
        if k not in string.punctuation:
                phrase.append(lm.lemmatize(k).lower())    
    return phrase

In [10]:
data.columns

Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')

In [11]:
X = data['Phrase']
y = data['Sentiment']

In [12]:
from sklearn.model_selection import train_test_split
phrase_train,phrase_test,sentiment_train,sentiment_test = train_test_split(X,y,test_size=0.3)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


Using pipeline feature of sklearn - 

In [14]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('BOW',CountVectorizer(analyzer=own_analyser)),
                    ('tfidf',TfidfTransformer()),
                    ('classifier',MultinomialNB())])

In [15]:
pipeline.fit(phrase_train,sentiment_train)

Pipeline(memory=None,
     steps=[('BOW', CountVectorizer(analyzer=<function own_analyser at 0x7f70d142cf28>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=No...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [16]:
predictions = pipeline.predict(phrase_test)

In [17]:
from sklearn.metrics import classification_report

In [18]:
print(classification_report(sentiment_test,predictions))

              precision    recall  f1-score   support

           0       0.57      0.03      0.06      2144
           1       0.52      0.24      0.33      8290
           2       0.60      0.90      0.72     23968
           3       0.53      0.38      0.44      9736
           4       0.60      0.05      0.09      2680

   micro avg       0.58      0.58      0.58     46818
   macro avg       0.56      0.32      0.33     46818
weighted avg       0.57      0.58      0.53     46818



In [19]:
test_data = pd.read_csv('../input/test.tsv',sep='\t')

In [20]:
test_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [21]:
test_predictions = pipeline.predict(test_data['Phrase'])

In [22]:
phrase_id = test_data['PhraseId'].values

In [23]:
test_predictions.shape

(66292,)

In [24]:
final_answer = pd.DataFrame({'PhraseId':phrase_id,'Sentiment':test_predictions})

In [25]:
final_answer.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,3
4,156065,3


In [26]:
filename = 'Sentiment Analysis - NaiveBayes.csv'

final_answer.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: Sentiment Analysis - NaiveBayes.csv
