In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

%matplotlib inline

# Importing kaggle Twitter cleaned data

In [2]:
twitter = pd.read_csv('./datasets/training_data/twitter-and-reddit-sentimental-analysis-dataset/Twitter_Data.csv')

In [3]:
twitter.head(2)

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0


In [4]:
twitter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
clean_text    162976 non-null object
category      162973 non-null float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [5]:
twitter.head(2)

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0


# Importing kaggle reddit cleaned data

In [6]:
reddit = pd.read_csv('./datasets/training_data/twitter-and-reddit-sentimental-analysis-dataset/Reddit_Data.csv')

In [7]:
reddit.head(2)

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1


In [8]:
reddit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
clean_comment    37149 non-null object
category         37249 non-null int64
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


In [9]:
reddit.head(2)

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1


# Instantiating vader
https://medium.com/analytics-vidhya/simplifying-social-media-sentiment-analysis-using-vader-in-python-f9e6ec6fc52f

In [10]:
analyser = SentimentIntensityAnalyzer()

## Analyzer scores function

In [11]:
def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    print("{:-<40} {}".format(sentence, str(score)))

### Vanilla experiment

In [12]:
analyser.polarity_scores('I am happy right now')

{'neg': 0.0, 'neu': 0.448, 'pos': 0.552, 'compound': 0.5719}

In [13]:
sentiment_analyzer_scores('I am happy right now')

I am happy right now-------------------- {'neg': 0.0, 'neu': 0.448, 'pos': 0.552, 'compound': 0.5719}


In [14]:
sentiment_analyzer_scores('I am Very happy right now')

I am Very happy right now--------------- {'neg': 0.0, 'neu': 0.5, 'pos': 0.5, 'compound': 0.6115}


In [15]:
sentiment_analyzer_scores('I am very happy right now')

I am very happy right now--------------- {'neg': 0.0, 'neu': 0.5, 'pos': 0.5, 'compound': 0.6115}


In [16]:
sentiment_analyzer_scores('I am VERY happy right now')

I am VERY happy right now--------------- {'neg': 0.0, 'neu': 0.458, 'pos': 0.542, 'compound': 0.6933}


In [17]:
sentiment_analyzer_scores('I am VERY happy right now!!!')

I am VERY happy right now!!!------------ {'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.7651}


### Experimenting with emoji

In [18]:
sentiment_analyzer_scores('I am VERY ☹️ right now!!!')

I am VERY ☹️ right now!!!--------------- {'neg': 0.462, 'neu': 0.538, 'pos': 0.0, 'compound': -0.6488}


### Slang

In [19]:
sentiment_analyzer_scores('Just lost all my dollaz')

Just lost all my dollaz----------------- {'neg': 0.365, 'neu': 0.635, 'pos': 0.0, 'compound': -0.3182}


In [20]:
sentiment_analyzer_scores('Just lost all my dollaz, tbh dont really care haha')

Just lost all my dollaz, tbh dont really care haha {'neg': 0.528, 'neu': 0.472, 'pos': 0.0, 'compound': -0.7802}


# Creating vader df for twitter

In [21]:
data = twitter.head()
data

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [22]:
#https://towardsdatascience.com/sentiment-analysis-of-anthem-game-launch-in-python-16be9e5083d2

sentiment = data['clean_text'].apply(lambda x: analyser.polarity_scores(x))
data = pd.concat([data, sentiment.apply(pd.Series)], 1)

In [23]:
data

Unnamed: 0,clean_text,category,neg,neu,pos,compound
0,when modi promised “minimum government maximum...,-1.0,0.065,0.781,0.154,0.5267
1,talk all the nonsense and continue all the dra...,0.0,0.184,0.816,0.0,-0.4019
2,what did just say vote for modi welcome bjp t...,1.0,0.0,0.772,0.228,0.7096
3,asking his supporters prefix chowkidar their n...,1.0,0.187,0.655,0.158,-0.0713
4,answer who among these the most powerful world...,1.0,0.0,0.808,0.192,0.4754


In [24]:
twitter['clean_text'].head()

0    when modi promised “minimum government maximum...
1    talk all the nonsense and continue all the dra...
2    what did just say vote for modi  welcome bjp t...
3    asking his supporters prefix chowkidar their n...
4    answer who among these the most powerful world...
Name: clean_text, dtype: object

## An error produces when I scale up to the entire twitter corpus. Guessing it has to do with NaNs, dropping them here

In [25]:
twitter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
clean_text    162976 non-null object
category      162973 non-null float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [26]:
twitter.dropna(inplace=True)

## Lambda function to create vader sentiment columns in my twitter df

In [27]:
sentiment = twitter['clean_text'].apply(lambda x: analyser.polarity_scores(x))
twitter = pd.concat([twitter, sentiment.apply(pd.Series)], 1)

In [28]:
twitter

Unnamed: 0,clean_text,category,neg,neu,pos,compound
0,when modi promised “minimum government maximum...,-1.0,0.065,0.781,0.154,0.5267
1,talk all the nonsense and continue all the dra...,0.0,0.184,0.816,0.000,-0.4019
2,what did just say vote for modi welcome bjp t...,1.0,0.000,0.772,0.228,0.7096
3,asking his supporters prefix chowkidar their n...,1.0,0.187,0.655,0.158,-0.0713
4,answer who among these the most powerful world...,1.0,0.000,0.808,0.192,0.4754
...,...,...,...,...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0,0.081,0.919,0.000,-0.1280
162976,dear rss terrorist payal gawar what about modi...,-1.0,0.398,0.491,0.111,-0.9571
162977,did you cover her interaction forum where she ...,0.0,0.000,1.000,0.000,0.0000
162978,there big project came into india modi dream p...,0.0,0.000,0.889,0.111,0.1280


In [29]:
twitter.describe()

Unnamed: 0,category,neg,neu,pos,compound
count,162969.0,162969.0,162969.0,162969.0,162969.0
mean,0.225442,0.092051,0.770576,0.137348,0.092059
std,0.781279,0.122334,0.169099,0.147333,0.492493
min,-1.0,0.0,0.0,0.0,-0.9944
25%,0.0,0.0,0.659,0.0,-0.2598
50%,0.0,0.0,0.774,0.11,0.0
75%,1.0,0.159,0.899,0.219,0.4939
max,1.0,1.0,1.0,1.0,0.9927


# Comparing vader results to test scores from Twitter kaggle (y actual)

In [30]:
twitter.head(1)

Unnamed: 0,clean_text,category,neg,neu,pos,compound
0,when modi promised “minimum government maximum...,-1.0,0.065,0.781,0.154,0.5267


In [31]:
twitter['vader_correct'] = 2
twitter.head(1)

Unnamed: 0,clean_text,category,neg,neu,pos,compound,vader_correct
0,when modi promised “minimum government maximum...,-1.0,0.065,0.781,0.154,0.5267,2


In [32]:
conditions = [
    (twitter['category'] == 1) & (twitter['compound'] >= 0.05) |
    (twitter['category'] == 0) & (twitter['compound'] < 0.05) & (twitter['compound'] > -0.05) |
    (twitter['category'] == -1) & (twitter['compound'] <= -0.05)
]

In [34]:
twitter[conditions[0]]

Unnamed: 0,clean_text,category,neg,neu,pos,compound,vader_correct
2,what did just say vote for modi welcome bjp t...,1.0,0.000,0.772,0.228,0.7096,2
4,answer who among these the most powerful world...,1.0,0.000,0.808,0.192,0.4754,2
5,kiya tho refresh maarkefir comment karo,0.0,0.000,1.000,0.000,0.0000,2
8,with upcoming election india saga going import...,1.0,0.000,0.927,0.073,0.2023,2
10,things like demonetisation gst goods and servi...,1.0,0.000,0.875,0.125,0.6124,2
...,...,...,...,...,...,...,...
162971,congress veteran sudhakar reddy joins bjp afte...,0.0,0.000,1.000,0.000,0.0000,2
162972,engine growth modi unveils indias first 12000 ...,1.0,0.000,0.776,0.224,0.3818,2
162975,why these 456 crores paid neerav modi not reco...,-1.0,0.081,0.919,0.000,-0.1280,2
162976,dear rss terrorist payal gawar what about modi...,-1.0,0.398,0.491,0.111,-0.9571,2


In [35]:
twitter.loc[conditions[0], 'vader_correct'] = 1
twitter.head()

Unnamed: 0,clean_text,category,neg,neu,pos,compound,vader_correct
0,when modi promised “minimum government maximum...,-1.0,0.065,0.781,0.154,0.5267,2
1,talk all the nonsense and continue all the dra...,0.0,0.184,0.816,0.0,-0.4019,2
2,what did just say vote for modi welcome bjp t...,1.0,0.0,0.772,0.228,0.7096,1
3,asking his supporters prefix chowkidar their n...,1.0,0.187,0.655,0.158,-0.0713,2
4,answer who among these the most powerful world...,1.0,0.0,0.808,0.192,0.4754,1


In [36]:
twitter.loc[~conditions[0], 'vader_correct'] = 0

In [37]:
twitter

Unnamed: 0,clean_text,category,neg,neu,pos,compound,vader_correct
0,when modi promised “minimum government maximum...,-1.0,0.065,0.781,0.154,0.5267,0
1,talk all the nonsense and continue all the dra...,0.0,0.184,0.816,0.000,-0.4019,0
2,what did just say vote for modi welcome bjp t...,1.0,0.000,0.772,0.228,0.7096,1
3,asking his supporters prefix chowkidar their n...,1.0,0.187,0.655,0.158,-0.0713,0
4,answer who among these the most powerful world...,1.0,0.000,0.808,0.192,0.4754,1
...,...,...,...,...,...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0,0.081,0.919,0.000,-0.1280,1
162976,dear rss terrorist payal gawar what about modi...,-1.0,0.398,0.491,0.111,-0.9571,1
162977,did you cover her interaction forum where she ...,0.0,0.000,1.000,0.000,0.0000,1
162978,there big project came into india modi dream p...,0.0,0.000,0.889,0.111,0.1280,0


## Questionable results at best, only 57% correct, however this could be due to the topic is largely based on Indian media topics

In [39]:
twitter.vader_correct.value_counts(normalize=True)

1    0.567746
0    0.432254
Name: vader_correct, dtype: float64

# Comparing vader results to test scores from Reddit kaggle (y actual)

In [41]:
reddit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
clean_comment    37149 non-null object
category         37249 non-null int64
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


In [42]:
reddit.dropna(inplace=True)
reddit

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
37244,jesus,0
37245,kya bhai pure saal chutiya banaya modi aur jab...,1
37246,downvote karna tha par upvote hogaya,0
37247,haha nice,1


In [44]:
sentiment = reddit['clean_comment'].apply(lambda x: analyser.polarity_scores(x))
reddit = pd.concat([reddit, sentiment.apply(pd.Series)], 1)

In [45]:
reddit.head(2)

Unnamed: 0,clean_comment,category,neg,neu,pos,compound
0,family mormon have never tried explain them t...,1,0.067,0.594,0.339,0.9349
1,buddhism has very much lot compatible with chr...,1,0.066,0.653,0.28,0.9953


In [46]:
conditions = [
    (reddit['category'] == 1) & (reddit['compound'] >= 0.05) |
    (reddit['category'] == 0) & (reddit['compound'] < 0.05) & (reddit['compound'] > -0.05) |
    (reddit['category'] == -1) & (reddit['compound'] <= -0.05)
]

In [47]:
reddit[conditions[0]]

Unnamed: 0,clean_comment,category,neg,neu,pos,compound
0,family mormon have never tried explain them t...,1,0.067,0.594,0.339,0.9349
1,buddhism has very much lot compatible with chr...,1,0.066,0.653,0.280,0.9953
4,for your own benefit you may want read living ...,1,0.102,0.704,0.194,0.8907
6,was teens when discovered zen meditation was ...,1,0.041,0.761,0.197,0.9778
7,jesus was zen meets jew,0,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...
37243,this agree why push make other nations either ...,-1,0.323,0.594,0.083,-0.8074
37244,jesus,0,0.000,1.000,0.000,0.0000
37246,downvote karna tha par upvote hogaya,0,0.000,1.000,0.000,0.0000
37247,haha nice,1,0.000,0.000,1.000,0.7003


In [48]:
reddit.loc[conditions[0], 'vader_correct'] = 1
reddit.head()

Unnamed: 0,clean_comment,category,neg,neu,pos,compound,vader_correct
0,family mormon have never tried explain them t...,1,0.067,0.594,0.339,0.9349,1.0
1,buddhism has very much lot compatible with chr...,1,0.066,0.653,0.28,0.9953,1.0
2,seriously don say thing first all they won get...,-1,0.035,0.825,0.14,0.875,
3,what you have learned yours and only yours wha...,0,0.0,0.956,0.044,0.0772,
4,for your own benefit you may want read living ...,1,0.102,0.704,0.194,0.8907,1.0


In [49]:
reddit.loc[~conditions[0], 'vader_correct'] = 0

In [50]:
reddit

Unnamed: 0,clean_comment,category,neg,neu,pos,compound,vader_correct
0,family mormon have never tried explain them t...,1,0.067,0.594,0.339,0.9349,1.0
1,buddhism has very much lot compatible with chr...,1,0.066,0.653,0.280,0.9953,1.0
2,seriously don say thing first all they won get...,-1,0.035,0.825,0.140,0.8750,0.0
3,what you have learned yours and only yours wha...,0,0.000,0.956,0.044,0.0772,0.0
4,for your own benefit you may want read living ...,1,0.102,0.704,0.194,0.8907,1.0
...,...,...,...,...,...,...,...
37244,jesus,0,0.000,1.000,0.000,0.0000,1.0
37245,kya bhai pure saal chutiya banaya modi aur jab...,1,0.000,1.000,0.000,0.0000,0.0
37246,downvote karna tha par upvote hogaya,0,0.000,1.000,0.000,0.0000,1.0
37247,haha nice,1,0.000,0.000,1.000,0.7003,1.0


## Slightly better performance on reddit with 64% accuracy

In [51]:
reddit.vader_correct.value_counts(normalize=1)

1.0    0.643328
0.0    0.356672
Name: vader_correct, dtype: float64