# Sentiment Analysis

In [28]:
# import packages
import pandas as pd
import multiprocessing as mp
import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [29]:
# import data
data = pd.read_csv('clean_data.csv', index_col = 0)
data.head()
type(data['body'])
data["body"] = data["body"].astype(str)
print(type(data['body']))
data['body'].to_string()
print(type(data['body']))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


```Python
# initialize sentiment classifier
sid = sid()

# get sentiment scores from data
sentiment = data['body'].apply(SID.polarity_scores)

# convert sentiment series into dataframe (each sentiment value gets its own column)
sentiment = pd.DataFrame(sentiment.tolist())

# merge data and sentiment df into one
data = data.merge(sentiment, how = 'left', left_index = True, right_index = True)

# delete sentiment as its info is in data
del sentiment
```

In [30]:
# initialize sentiment classifier
sid = SentimentIntensityAnalyzer()

# get sentiment scores from data
sentiment = data['body'].apply(sid.polarity_scores)

# convert sentiment series into dataframe (each sentiment value gets its own column)
sentiment = pd.DataFrame(sentiment.tolist())

# merge data and sentiment df into one
data = data.merge(sentiment, how = 'left', left_index = True, right_index = True)

# delete sentiment as its info is in data
del sentiment

In [31]:
data.head()

Unnamed: 0_level_0,created_utc,author,author_fullname,body,subreddit,subreddit_id,send_replies,no_follow,DateTime,Dates,Time,neg,neu,pos,compound
...1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.0,1614142000.0,ckh27,t2_6ga8asa9,No clue,ethereum,t5_2zf9m,True,True,2021-02-24 04:40:07,2021-02-24,04:40:07,0.688,0.312,0.0,-0.296
1.0,1614142000.0,Barmelo_Xanthony,t2_xr0uc,Yeah but those interesting and complex things ...,ethereum,t5_2zf9m,True,True,2021-02-24 04:39:26,2021-02-24,04:39:26,0.035,0.71,0.255,0.8957
2.0,1614142000.0,Archetypical3,t2_a1rblycq,That makes sense! Im patient,ethereum,t5_2zf9m,True,True,2021-02-24 04:39:11,2021-02-24,04:39:11,0.0,1.0,0.0,0.0
3.0,1614142000.0,Hanzburger,t2_wafj0,Probably best to reach out to Ledger to make s...,ethereum,t5_2zf9m,True,True,2021-02-24 04:38:59,2021-02-24,04:38:59,0.0,0.714,0.286,0.765
4.0,1614141000.0,ckh27,t2_6ga8asa9,ETH will NOT effect your coins. It is not a f...,ethereum,t5_2zf9m,True,True,2021-02-24 04:37:27,2021-02-24,04:37:27,0.08,0.87,0.05,-0.4215


In [6]:
data.head()

Unnamed: 0,author,subreddit,created_utc,score,controversiality,body,compound,neg,neu,pos
0,CryptoHODLer101,AMA,1513292107,0,0,No BTC is not fiat. I get paid in Bitcoin. As ...,0.3818,0.085,0.731,0.185
1,nappiestapparatus,AMA,1395194709,0,0,I think you're overestimating your hashing pow...,-0.2263,0.181,0.724,0.095
2,Skating2Death,AMA,1390016247,0,0,"I've heard of it, but with the high volatility...",-0.7684,0.142,0.837,0.021
3,OmarJunkman,AMA,1528300155,0,0,Do you have any bitcoin?,0.0,0.0,1.0,0.0
4,evanc1411,ASU,1525380807,0,1,"BTC -? Damn, when you buy a Bitcoin you're ...",-0.1779,0.213,0.63,0.157


## categorize sentiment data into sentiment categories

In [32]:
# create function to categorize compound sentiment score
# 0.05 threshold recommended on VADER documentation
# you should read the comments, read its compound score, and determine your own cuttoffs
def categorize_sentiment(x):
    if x >= 0.05:
        return 'positive_comment'
    elif 0.05 > x > -0.05:
        return 'neutral_comment'
    elif -0.05 >= x:
        return 'negative_comment'

In [33]:
# apply function categorize_sentiment to ['compound']
data['sentiment'] = data['compound'].apply(categorize_sentiment)

data.head()

Unnamed: 0_level_0,created_utc,author,author_fullname,body,subreddit,subreddit_id,send_replies,no_follow,DateTime,Dates,Time,neg,neu,pos,compound,sentiment
...1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0.0,1614142000.0,ckh27,t2_6ga8asa9,No clue,ethereum,t5_2zf9m,True,True,2021-02-24 04:40:07,2021-02-24,04:40:07,0.688,0.312,0.0,-0.296,negative_comment
1.0,1614142000.0,Barmelo_Xanthony,t2_xr0uc,Yeah but those interesting and complex things ...,ethereum,t5_2zf9m,True,True,2021-02-24 04:39:26,2021-02-24,04:39:26,0.035,0.71,0.255,0.8957,positive_comment
2.0,1614142000.0,Archetypical3,t2_a1rblycq,That makes sense! Im patient,ethereum,t5_2zf9m,True,True,2021-02-24 04:39:11,2021-02-24,04:39:11,0.0,1.0,0.0,0.0,neutral_comment
3.0,1614142000.0,Hanzburger,t2_wafj0,Probably best to reach out to Ledger to make s...,ethereum,t5_2zf9m,True,True,2021-02-24 04:38:59,2021-02-24,04:38:59,0.0,0.714,0.286,0.765,positive_comment
4.0,1614141000.0,ckh27,t2_6ga8asa9,ETH will NOT effect your coins. It is not a f...,ethereum,t5_2zf9m,True,True,2021-02-24 04:37:27,2021-02-24,04:37:27,0.08,0.87,0.05,-0.4215,negative_comment


In [34]:
# convert ['sentiment'] to categorical data type
data['sentiment'] = pd.Categorical(data['sentiment'])

#sentiment should be category
data.dtypes

created_utc         float64
author               object
author_fullname      object
body                 object
subreddit            object
subreddit_id         object
send_replies           bool
no_follow              bool
DateTime             object
Dates                object
Time                 object
neg                 float64
neu                 float64
pos                 float64
compound            float64
sentiment          category
dtype: object

In [35]:
# convert ['sentiment'] categories to binary variables in new df
binary_sentiment = data['sentiment'].str.get_dummies()
binary_sentiment.head()

Unnamed: 0_level_0,negative_comment,neutral_comment,positive_comment
...1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,1,0,0
1.0,0,0,1
2.0,0,1,0
3.0,0,0,1
4.0,1,0,0


In [25]:
# count of how many of each category were classified
binary_sentiment.sum()

negative_comment    21762
neutral_comment     26146
positive_comment    49477
dtype: int64

In [36]:
# merge binary_sentiment with data 
data = data.merge(binary_sentiment, how = 'left', left_index = True, right_index = True)
data.head()

Unnamed: 0_level_0,created_utc,author,author_fullname,body,subreddit,subreddit_id,send_replies,no_follow,DateTime,Dates,Time,neg,neu,pos,compound,sentiment,negative_comment,neutral_comment,positive_comment
...1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0.0,1614142000.0,ckh27,t2_6ga8asa9,No clue,ethereum,t5_2zf9m,True,True,2021-02-24 04:40:07,2021-02-24,04:40:07,0.688,0.312,0.0,-0.296,negative_comment,1,0,0
1.0,1614142000.0,Barmelo_Xanthony,t2_xr0uc,Yeah but those interesting and complex things ...,ethereum,t5_2zf9m,True,True,2021-02-24 04:39:26,2021-02-24,04:39:26,0.035,0.71,0.255,0.8957,positive_comment,0,0,1
2.0,1614142000.0,Archetypical3,t2_a1rblycq,That makes sense! Im patient,ethereum,t5_2zf9m,True,True,2021-02-24 04:39:11,2021-02-24,04:39:11,0.0,1.0,0.0,0.0,neutral_comment,0,1,0
3.0,1614142000.0,Hanzburger,t2_wafj0,Probably best to reach out to Ledger to make s...,ethereum,t5_2zf9m,True,True,2021-02-24 04:38:59,2021-02-24,04:38:59,0.0,0.714,0.286,0.765,positive_comment,0,0,1
4.0,1614141000.0,ckh27,t2_6ga8asa9,ETH will NOT effect your coins. It is not a f...,ethereum,t5_2zf9m,True,True,2021-02-24 04:37:27,2021-02-24,04:37:27,0.08,0.87,0.05,-0.4215,negative_comment,1,0,0


In [27]:
# delete redundant variables
del data['pos']
del data['neg']
del data['neu']
del data['compound']
del data['body']
del data['sentiment']
del binary_sentiment

KeyError: 'pos'

In [37]:
# export data
data.to_csv('sentiment_data.csv')