# Sentiment analysis

## Imports and setup

In [1]:
# Data handling (numpy / pandas), serialisation (pickle) and text utilities
# (contractions, fasttext, string)
import numpy as np
import pandas as pd
import pickle
import contractions
import fasttext
import string
# Replace fastText's `eprint` helper with a no-op.
# NOTE(review): presumably this silences the warning fastText prints when a
# model is loaded -- confirm against the installed fasttext version.
fasttext.FastText.eprint = lambda x: None


# NLTK: tokenizer, corpora and the VADER sentiment analyzer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
# One shared VADER analyzer instance, used by the scoring cells below
analyzer = SIA()

----

## Sentiment analysis of 2020 pinkbike comments

In [2]:
# Allow up to 10000 characters per column when a frame is displayed
pd.set_option("display.max_colwidth", 10000)

# Load the raw 2020 comments, discard the "id" column, remove every row that
# still contains a missing value, then keep the four columns used downstream.
df = (
    pd.read_csv('src/2020_pinkbike_comments.csv')
    .drop(columns = "id")
    .dropna(axis = 0)
    .loc[:, ["comment_publishing_date", "comment_upvotes", "comment_downvotes", "comment_content"]]
)

### Text preprocessing and sentiment scoring

In [3]:
# Expand contractions word by word (e.g. "don't" -> "do not"). Splitting on
# whitespace and re-joining with single spaces also collapses newlines, so
# every comment reaches the language model as a single line.
expanded = df["comment_content"].apply(
    lambda text: " ".join(str(contractions.fix(word)) for word in text.split())
)


# Detect language and keep English comments only
pretrained_model = "src/lid.176.bin"
model = fasttext.load_model(pretrained_model)

# `predict` returns (labels, probabilities) with labels such as "__label__en".
# Strip the prefix from the top label instead of slicing the repr of the
# tuple, and map an empty prediction to "" so that such comments are treated
# as non-English and dropped.
langs = []
for sent in expanded:
    labels = model.predict(sent)[0]
    langs.append(labels[0].replace("__label__", "", 1) if len(labels) > 0 else "")
is_english = pd.Series(langs, index = expanded.index, dtype = "object") == "en"

# Keep the English rows; the raw text column is replaced by "comments" below
df = df.loc[is_english].drop(columns = "comment_content")


# Remove URLs from the expanded text and store the result as "comments"
# NOTE(review): both patterns are unanchored, so `www\S+` also trims inside
# ordinary words such as "awwww"; anchor them with `\b` if that is unintended.
df["comments"] = (
    expanded.loc[is_english]
    .replace(r'http\S+', '', regex = True)
    .replace(r'www\S+', '', regex = True)
)


# Vader sentiment analysis: one row of neg / neu / pos / compound scores per
# comment. The explicit column list keeps the frame well-formed even when no
# comment survived the language filter.
df_sentiments = pd.DataFrame(
    [analyzer.polarity_scores(text) for text in df["comments"]],
    columns = ["neg", "neu", "pos", "compound"],
)

# Merge the scores back positionally; the index is reset first so that both
# frames share the same 0..n-1 labels.
df = pd.concat([df.reset_index(drop = True), df_sentiments], axis = 1)

# Convert the compound score into a binary Positive / Negative label.
# NOTE(review): a compound of exactly 0 (neutral) is labelled 'Positive'.
# VADER's README suggests +/-0.05 cut-offs with a neutral band in between;
# the binary rule is kept so the saved labels do not change.
df['sentiment'] = np.where(df['compound'] >= 0 , 'Positive','Negative')

df.head(10)

Unnamed: 0,comment_publishing_date,comment_upvotes,comment_downvotes,comments,neg,neu,pos,compound,sentiment
0,1580468520,453,4,"My level is I can rebuild a shock, install a dropper or build a full bike, but I also pay $200 for new lowers because I stripped out the threads while installing a brake caliper. ¯\_(ツ)_/¯",0.165,0.835,0.0,-0.4854,Negative
1,1580468940,95,0,Sounds just like me. I will build a bike or do a lower leg service no probs then cross thread a pedal fitting after a couple too many beers,0.158,0.753,0.09,-0.2263,Negative
2,1580469240,45,52,"I have come to realize by the time you replace a couple parts you broke with diy, it ends up costing the same to pay a pro as it costs to do yourself.",0.13,0.87,0.0,-0.4939,Negative
3,1580469480,195,1,Cross threads are tight threads!,0.0,1.0,0.0,0.0,Positive
4,1580469960,50,4,You did not have to spend $ 200 because there are threaded inserts. They cost much less.,0.0,1.0,0.0,0.0,Positive
5,1580470080,4,2,@iantmcg: yep. Lesson learned for me too,0.0,0.732,0.268,0.296,Positive
6,1580470140,65,4,Heli-Coil... better than new.,0.0,0.508,0.492,0.4404,Positive
7,1580471280,18,0,@Chilliwacker: sure blame the beer lol!!!,0.211,0.264,0.525,0.5538,Positive
8,1580472360,216,11,"In 2006 I got the hottest fork of the time 66RC2X, I changed the oil because a friend told me to not trust factory state. I carefully dealt with super fragile internals requiring some skill and finesse, only to put it together and cut the steerer too short...",0.092,0.744,0.164,0.5856,Positive
9,1580472600,13,22,"@Jahtaka: Yeah, but threaded inserts are crap most of the time and you don`t have enough matter to put any.Sincerely, instead of putting 200bucks into new lowers, and knowing that you don`t have to dismantle your breaks everyday, I would have used silicon to glue that fvcking bolt in these damaged threads and that was it. Russian way",0.141,0.816,0.043,-0.8047,Negative


### Save as clean PKL-File

In [5]:
# Columns written to the cleaned 2020 data set, in output order
output_columns = [
    "comment_publishing_date",
    "comment_upvotes",
    "comment_downvotes",
    "comments",
    "neg",
    "neu",
    "pos",
    "compound",
    "sentiment",
]
df_clean = df[output_columns]

# Serialise the cleaned frame with pickle
with open('src/2020_clean_pinkbike_comments.pkl', mode = 'wb') as out_file:
    pickle.dump(df_clean, out_file)


--------


## Sentiment analysis of 2019 pinkbike comments

In [6]:
# Allow up to 10000 characters per column when a frame is displayed
pd.set_option("display.max_colwidth", 10000)

# Load the raw 2019 comments, discard the "id" column, remove every row that
# still contains a missing value, then keep the four columns used downstream.
df = (
    pd.read_csv('src/2019_pinkbike_comments.csv')
    .drop(columns = "id")
    .dropna(axis = 0)
    .loc[:, ["comment_publishing_date", "comment_upvotes", "comment_downvotes", "comment_content"]]
)

### Text preprocessing and sentiment scoring

In [7]:
# Expand contractions word by word (e.g. "don't" -> "do not"). Splitting on
# whitespace and re-joining with single spaces also collapses newlines, so
# every comment reaches the language model as a single line.
expanded = df["comment_content"].apply(
    lambda text: " ".join(str(contractions.fix(word)) for word in text.split())
)


# Detect language and keep English comments only
pretrained_model = "src/lid.176.bin"
model = fasttext.load_model(pretrained_model)

# `predict` returns (labels, probabilities) with labels such as "__label__en".
# Strip the prefix from the top label instead of slicing the repr of the
# tuple, and map an empty prediction to "" so that such comments are treated
# as non-English and dropped.
langs = []
for sent in expanded:
    labels = model.predict(sent)[0]
    langs.append(labels[0].replace("__label__", "", 1) if len(labels) > 0 else "")
is_english = pd.Series(langs, index = expanded.index, dtype = "object") == "en"

# Keep the English rows; the raw text column is replaced by "comments" below
df = df.loc[is_english].drop(columns = "comment_content")


# Remove URLs from the expanded text and store the result as "comments"
# NOTE(review): both patterns are unanchored, so `www\S+` also trims inside
# ordinary words such as "awwww"; anchor them with `\b` if that is unintended.
df["comments"] = (
    expanded.loc[is_english]
    .replace(r'http\S+', '', regex = True)
    .replace(r'www\S+', '', regex = True)
)


# Vader sentiment analysis: one row of neg / neu / pos / compound scores per
# comment. The explicit column list keeps the frame well-formed even when no
# comment survived the language filter.
df_sentiments = pd.DataFrame(
    [analyzer.polarity_scores(text) for text in df["comments"]],
    columns = ["neg", "neu", "pos", "compound"],
)

# Merge the scores back positionally; the index is reset first so that both
# frames share the same 0..n-1 labels.
df = pd.concat([df.reset_index(drop = True), df_sentiments], axis = 1)

# Convert the compound score into a binary Positive / Negative label.
# NOTE(review): a compound of exactly 0 (neutral) is labelled 'Positive'.
# VADER's README suggests +/-0.05 cut-offs with a neutral band in between;
# the binary rule is kept so the saved labels do not change.
df['sentiment'] = np.where(df['compound'] >= 0 , 'Positive','Negative')

df.head(10)

Unnamed: 0,comment_publishing_date,comment_upvotes,comment_downvotes,comments,neg,neu,pos,compound,sentiment
0,1549015740,3,0,I see a slow but sure slow transition from bmx to mtb!!,0.0,0.718,0.282,0.5475,Positive
1,1549013820,3,0,good,0.0,0.0,1.0,0.4404,Positive
2,1549016400,2,0,Great news. Looking forward to see both Godziek´s styling it out!,0.0,0.695,0.305,0.6588,Positive
3,1549027980,1,0,do you think he will catch the pedals this time?,0.0,1.0,0.0,0.0,Positive
4,1549012560,1,3,Sweet! Did he launch his own bike brand too??,0.0,0.687,0.313,0.565,Positive
5,1548805020,211,0,"So, the top of the range pedal being the 'penthouse flat' Can we assume the new budget pedal will be called the 'council flat?' I wonder if Burgtec would consider this....",0.0,0.933,0.067,0.2682,Positive
6,1548809820,5,0,Ha ha. They definitely should.,0.0,0.211,0.789,0.7579,Positive
7,1548811140,2,2,You beat me to it! It should defo be the council flat,0.0,1.0,0.0,0.0,Positive
8,1548815340,7,11,Naaaah if the penthouse flat is the top of the range then the Wakefield readers wives flats are the budget ones.,0.0,0.917,0.083,0.2023,Positive
9,1548839040,27,7,british joke that falls deaf on american ears,0.0,0.761,0.239,0.296,Positive


### Save as clean PKL-File

In [9]:
# Columns written to the cleaned 2019 data set, in output order
output_columns = [
    "comment_publishing_date",
    "comment_upvotes",
    "comment_downvotes",
    "comments",
    "neg",
    "neu",
    "pos",
    "compound",
    "sentiment",
]
df_clean = df[output_columns]

# Serialise the cleaned frame with pickle
with open('src/2019_clean_pinkbike_comments.pkl', mode = 'wb') as out_file:
    pickle.dump(df_clean, out_file)

----

## Sentiment analysis of 2018 pinkbike comments

In [10]:
# Allow up to 10000 characters per column when a frame is displayed
pd.set_option("display.max_colwidth", 10000)

# Load the raw 2018 comments, discard the "id" column, remove every row that
# still contains a missing value, then keep the four columns used downstream.
df = (
    pd.read_csv('src/2018_pinkbike_comments.csv')
    .drop(columns = "id")
    .dropna(axis = 0)
    .loc[:, ["comment_publishing_date", "comment_upvotes", "comment_downvotes", "comment_content"]]
)

### Text preprocessing and sentiment scoring

In [11]:
# Expand contractions word by word (e.g. "don't" -> "do not"). Splitting on
# whitespace and re-joining with single spaces also collapses newlines, so
# every comment reaches the language model as a single line.
expanded = df["comment_content"].apply(
    lambda text: " ".join(str(contractions.fix(word)) for word in text.split())
)


# Detect language and keep English comments only
pretrained_model = "src/lid.176.bin"
model = fasttext.load_model(pretrained_model)

# `predict` returns (labels, probabilities) with labels such as "__label__en".
# Strip the prefix from the top label instead of slicing the repr of the
# tuple, and map an empty prediction to "" so that such comments are treated
# as non-English and dropped.
langs = []
for sent in expanded:
    labels = model.predict(sent)[0]
    langs.append(labels[0].replace("__label__", "", 1) if len(labels) > 0 else "")
is_english = pd.Series(langs, index = expanded.index, dtype = "object") == "en"

# Keep the English rows; the raw text column is replaced by "comments" below
df = df.loc[is_english].drop(columns = "comment_content")


# Remove URLs from the expanded text and store the result as "comments"
# NOTE(review): both patterns are unanchored, so `www\S+` also trims inside
# ordinary words such as "awwww"; anchor them with `\b` if that is unintended.
df["comments"] = (
    expanded.loc[is_english]
    .replace(r'http\S+', '', regex = True)
    .replace(r'www\S+', '', regex = True)
)


# Vader sentiment analysis: one row of neg / neu / pos / compound scores per
# comment. The explicit column list keeps the frame well-formed even when no
# comment survived the language filter.
df_sentiments = pd.DataFrame(
    [analyzer.polarity_scores(text) for text in df["comments"]],
    columns = ["neg", "neu", "pos", "compound"],
)

# Merge the scores back positionally; the index is reset first so that both
# frames share the same 0..n-1 labels.
df = pd.concat([df.reset_index(drop = True), df_sentiments], axis = 1)

# Convert the compound score into a binary Positive / Negative label.
# NOTE(review): a compound of exactly 0 (neutral) is labelled 'Positive'.
# VADER's README suggests +/-0.05 cut-offs with a neutral band in between;
# the binary rule is kept so the saved labels do not change.
df['sentiment'] = np.where(df['compound'] >= 0 , 'Positive','Negative')

df.head(10)

Unnamed: 0,comment_publishing_date,comment_upvotes,comment_downvotes,comments,neg,neu,pos,compound,sentiment
0,1517781780,1,0,"Hey man. Note: 1-6 month is a forest fire prevention season in Kunming, and some forest trails may not be allowed to enter.But some forest trails are still allowed to enter. You cannot use a fire source.",0.127,0.873,0.0,-0.5859,Negative
1,1517802540,1,0,Hey hoop. Yep I know about the fire prevention. Thankfully they do not close off Bao Zhu. let us ride sometime!,0.097,0.688,0.214,0.4389,Positive
2,1517424960,30,0,Matt Jones' video Frames of Mind video was one of the best from last year.,0.0,0.769,0.231,0.6369,Positive
3,1517441280,18,1,I had literally never considered that they might be brothers. This is embarrassing.,0.191,0.809,0.0,-0.3818,Negative
4,1517444640,15,0,"I actually thought for a while it was the same person, Jono being the nickname or something like that...",0.0,0.865,0.135,0.3612,Positive
5,1517459940,1,0,@bonkywonky: Makes two of dude!,0.0,1.0,0.0,0.0,Positive
6,1517460540,3,0,"I have to say, to my shame, that before this video I never even realized Matt had a sibling at all, especially not a twin... :O",0.123,0.794,0.083,-0.2495,Negative
7,1517460840,3,0,@bonkywonky: I thought that but I did not want to admit it when i found out. Glad I was not the only one,0.139,0.689,0.172,0.4174,Positive
8,1517433840,16,0,2 well rounded lads that have their head on straight..,0.0,0.792,0.208,0.2732,Positive
9,1517429040,14,0,UK riders are badass..,0.0,1.0,0.0,0.0,Positive


### Save as clean PKL-File

In [13]:
# Columns written to the cleaned 2018 data set, in output order
output_columns = [
    "comment_publishing_date",
    "comment_upvotes",
    "comment_downvotes",
    "comments",
    "neg",
    "neu",
    "pos",
    "compound",
    "sentiment",
]
df_clean = df[output_columns]

# Serialise the cleaned frame with pickle
with open('src/2018_clean_pinkbike_comments.pkl', mode = 'wb') as out_file:
    pickle.dump(df_clean, out_file)