# Sentiment Tagging with Vader

In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nnsplit import NNSplit
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd

## Input unseen data

### New review

In [2]:
new_review = "You When I booked with your company on line you showed me pictures of a room I thought I was getting and paying for and then when we arrived that s room was booked and the staff told me we could only book the villa suite theough them directly Which was completely false advertising After being there we realised that you have grouped lots of rooms on the photos together leaving me the consumer confused and extreamly disgruntled especially as its my my wife s 40th birthday present Please make your website more clear through pricing and photos as again I didn t really know what I was paying for and how much it had wnded up being Your photos told me I was getting something I wasn t Not happy and won t be using you again "

### Split into sentence using NNSplit

In [3]:
sent_list = []
splitter = NNSplit("en")

sent = splitter.split([new_review.strip()])
for i in sent[0]:
    new_string = ''
    for j in i:
        new_string += j.text + " "
    sent_list.append(new_string)
    
sent_list

['You When I booked with your company on line you showed me pictures of a room ',
 'I thought I was getting and paying for and then when we arrived that s room was booked and the staff told me we could only book the villa suite theough them directly ',
 'Which was completely false advertising ',
 'After being there we realised that you have grouped lots of rooms on the photos together ',
 'leaving me ',
 'the consumer confused and extreamly disgruntled especially as its my ',
 'my wife s 40th birthday ',
 'present ',
 'Please make your website more clear through pricing and photos ',
 'as again ',
 'I didn t really know what I was paying for and how much it had wnded up being ',
 'Your photos told me I was getting something I wasn t Not happy and won t be using you again ']

### Data Cleaning

In [35]:
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def clean_text(text):
    # lower text
    text = text.lower()
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text.split() if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

In [39]:
clean_sent_list = []
for sent in sent_list:
    cleaned_text = clean_text(sent)
    clean_sent_list.append(cleaned_text)

#### Convert all words to lower

In [71]:
sent_list_lower = [sent.lower() for sent in sent_list]
sent_list_lower

[' ',
 'you when i booked with your company on line you showed me pictures of a room ',
 'i thought i was getting and paying for and then when we arrived that s room was booked and the staff told me we could only book the villa suite theough them directly ',
 'which was completely false advertising ',
 'after being there we realised that you have grouped lots of rooms on the photos together ',
 'leaving me ',
 'the consumer confused and extreamly disgruntled especially as its my my wife s 40th birthday ',
 'present ',
 'please make your website more clear through pricing and photos ',
 'as again ',
 'i didn t really know what i was paying for and how much it had wnded up being ',
 'your photos told me i was getting something i wasn t not happy and won t be using you again ']

#### Remove Stopwords

In [73]:
stop_list = stopwords.words('english')
sent_list_lower_no_stopword_list = [[word for word in sent.split() if not word in stop_list] for sent in sent_list_lower]
sent_list_lower_no_stopword = []
for sent in sent_list_lower_no_stopword_list:
    new_sent = ' '.join(sent)
    sent_list_lower_no_stopword.append(new_sent)
print(sent_list_lower_no_stopword)

['', 'booked company line showed pictures room', 'thought getting paying arrived room booked staff told could book villa suite theough directly', 'completely false advertising', 'realised grouped lots rooms photos together', 'leaving', 'consumer confused extreamly disgruntled especially wife 40th birthday', 'present', 'please make website clear pricing photos', '', 'really know paying much wnded', 'photos told getting something happy using']


### Convert list to dataframe

In [40]:
data = pd.DataFrame(clean_sent_list, columns=["sentence"])
data

Unnamed: 0,sentence
0,book company line show picture room
1,think get pay arrived room book staff told cou...
2,completely false advertising
3,realise grouped lot room photos together
4,leave
5,consumer confuse extreamly disgruntle especially
6,wife 40th birthday
7,present
8,please make website clear pricing photo
9,


### Get polarity

In [41]:
# This function gets the polarity of reviews using Vader # 
def get_polarity(sentence):
    analyser = SentimentIntensityAnalyzer()
    score = analyser.polarity_scores(sentence)
    compound = score['compound']
    return compound

data['polarity'] = data['sentence'].apply(get_polarity)
data

Unnamed: 0,sentence,polarity
0,book company line show picture room,0.0
1,think get pay arrived room book staff told cou...,-0.1027
2,completely false advertising,0.0
3,realise grouped lot room photos together,0.0
4,leave,-0.0516
5,consumer confuse extreamly disgruntle especially,-0.2263
6,wife 40th birthday,0.0
7,present,0.0
8,please make website clear pricing photo,0.5994
9,,0.0


### Filter sentence above 8 words

In [42]:
def number_words(sentence):
    return len(re.findall(r'\w+', str(sentence)))

length = (data['sentence'].apply(number_words) >= 8)

data = data.loc[length]
data

Unnamed: 0,sentence,polarity
1,think get pay arrived room book staff told cou...,-0.1027


### Aggregated polarity score

In [43]:
polarity = data['polarity'].mean()
if polarity >= 0.05:
    sentiment = ('positive', polarity)
elif polarity > -0.05 and polarity < 0.05: 
    sentiment = ('neutral', polarity)
else: 
    sentiment = ('negative', polarity)
print(sentiment)

('negative', -0.1027)


## Not valid beyond this point

In [None]:
data = pd.read_csv("./data/sentence_data.csv")
data

In [None]:
data.columns = ['city', 'country', 'reviews', 'sentence']
data = data[0:100000]

## Get Vader polarity score

In [44]:
# This function gets the polarity of reviews using Vader # 
def get_polarity(sentence):
    analyser = SentimentIntensityAnalyzer()
    score = analyser.polarity_scores(sentence)
    compound = score['compound']
    return compound
    
data['polarity'] = data['sentence'].apply(get_polarity)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,sentence,polarity
1,think get pay arrived room book staff told cou...,-0.1027


In [45]:
def number_words(sentence):
    return len(re.findall(r'\w+', str(sentence)))

length = (data['sentence'].apply(number_words) > 1)
data = data.loc[length]

## Aggregate the polarity by grouping sentence by reviews

In [46]:
data['agg_polarity'] = data.groupby('reviews')['polarity'].transform('mean')

KeyError: 'reviews'

In [47]:
# data = data.drop_duplicates(subset="reviews", keep="first")
# data = data.drop(['sentence','polarity'], axis=1)

In [None]:
data["review_sentiment"] = ["positive" if x>=0.196725 else ("neutral" if x>=0.096725 else "negative") for x in data['agg_polarity']]
data

In [None]:
data["sen_sentiment"] = ["positive" if x>=0.1779 else ("neutral" if x>=0.01779 else "negative") for x in data['polarity']]
data

## Export 

In [None]:
data.to_csv('./data/tagged_sentence_data.csv', index=False)

## Setting to run to see all dataframe row

In [None]:
# view_data = data.loc[:, "sentence":"sentiment"]
data.sort_values(by=['agg_polarity'], axis=0, inplace=True,ascending=False)

pd.set_option('display.max_rows',None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
data
# Threshold: if polarity <= 0.1779: Negative

In [9]:
pd.set_option('display.max_rows',None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

  This is separate from the ipykernel package so we can avoid doing imports until
