# Sentiment Tagging with Vader

In [64]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nnsplit import NNSplit
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

import re
import pandas as pd

## Input unseen data

### New review

In [65]:
# Sample review text (no punctuation between sentences) used as the unseen input.
new_review = "The scenary is great with a seaview right in front of my eyes However the windows is not clean for me to enjoy the view Food is good and tasty fish is fresh with nice presentation The service quality is good and efficient and the staff are all polite"

### Split into sentences using NNSplit

In [66]:
# Run the English NNSplit model over the stripped review. A one-element list
# is passed in, so the result for this review is read back from index 0.
splitter = NNSplit("en")
sent = splitter.split([new_review.strip()])

# Rebuild each sentence as plain text: every token's text followed by a single
# space (which leaves one trailing space at the end of each sentence).
sent_list = [
    "".join(token.text + " " for token in sentence)
    for sentence in sent[0]
]

sent_list

['The scenary is great with a seaview right in front of my eyes ',
 'However the windows is not clean for me to enjoy the view ',
 'Food is good and tasty fish is fresh with nice presentation ',
 'The service quality is good and efficient and the staff are all polite ']

### Filter sentences with more than 4 words

In [67]:
def number_words(sentence):
    """Count the word tokens in ``sentence``.

    The argument is converted with ``str()`` first, so non-string values are
    counted by their string form. A word token is a maximal run of regex
    word characters.
    """
    word_matches = re.finditer(r'\w+', str(sentence))
    return sum(1 for _ in word_matches)

# Keep only the sentences that contain more than four word tokens.
new_sent_list = [
    candidate for candidate in sent_list if number_words(candidate) > 4
]


### Data Cleaning

In [68]:
def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
    
def lemmatize_text(text):
    """Lower-case ``text``, drop short tokens and lemmatize what remains.

    Tokens come from whitespace splitting; those of two characters or fewer
    are discarded. Each kept token is lemmatized without a POS hint. The
    joined result is printed for inspection and then returned.
    """
    kept_tokens = [tok for tok in text.lower().split() if len(tok) > 2]
    wn_lemmatizer = WordNetLemmatizer()
    lemmatized = " ".join(wn_lemmatizer.lemmatize(tok) for tok in kept_tokens)
    print("Lemma text: ", lemmatized)
    return lemmatized
    
def stem_text(text):
    """Lower-case ``text``, drop short tokens and Porter-stem what remains.

    Tokens come from whitespace splitting; those of two characters or fewer
    are discarded. The joined stems are printed for inspection and then
    returned.
    """
    kept_tokens = [tok for tok in text.lower().split() if len(tok) > 2]
    porter = PorterStemmer()
    stemmed = " ".join(porter.stem(tok) for tok in kept_tokens)
    print("Stemmed text :", stemmed)
    return stemmed
    
def clean_text(text):
    """Normalise a sentence: lower-case, drop stop words, lemmatize with POS.

    The text is split on whitespace only; punctuation and digits are not
    stripped.

    Args:
        text: Sentence to clean.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once per token.
    stop_words = set(stopwords.words('english'))
    # str.split() with no argument never yields empty strings, so no separate
    # empty-token filter is needed.
    tokens = [tok for tok in text.lower().split() if tok not in stop_words]
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]
    # Drop lemmas that are a single character long.
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

In [69]:
# Produce three normalised variants of every filtered sentence: cleaned
# (stop words removed, POS-aware lemmas), plain lemmas, and Porter stems.
clean_sent_list, lemma_sent_list, stem_sent_list = [], [], []
for sent in new_sent_list:
    print("sentence :", sent)
    clean_sent_list.append(clean_text(sent))
    # The next two helpers print their own result before returning it.
    lemma_sent_list.append(lemmatize_text(sent))
    stem_sent_list.append(stem_text(sent))
    print("---------------------------")

sentence : The scenary is great with a seaview right in front of my eyes 
Lemma text:  the scenary great with seaview right front eye
Stemmed text : the scenari great with seaview right front eye
---------------------------
sentence : However the windows is not clean for me to enjoy the view 
Lemma text:  however the window not clean for enjoy the view
Stemmed text : howev the window not clean for enjoy the view
---------------------------
sentence : Food is good and tasty fish is fresh with nice presentation 
Lemma text:  food good and tasty fish fresh with nice presentation
Stemmed text : food good and tasti fish fresh with nice present
---------------------------
sentence : The service quality is good and efficient and the staff are all polite 
Lemma text:  the service quality good and efficient and the staff are all polite
Stemmed text : the servic qualiti good and effici and the staff are all polit
---------------------------


In [70]:
# Show the sentences produced by clean_text for each filtered sentence.
# NOTE(review): the recorded output below matches the stemmed text rather than
# a stop-word-filtered result, so it may be stale; re-run to confirm.
clean_sent_list

['the scenari great with seaview right front eye',
 'howev the window not clean for enjoy the view',
 'food good and tasti fish fresh with nice present',
 'the servic qualiti good and effici and the staff are all polit']

### Convert list to dataframe

In [71]:
# Build a one-column frame from the lemmatized sentences (not the cleaned or
# stemmed variants); each sentence becomes one row.
data = pd.DataFrame(lemma_sent_list, columns=["sentence"])
data

Unnamed: 0,sentence
0,the scenary great with seaview right front eye
1,however the window not clean for enjoy the view
2,food good and tasty fish fresh with nice prese...
3,the service quality good and efficient and the...


### Get polarity

In [72]:
# Shared VADER analyser, created on first use. Building a new
# SentimentIntensityAnalyzer for every sentence repeats its set-up work.
_VADER_ANALYSER = None


def get_polarity(sentence):
    """Return the VADER compound polarity score for ``sentence``.

    Args:
        sentence: Text to score.

    Returns:
        The ``'compound'`` entry of VADER's ``polarity_scores`` result.
    """
    global _VADER_ANALYSER
    if _VADER_ANALYSER is None:
        _VADER_ANALYSER = SentimentIntensityAnalyzer()
    return _VADER_ANALYSER.polarity_scores(sentence)['compound']

# Score every sentence and store the compound value in a new 'polarity' column.
data['polarity'] = data['sentence'].apply(get_polarity)
data

Unnamed: 0,sentence,polarity
0,the scenary great with seaview right front eye,0.6249
1,however the window not clean for enjoy the view,-0.5975
2,food good and tasty fish fresh with nice prese...,0.7906
3,the service quality good and efficient and the...,0.6908


### Aggregated polarity score

In [73]:
# Average the per-sentence compound scores and bucket the mean:
# >= 0.05 is positive, strictly between -0.05 and 0.05 is neutral, and
# everything else is negative.
# NOTE: a NaN mean fails both comparisons and therefore lands in 'negative'.
polarity = data['polarity'].mean()
if polarity >= 0.05:
    label = 'positive'
elif -0.05 < polarity < 0.05:
    label = 'neutral'
else:
    label = 'negative'
sentiment = (label, polarity)
print(sentiment)

('positive', 0.3772)


## Get Vader polarity score

In [None]:
# Shared VADER analyser, created on first use. Building a new
# SentimentIntensityAnalyzer for every sentence repeats its set-up work.
_VADER_ANALYSER = None


def get_polarity(sentence):
    """Return the VADER compound polarity score for ``sentence``.

    Args:
        sentence: Text to score.

    Returns:
        The ``'compound'`` entry of VADER's ``polarity_scores`` result.
    """
    global _VADER_ANALYSER
    if _VADER_ANALYSER is None:
        _VADER_ANALYSER = SentimentIntensityAnalyzer()
    return _VADER_ANALYSER.polarity_scores(sentence)['compound']
    
# Score every sentence and store the compound value in a new 'polarity' column.
data['polarity'] = data['sentence'].apply(get_polarity)
data

In [None]:
def number_words(sentence):
    """Count the word tokens in ``sentence``.

    The argument is converted with ``str()`` first, so non-string values are
    counted by their string form. A word token is a maximal run of regex
    word characters.
    """
    word_matches = re.finditer(r'\w+', str(sentence))
    return sum(1 for _ in word_matches)

# Keep only the rows whose sentence has more than one word token.
length = (data['sentence'].apply(number_words) > 1)
# .copy() detaches the filtered rows from the original frame, so the column
# assignments made later do not trigger pandas' SettingWithCopyWarning.
data = data.loc[length].copy()

## Aggregate the polarity by grouping sentences by review

In [None]:
# Give every row the mean polarity of all rows sharing its 'reviews' value.
# NOTE(review): this needs a 'reviews' column; the frame built earlier in this
# notebook only has 'sentence' and 'polarity' — confirm where it comes from.
data['agg_polarity'] = data.groupby('reviews')['polarity'].transform('mean')

In [None]:
# data = data.drop_duplicates(subset="reviews", keep="first")
# data = data.drop(['sentence','polarity'], axis=1)

In [None]:
# Label each row from its aggregated polarity: >= 0.196725 is positive,
# >= 0.096725 is neutral, anything else (including NaN) is negative.
review_labels = []
for score in data['agg_polarity']:
    if score >= 0.196725:
        review_labels.append("positive")
    elif score >= 0.096725:
        review_labels.append("neutral")
    else:
        review_labels.append("negative")
data["review_sentiment"] = review_labels
data

In [None]:
# Label each sentence from its own polarity: >= 0.1779 is positive,
# >= 0.01779 is neutral, anything else (including NaN) is negative.
sentence_labels = []
for score in data['polarity']:
    if score >= 0.1779:
        sentence_labels.append("positive")
    elif score >= 0.01779:
        sentence_labels.append("neutral")
    else:
        sentence_labels.append("negative")
data["sen_sentiment"] = sentence_labels
data

## Export 

In [None]:
# Write the tagged frame to CSV without the index column.
# NOTE(review): assumes the ./data directory already exists — confirm.
data.to_csv('./data/tagged_sentence_data.csv', index=False)

## Display settings to show all DataFrame rows

In [None]:
# view_data = data.loc[:, "sentence":"sentiment"]
# Order rows from the highest to the lowest aggregated review polarity.
data.sort_values(by=['agg_polarity'], axis=0, inplace=True, ascending=False)

# Lift pandas' display limits so the whole frame is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
# None disables column-text truncation. The old -1 sentinel has been
# deprecated since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
data
# Threshold: if polarity <= 0.1779: Negative
# NOTE(review): the sentence-level labelling in this notebook uses
# >= 0.1779 positive, >= 0.01779 neutral, else negative, so the threshold
# note above looks out of date — confirm.

In [9]:
# Lift pandas' display limits so whole frames are rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
# None disables column-text truncation. The old -1 sentinel has been
# deprecated since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

  This is separate from the ipykernel package so we can avoid doing imports until
