## Import Statements

In [1]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nnsplit import NNSplit
import pandas as pd 
import re

## Load data and get all the reviews

In [2]:
# Read the hotel-review CSV; its first column becomes the row index.
df = pd.read_csv(
    "./data/RGeo_Hotel_Reviews.csv",
    index_col=0,
)

## Filter the columns and combine the positive and negative reviews

In [3]:
# One frame per review type, each keeping the city and country columns,
# stacked vertically under a fresh 0..n-1 index.
df_negative_review = df.loc[:, ['Negative_Review', 'city', 'country']]
df_positive_review = df.loc[:, ['Positive_Review', 'city', 'country']]
df_uncleaned_reviews = pd.concat(
    (df_negative_review, df_positive_review),
    ignore_index=True,
)

In [4]:
# Preview the first rows of the stacked frame.
df_uncleaned_reviews.head()

Unnamed: 0,Negative_Review,city,country,Positive_Review
0,I am so angry that i made this post available...,Amsterdam,Netherlands,
1,No Negative,Amsterdam,Netherlands,
2,Rooms are nice but for elderly a bit difficul...,Amsterdam,Netherlands,
3,My room was dirty and I was afraid to walk ba...,Amsterdam,Netherlands,
4,You When I booked with your company on line y...,Amsterdam,Netherlands,


In [5]:
# Merge the two review columns into a single 'Review' column: take the
# negative text where present, otherwise fall back to the positive text.
negative_text = df_uncleaned_reviews['Negative_Review']
positive_text = df_uncleaned_reviews['Positive_Review']
df_uncleaned_reviews['Review'] = negative_text.combine_first(positive_text)

# The source columns are no longer needed once 'Review' exists.
df_reviews = df_uncleaned_reviews.drop(['Negative_Review', 'Positive_Review'], axis=1)

In [6]:
# Shuffle all rows and renumber the index. The fixed seed makes the shuffle
# (and every result derived from the row order) reproducible across runs.
df_reviews = df_reviews.sample(frac=1, random_state=42).reset_index(drop=True)
df_reviews[0:100]

Unnamed: 0,city,country,Review
0,Milan,Italy,No Negative
1,Barri Gòtic,Spain,Excellent location and extremely helpful staf...
2,Barnsbury,United Kingdom,We asked for a quiet room and that is just wh...
3,Poplar,United Kingdom,No Negative
4,Kensington,United Kingdom,To reach the room with heavy bags was like go...
...,...,...,...
95,Dreta de l'Eixample,Spain,Lovely lobby with free refreshments inc Cava ...
96,Amsterdam,Netherlands,Staff was extremely helpful Great location fo...
97,Amsterdam,Netherlands,The breakfast does not have a large selection
98,Amsterdam,Netherlands,No Negative


## Splitting reviews into sentences using NNSplit

In [7]:
# NNSplit instances keyed by language code, so a splitter is constructed once
# and reused for every review instead of being rebuilt on each call.
_NNSPLIT_CACHE = {}


def _get_splitter(language='en'):
    """Return the cached NNSplit instance for `language`, creating it on first use."""
    if language not in _NNSPLIT_CACHE:
        _NNSPLIT_CACHE[language] = NNSplit(language)
    return _NNSPLIT_CACHE[language]


def split_review(reviews):
    """Split a single review into a list of sentence strings.

    Parameters
    ----------
    reviews : str
        Text of one review. Leading/trailing whitespace is ignored.

    Returns
    -------
    list of str
        One entry per sentence found by NNSplit; each sentence is its tokens'
        text joined by single spaces. Non-string input (e.g. NaN from a
        missing review) and blank text yield an empty list.
    """
    # Missing reviews arrive as float NaN, which has no .strip().
    if not isinstance(reviews, str):
        return []
    text = reviews.strip()
    if not text:
        return []
    # split() takes a batch of texts; only one is passed, so read result [0].
    sentences = _get_splitter('en').split([text])[0]
    return [
        ' '.join(token.text for token in sentence).strip()
        for sentence in sentences
    ]
        
# Sentence-split every review; each cell of 'sent_list' holds a list of sentences.
df_reviews['sent_list'] = df_reviews['Review'].map(split_review)

KeyboardInterrupt: 

## Split the list into different rows of sentences in the dataframe

In [None]:
# One row per sentence, each keeping its review's index label so it can be
# joined back onto df_reviews. explode() avoids building a pd.Series per row;
# dropna() removes the NaN placeholder explode() emits for empty lists, so
# reviews without sentences contribute no rows (as with the former stack()).
s = df_reviews['sent_list'].explode().dropna()

In [None]:
# Name the sentence Series so join() creates a 'sent_list' column from it.
s.name = 'sent_list'

# Swap the per-review list column for the one-sentence-per-row values.
reviews_without_lists = df_reviews.drop(columns='sent_list')
df_reviews_sentence = reviews_without_lists.join(s)
df_reviews_sentence['sentence'] = df_reviews_sentence['sent_list'].astype(object)

# Move the (repeated) review index into an 'index' column and renumber rows.
df_reviews_sentence = df_reviews_sentence.reset_index()

# NOTE(review): this drop is only displayed, not stored - the frame itself
# still carries 'index' and 'Review'. Confirm whether that is intended.
df_reviews_sentence.drop(['index', 'Review'], axis=1)

## Keep only sentences with at least 8 words

In [None]:
# Inspect the sentence column before filtering.
df_reviews_sentence['sentence']

In [None]:
def number_words(sentence):
    """Count the word tokens in `sentence`.

    The value is converted with str() first, so non-string input is counted
    on its string form. A word token is a maximal run of `\\w` characters.
    """
    text = str(sentence)
    return sum(1 for _ in re.finditer(r'\w+', text))

# Boolean mask: True where the sentence has at least 8 word tokens.
length = df_reviews_sentence['sentence'].map(number_words).ge(8)

In [None]:
# Keep only the rows selected by the `length` mask. The explicit copy makes
# the result an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df_reviews_sentence = df_reviews_sentence.loc[length].copy()
df_reviews_sentence

## Export the sentences to a CSV file

In [None]:
# Export the filtered one-sentence-per-row frame. The previous code wrote
# df_reviews (whole reviews plus the list-valued 'sent_list' column), which
# contains none of the sentence splitting or length filtering done above.
df_reviews_sentence.to_csv('./data/cleaned_hotelreviews.csv', index=False)

## Get polarity scores with VADER

In [None]:
# Shared VADER analyser, built on first use so it is not re-created for
# every sentence that is scored.
_VADER_ANALYSER = None


def _get_analyser():
    """Return the shared SentimentIntensityAnalyzer, building it on first use."""
    global _VADER_ANALYSER
    if _VADER_ANALYSER is None:
        _VADER_ANALYSER = SentimentIntensityAnalyzer()
    return _VADER_ANALYSER


def get_polarity(sentence):
    """Classify the polarity of `sentence` using VADER's compound score.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    tuple
        `(label, compound)`, where `compound` is the 'compound' entry of
        `polarity_scores` and `label` is 'positive' when compound >= 0.05,
        'neutral' when -0.05 < compound < 0.05, and 'negative' otherwise.
    """
    compound = _get_analyser().polarity_scores(sentence)['compound']
    if compound >= 0.05:
        return ('positive', compound)
    elif compound > -0.05:
        # compound < 0.05 already holds here, so this is the neutral band.
        return ('neutral', compound)
    else:
        return ('negative', compound)
    
# Score every sentence; each 'polarity' cell is a (label, compound) tuple.
sentence_polarity = df_reviews_sentence['sentence'].map(get_polarity)
df_reviews_sentence['polarity'] = sentence_polarity
df_reviews_sentence

In [None]:
# Debug check: print the Python type of the 'polarity' column object.
print(type(df_reviews_sentence['polarity']))

In [None]:
# 'polarity' holds (label, compound) tuples, so compare the label (element 0).
# Comparing the whole tuple to the string 'positive' never matches and
# would always return an empty frame.
df_reviews_sentence[df_reviews_sentence['polarity'].str[0] == 'positive']