## Import Statement

In [74]:
from nnsplit import NNSplit
import pandas as pd 
import re

## Load data and get all the reviews

In [75]:
# Read the raw hotel reviews (the first CSV column becomes the index) and
# discard every row that contains a missing value.
df = pd.read_csv("../data/RGeo_Hotel_Reviews.csv", index_col=0).dropna()

  interactivity=interactivity, compiler=compiler, result=result)


## Filter the columns and combine positive and negative reviews

In [76]:
# Location columns carried along with each review text.
location_columns = ['city', 'country']

# One frame per review polarity, each holding the text plus its location.
df_negative_review = df.loc[:, ['Negative_Review'] + location_columns]
df_positive_review = df.loc[:, ['Positive_Review'] + location_columns]

# Stack the two frames vertically with a fresh 0..n-1 index. Each row has
# exactly one of the two review columns populated; the other is NaN.
df_uncleaned_reviews = pd.concat(
    [df_negative_review, df_positive_review],
    ignore_index=True,
)

In [77]:
# Collapse the two polarity columns into a single 'Review' column: take the
# negative text where present, otherwise fall back to the positive text.
negative_text = df_uncleaned_reviews['Negative_Review']
positive_text = df_uncleaned_reviews['Positive_Review']
df_uncleaned_reviews['Review'] = negative_text.combine_first(positive_text)

# The per-polarity columns are no longer needed once 'Review' exists.
df_reviews = df_uncleaned_reviews.drop(['Negative_Review', 'Positive_Review'], axis=1)

## Shuffle and slice dataframe

In [78]:
# Shuffle all rows with a fixed seed so the order (and therefore the sampled
# subset below) is reproducible across kernel restarts.
df_reviews = df_reviews.sample(frac=1, random_state=42).reset_index(drop=True)

# Keep at most the first 20,000 shuffled reviews. The slice is assigned back so
# that the downstream cells actually operate on the capped sample rather than
# on the full frame.
df_reviews = df_reviews.iloc[:20000]
df_reviews

Unnamed: 0,city,country,Review
0,Amsterdam,Netherlands,This hotel is awesome I took it sincirely bec...
1,Amsterdam,Netherlands,You When I booked with your company on line y...
2,Amsterdam,Netherlands,Only the park outside of the hotel was beauti...
3,Amsterdam,Netherlands,Comfy bed good location
4,Amsterdam,Netherlands,Apart from the price for the brekfast Everyth...
5,Amsterdam,Netherlands,Backyard of the hotel is total mess shouldn t...
6,Amsterdam,Netherlands,Good restaurant with modern design great chil...
7,Amsterdam,Netherlands,Public areas are lovely and the room was nice...
8,Amsterdam,Netherlands,Transportation was a bit of a pain but on rou...
9,Amsterdam,Netherlands,Great location in nice surroundings the bar a...


## Split reviews into sentences using NNSplit

In [79]:
def split_review(reviews, splitter=None):
    """Split one review text into a list of sentence strings.

    Args:
        reviews: The review text (a single string). Leading and trailing
            whitespace is stripped before splitting.
        splitter: Optional object exposing ``split(list_of_texts)`` that
            returns, per text, an iterable of sentences, each an iterable of
            tokens with a ``.text`` attribute. When omitted, a shared
            ``NNSplit('en')`` instance is created on first use and reused on
            every later call.

    Returns:
        A list of strings, one per detected sentence, with the sentence's
        token texts joined by single spaces.
    """
    if splitter is None:
        # Constructing the splitter is expensive, and this function is
        # applied once per review, so build the default instance only once
        # and cache it on the function object.
        splitter = getattr(split_review, '_default_splitter', None)
        if splitter is None:
            splitter = NNSplit('en')
            split_review._default_splitter = splitter

    # The splitter works on batches; wrap the single review in a list and
    # take the first (only) entry of the result.
    sentences = splitter.split([reviews.strip()])[0]
    return [
        ' '.join(token.text for token in sentence).strip()
        for sentence in sentences
    ]
        
# Run the sentence splitter over every review; each cell of 'sent_list' then
# holds the list of sentences returned by split_review for that review.
review_sentences = df_reviews['Review'].map(split_review)
df_reviews['sent_list'] = review_sentences

## Split the list into different rows of sentences in the dataframe

In [80]:
# One entry per sentence: explode() emits a row for every element of each
# review's sentence list, repeating that review's index label so the result
# can be joined back onto df_reviews. A review with an empty list produces a
# single NaN, which dropna() removes so it contributes no rows.
s = df_reviews['sent_list'].explode().dropna()

In [100]:
# Name the exploded Series so the join below produces a 'sent_list' column.
s.name = 'sent_list'

# Swap the list-valued column for the one-sentence-per-row Series. The join
# aligns on the review index, so each review row is repeated once per sentence.
reviews_without_lists = df_reviews.drop(columns='sent_list')
df_reviews_sentence = reviews_without_lists.join(s)
df_reviews_sentence['sent_list'] = df_reviews_sentence['sent_list'].astype(object)

# Move the (repeated) review index into a regular 'index' column and give each
# sentence row its own position.
df_reviews_sentence = df_reviews_sentence.reset_index()

# Display-only view without 'index' and 'Review'. The result is not assigned,
# so both columns remain in df_reviews_sentence for the following cells.
df_reviews_sentence.drop(columns=['index', 'Review'])

Unnamed: 0,city,country,sent_list
0,Amsterdam,Netherlands,This hotel is awesome
1,Amsterdam,Netherlands,I took it sincirely because a bit cheaper but ...
2,Amsterdam,Netherlands,Arrive in the city are like 10 minutes by tram...
3,Amsterdam,Netherlands,The hotel inside is awesome and really cool an...
4,Amsterdam,Netherlands,I ll come back for sure there
...,...,...,...
207,Amsterdam,Netherlands,The true beauty of the building has been kept ...
208,Amsterdam,Netherlands,Also the bath was lovely and big and inviting ...
209,Amsterdam,Netherlands,Restaurant menu was a bit pricey but there wer...
210,Amsterdam,Netherlands,Would recommend this hotel to anyone it s unbe...


## Keep only sentences with at least 8 words

In [101]:
def number_words(sentence):
    """Return how many words ``sentence`` contains.

    The argument is first converted with ``str()``, so non-string values are
    counted by their string form. A word is any maximal run of regex word
    characters (letters, digits, underscore).
    """
    text = str(sentence)
    return sum(1 for _ in re.finditer(r'\w+', text))

# Boolean mask: True for sentences that contain 8 or more words.
sentence_word_counts = df_reviews_sentence['sent_list'].map(number_words)
length = sentence_word_counts.ge(8)

# Keep only the rows selected by the mask, then display the result.
df_reviews_sentence = df_reviews_sentence[length]
df_reviews_sentence

Unnamed: 0,index,city,country,Review,sent_list
1,0,Amsterdam,Netherlands,This hotel is awesome I took it sincirely bec...,I took it sincirely because a bit cheaper but ...
2,0,Amsterdam,Netherlands,This hotel is awesome I took it sincirely bec...,Arrive in the city are like 10 minutes by tram...
3,0,Amsterdam,Netherlands,This hotel is awesome I took it sincirely bec...,The hotel inside is awesome and really cool an...
5,0,Amsterdam,Netherlands,This hotel is awesome I took it sincirely bec...,The staff very gentle one Spanish man really r...
6,1,Amsterdam,Netherlands,You When I booked with your company on line y...,You When I booked with your company on line yo...
...,...,...,...,...,...
207,46,Amsterdam,Netherlands,Rooms were stunningly decorated and really sp...,The true beauty of the building has been kept ...
208,46,Amsterdam,Netherlands,Rooms were stunningly decorated and really sp...,Also the bath was lovely and big and inviting ...
209,46,Amsterdam,Netherlands,Rooms were stunningly decorated and really sp...,Restaurant menu was a bit pricey but there wer...
210,46,Amsterdam,Netherlands,Rooms were stunningly decorated and really sp...,Would recommend this hotel to anyone it s unbe...


## Write to CSV after NNSplit

In [103]:
# Persist the filtered sentence-level frame; the DataFrame index is omitted
# from the file.
sentence_csv_path = '../data/sentence_data.csv'
df_reviews_sentence.to_csv(sentence_csv_path, index=False)