### Import libraries

In [1]:
import pandas as pd
import numpy as np
import csv
import nltk.data
import string

### Define useful functions

In [2]:
def determine_travel_type(row):
    """Classify a review's tags as a business or leisure trip.

    ``row`` is any container (or string) supporting ``in``. Returns
    'Business' when it contains 'Business trip', 'Leisure' when it contains
    'Leisure trip' (business is checked first), and None otherwise.
    """
    trip_labels = (
        ('Business trip', 'Business'),
        ('Leisure trip', 'Leisure'),
    )
    for trip_tag, label in trip_labels:
        if trip_tag in row:
            return label
    return None
    
def determine_traveler_type(row):
    """Pick the traveler category mentioned in a review's tags.

    Args:
        row: Container (or string) supporting ``in`` that holds the
            review's tags.

    Returns:
        The first matching category, tested in this order:
        'Family with young children', 'Couple', 'Solo traveler' (for the
        tag 'Solo traveller'), 'Group', 'People with friends'.
        None when no known tag is present.
    """
    # (tag looked for in the row, category returned); order is significant
    # because the first match wins.
    traveler_tags = (
        ('Family with young children', 'Family with young children'),
        ('Couple', 'Couple'),
        ('Solo traveller', 'Solo traveler'),
        ('Group', 'Group'),
        # The membership test here used to be missing, so this branch was
        # always taken and rows without any known tag were mislabelled.
        ('People with friends', 'People with friends'),
    )
    for tag, category in traveler_tags:
        if tag in row:
            return category
    return None
    
def determine_time_of_travel(row):
    """Map a stay-date value to the season of the month it mentions.

    The value is converted to text first, so non-string inputs are accepted.
    Seasons are tested in the order Winter, Spring, Summer, Fall and the
    first season with a matching month name wins.

    Returns 'Winter', 'Spring', 'Summer' or 'Fall', or None when no month
    name is found in the text.
    """
    stay_text = str(row)
    season_markers = (
        ('Winter', ('December', 'January', 'February')),
        ('Spring', ('March', 'April', 'May')),
        ('Summer', ('June', 'July', 'August')),
        # 'ptember' is a substring match, so it is found in both
        # 'September' and 'september'.
        ('Fall', ('ptember', 'October', 'November')),
    )
    for season, markers in season_markers:
        if any(marker in stay_text for marker in markers):
            return season
    return None

def remove_new_line(review_item):
    """Replace every line feed and carriage return in a review with '. '.

    Missing values (anything ``pd.isnull`` reports as null) are returned as
    ``np.nan``; otherwise the cleaned text is returned.
    """
    if pd.isnull(review_item):
        return np.nan

    cleaned = review_item
    for line_break in ('\n', '\r'):
        cleaned = cleaned.replace(line_break, '. ')
    return cleaned
    
def split_reviews(review_item):
    """Split a review into sentences with the module-level ``tokenizer``.

    Returns ``np.nan`` for missing input; otherwise returns whatever
    ``tokenizer.tokenize`` produces for the text. ``tokenizer`` must be
    defined before this function is called.
    """
    return np.nan if pd.isnull(review_item) else tokenizer.tokenize(review_item)
    
def text_process_prelim(review_sentence):
    """Strip ASCII punctuation from a review sentence.

    Args:
        review_sentence: The sentence to clean. Non-string values are
            converted with ``str()`` before stripping.

    Returns:
        The text with every character in ``string.punctuation`` removed, or
        ``np.nan`` when the input is missing (None/NaN) or when nothing is
        left once the punctuation is gone.
    """
    # Keep missing values missing: str(np.nan) / str(None) would otherwise
    # turn them into the literal sentences 'nan' / 'None'. The scalar check
    # avoids an ambiguous truth value if a list-like is ever passed in.
    if pd.api.types.is_scalar(review_sentence) and pd.isnull(review_sentence):
        return np.nan

    # A translation table that deletes each punctuation character in one pass.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    nopunc = str(review_sentence).translate(strip_punctuation)

    if nopunc == '':
        return np.nan
    return nopunc

### Import complete dataset of all reviews

In [41]:
# Load the sample review dataset. Later cells read its 'review_pos',
# 'review_neg', 'review_info_tag', 'review_staydate' and
# 'review_item_header_content' columns.
df_complete  = pd.read_csv('./datasets/sample0_nys_hotel_reviews_list.csv')

### Replace '\n' and '\r' in review sentences

In [43]:
# Flatten embedded line breaks in both review columns so that each review is
# a single line of text before it is split into sentences.
for review_column in ('review_pos', 'review_neg'):
    df_complete[review_column] = df_complete[review_column].apply(remove_new_line)

### Split reviews into sentences

In [44]:
# Load the English Punkt sentence tokenizer from NLTK's data files;
# split_reviews() reads this module-level name.
# NOTE(review): needs the 'punkt' resource installed locally, and newer NLTK
# releases may no longer ship this pickle file - confirm against the
# installed NLTK version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [45]:
# Tokenize each review into a list of sentences, stored next to the source
# column as '<column>_split' (missing reviews stay NaN).
for review_column in ('review_pos', 'review_neg'):
    df_complete[review_column + '_split'] = df_complete[review_column].apply(split_reviews)

### Create a new data point for each sentence

In [46]:
# Turn each row's list of positive sentences into one long Series that keeps
# the original row index (one entry per sentence).
df_complete_pos_col = (
    df_complete
    .apply(lambda review_row: pd.Series(review_row['review_pos_split']), axis=1)
    .stack()
    .reset_index(level=1, drop=True)
    .rename('review_pos_sentence')
)

# Joining on the index repeats the review's other columns for every sentence.
df_complete_pos = df_complete.drop('review_pos_split', axis=1).join(df_complete_pos_col)

  df_complete_pos_col = df_complete.apply(lambda x: pd.Series(x['review_pos_split']),axis=1).stack().reset_index(level=1, drop=True)


In [47]:
# Turn each row's list of negative sentences into one long Series that keeps
# the original row index (one entry per sentence).
df_complete_neg_col = (
    df_complete
    .apply(lambda review_row: pd.Series(review_row['review_neg_split']), axis=1)
    .stack()
    .reset_index(level=1, drop=True)
    .rename('review_neg_sentence')
)

# Joining on the index repeats the review's other columns for every sentence.
df_complete_neg = df_complete.drop('review_neg_split', axis=1).join(df_complete_neg_col)

  df_complete_neg_col = df_complete.apply(lambda x: pd.Series(x['review_neg_split']),axis=1).stack().reset_index(level=1, drop=True)


### Clean up dataframes by removing unneeded columns and adding label column

In [48]:
# Keep only the per-sentence column from the review text columns and mark
# every row of this frame as positive (label 1).
df_complete_pos = (
    df_complete_pos
    .drop(['review_pos', 'review_neg', 'review_neg_split'], axis=1)
    .assign(review_label=1)
)

In [49]:
# Keep only the per-sentence column from the review text columns and mark
# every row of this frame as negative (label 0).
df_complete_neg = (
    df_complete_neg
    .drop(['review_pos', 'review_neg', 'review_pos_split'], axis=1)
    .assign(review_label=0)
)

### Remove reviews without text

In [50]:
# Discard rows that ended up without a positive sentence.
has_pos_sentence = df_complete_pos['review_pos_sentence'].notnull()
df_complete_pos = df_complete_pos[has_pos_sentence]

In [51]:
# Discard rows that ended up without a negative sentence.
has_neg_sentence = df_complete_neg['review_neg_sentence'].notnull()
df_complete_neg = df_complete_neg[has_neg_sentence]

### Merge dataframes

In [52]:
# Give both frames the same sentence column name so they can be stacked
# row-wise into a single frame.
df_complete_pos = df_complete_pos.rename(columns={'review_pos_sentence': 'review_sentence'})
df_complete_neg = df_complete_neg.rename(columns={'review_neg_sentence': 'review_sentence'})

In [82]:
# Stack the positive and negative sentence frames into one frame;
# ignore_index=True replaces the duplicated per-review index with a fresh
# 0..n-1 index.
df_preprocessed = pd.concat([df_complete_pos,df_complete_neg],ignore_index=True)

### Derive 'review_traveler_type' from 'review_info_tag' and 'review_time_of_travel' from 'review_staydate'

In [83]:
# Turn each tag string into a list of tags: strip any leading/trailing
# '[', ']' and "'" characters, then split on the "', '" separator.
# NOTE(review): presumably the column holds the text form of a Python list
# of quoted strings; a missing (NaN) value would raise here because floats
# have no .strip() - confirm the column is always populated.
df_preprocessed['review_info_tag'] = df_preprocessed['review_info_tag'].apply(lambda x:x.strip('[]\'').split('\', \''))

In [84]:
# Reduce each review's tag list to a single traveler category
# (None when determine_traveler_type finds no match).
df_preprocessed['review_traveler_type'] = df_preprocessed['review_info_tag'].apply(determine_traveler_type)

In [85]:
# Derive the season of the stay from the month name found in
# 'review_staydate' (None when no month name is found).
df_preprocessed['review_time_of_travel'] = df_preprocessed['review_staydate'].apply(determine_time_of_travel)

### Remove one-word filler reviews ('Everything', 'Nothing', 'None', 'All', 'N/A' and their variants)

In [86]:
# Drop sentences that are exactly one of the filler answers below. The match
# is exact and case-sensitive, so only the listed spellings are removed.
filter_remove_oneliner_1 = df_preprocessed['review_sentence'].isin([
    'Everything.', 'everything.', 'Everything', 'everything',
    'Nothing.', 'nothing.', 'Nothing', 'nothing',
])
df_preprocessed = df_preprocessed[~filter_remove_oneliner_1]

filter_remove_oneliner_2 = df_preprocessed['review_sentence'].isin([
    'None.', 'none.', 'None', 'none',
    'All.', 'all.', 'All', 'all',
])
df_preprocessed = df_preprocessed[~filter_remove_oneliner_2]

filter_remove_oneliner_3 = df_preprocessed['review_sentence'].isin([
    'N/A.', 'N/A', 'n/a.', 'n/a',
    'NA.', 'na.', 'NA', 'na',
    'Na', 'Na.', 'N/a.',
])
df_preprocessed = df_preprocessed[~filter_remove_oneliner_3]

### Replace '\n' and '\r' with '. ' in header content

In [88]:
# Replace line feeds and carriage returns in the header text with '. '.
# str(x) makes this safe for non-string values, but it also means a missing
# (NaN) header becomes the literal text 'nan'.
df_preprocessed['review_item_header_content'] = df_preprocessed['review_item_header_content'].apply(lambda x:str(x).replace('\n', '. ').replace('\r', '. '))

### Write out to csv file as intermediate step

In [90]:
# Checkpoint the sentence-level frame without its index column. This runs
# before punctuation removal, so the saved sentences keep their punctuation.
df_preprocessed.to_csv('./datasets/df_preprocessed.csv',index=False)

### Remove punctuation

In [None]:
# Strip punctuation from every sentence; a sentence made up only of
# punctuation comes back from text_process_prelim as NaN.
df_preprocessed['review_sentence'] = df_preprocessed['review_sentence'].apply(text_process_prelim)

In [None]:
# Apply the same punctuation stripping to the review header text.
df_preprocessed['review_item_header_content'] = df_preprocessed['review_item_header_content'].apply(text_process_prelim)

### Remove reviews that are now empty

In [None]:
# Keep only the rows whose sentence is still non-null.
has_sentence_text = df_preprocessed['review_sentence'].notnull()
df_preprocessed = df_preprocessed[has_sentence_text]

In [None]:
# Number of sentence rows remaining (shown as the cell's output).
len(df_preprocessed)

### Write out negative review sentences to csv file

In [None]:
# Select the negative-label sentences (review_label == 0). The explicit
# .copy() makes this an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning about writing to a slice.
df_preprocessed_neg_sentences = df_preprocessed[df_preprocessed['review_label']==0].copy()
# Empty placeholder column; it is written out as blank cells in the CSV.
df_preprocessed_neg_sentences['review_topic'] = np.nan
df_preprocessed_neg_sentences.to_csv('./datasets/df_negative_sentences.csv',index=False)

### Write out positive review sentences to csv file

In [None]:
# Select the positive-label sentences (review_label == 1). The explicit
# .copy() makes this an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning about writing to a slice.
df_preprocessed_pos_sentences = df_preprocessed[df_preprocessed['review_label']==1].copy()
# Empty placeholder column; it is written out as blank cells in the CSV.
df_preprocessed_pos_sentences['review_topic'] = np.nan
df_preprocessed_pos_sentences.to_csv('./datasets/df_positive_sentences.csv',index=False)