### Import libraries

In [1]:
import pandas as pd
import numpy as np
import csv
import nltk.data

### Define useful functions

In [2]:
def determine_travel_type(row):
    if 'Business trip' in row:
        return 'Business'
    elif 'Leisure trip' in row:
        return 'Leisure'
    else:
        return None
    
def determine_traveler_type(row):
    if 'Family with young children' in row:
        return 'Family with young children'
    elif 'Couple' in row:
        return 'Couple'
    elif 'Solo traveller' in row:
        return 'Solo traveler'
    elif 'Group' in row:
        return 'Group'
    elif 'People with friends':
        return 'People with friends'
    else:
        return None
    
def determine_time_of_travel(row):
    row=str(row)
    if ('December' in row) or ('January' in row) or ('February' in row):
        return 'Winter'
    elif ('March' in row) or ('April' in row) or ('May' in row):
        return 'Spring'
    elif ('June' in row) or ('July' in row) or ('August' in row):
        return 'Summer'
    elif ('ptember' in row) or ('October' in row) or ('November' in row):
        return 'Fall'
    else:
        return None
    
def split_reviews(review_item):
    if pd.isnull(review_item):
        return np.nan
    else:
        return tokenizer.tokenize(review_item)

### Fix datasets

In [30]:
df_az_1 = pd.read_csv('./datasets/old_files/sample1_arizona_hotel_reviews_list.csv')
df_az_1['state']='AZ'
df_az_1.drop(['reviewer_name','reviewer_country'],axis=1,inplace=True)
df_az_1.to_csv('./datasets/final_arizona_hotel_reviews_list.csv', index=False)

df_ny_1 = pd.read_csv('./datasets/old_files/sample1_nys_hotel_reviews_list.csv')
df_ny_1['state']='NY'
df_ny_1.drop(['reviewer_name','reviewer_country'],axis=1,inplace=True)

df_ny_2 = pd.read_csv('./datasets/old_files/sample2_nys_hotel_reviews_list.csv')
df_ny_2['state']='NY'
df_ny_2.drop(['reviewer_name','reviewer_country'],axis=1,inplace=True)

df_ny_3 = pd.read_csv('./datasets/old_files/sample3_nys_hotel_reviews_list.csv')
df_ny_3['state']='NY'

df_ny = pd.concat([df_ny_1,df_ny_2,df_ny_3],ignore_index=True)
df_ny.to_csv('./datasets/final_nys_hotel_reviews_list.csv', index=False)

### Import datasets

In [31]:
df_az = pd.read_csv('./datasets/final_arizona_hotel_reviews_list.csv',lineterminator='\n')
df_ny = pd.read_csv('./datasets/final_nys_hotel_reviews_list.csv',lineterminator='\n')

### Combine into one dataset

In [9]:
#df_complete = pd.concat([df_az,df_ny],ignore_index=True)
df_complete = df_ny.copy()

### Split reviews into sentences

In [11]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [12]:
df_complete['review_pos_split']=df_complete['review_pos'].apply(split_reviews)
df_complete['review_neg_split']=df_complete['review_neg'].apply(split_reviews)

### Create a new data point for each sentence

In [13]:
df_complete_pos_col = df_complete.apply(lambda x: pd.Series(x['review_pos_split']),axis=1).stack().reset_index(level=1, drop=True)
df_complete_pos_col.name = 'review_pos_sentence'

df_complete_pos = df_complete.copy()
df_complete_pos.drop('review_pos_split', axis=1, inplace=True)
df_complete_pos = df_complete_pos.join(df_complete_pos_col)

  df_complete_pos_col = df_complete.apply(lambda x: pd.Series(x['review_pos_split']),axis=1).stack().reset_index(level=1, drop=True)


In [14]:
df_complete_neg_col = df_complete.apply(lambda x: pd.Series(x['review_neg_split']),axis=1).stack().reset_index(level=1, drop=True)
df_complete_neg_col.name = 'review_neg_sentence'

df_complete_neg = df_complete.copy()
df_complete_neg.drop('review_neg_split', axis=1, inplace=True)
df_complete_neg = df_complete_neg.join(df_complete_neg_col)

  df_complete_neg_col = df_complete.apply(lambda x: pd.Series(x['review_neg_split']),axis=1).stack().reset_index(level=1, drop=True)


### Clean up dataframes by removing unneeded columns and adding label column

In [15]:
df_complete_pos.drop(['review_pos','review_neg','review_neg_split'], axis=1, inplace=True)
df_complete_pos['review_label']=1

In [16]:
df_complete_neg.drop(['review_pos','review_neg','review_pos_split'], axis=1, inplace=True)
df_complete_neg['review_label']=0

### Remove reviews without text

In [17]:
df_complete_pos = df_complete_pos[~pd.isnull(df_complete_pos['review_pos_sentence'])]

In [18]:
df_complete_neg = df_complete_neg[~pd.isnull(df_complete_neg['review_neg_sentence'])]

### Merge dataframes

In [19]:
df_complete_pos.rename(columns={'review_pos_sentence': 'review_sentence'},inplace=True)
df_complete_neg.rename(columns={'review_neg_sentence': 'review_sentence'},inplace=True)

In [20]:
df_preprocessed = pd.concat([df_complete_pos,df_complete_neg],ignore_index=True)

### Convert review_info_tag column into multiple columns

In [21]:
df_preprocessed['review_info_tag'] = df_preprocessed['review_info_tag'].apply(lambda x:x.strip('[]\'').split('\', \''))

In [22]:
df_preprocessed['review_traveler_type'] = df_preprocessed['review_info_tag'].apply(determine_traveler_type)

In [23]:
df_preprocessed['review_time_of_travel'] = df_preprocessed['review_staydate'].apply(determine_time_of_travel)

### Remove 'Everything' and 'Nothing' reviews

In [24]:
filter_remove_oneliner = (df_preprocessed['review_sentence']=='Everything.') | (df_preprocessed['review_sentence']=='everything.') | (df_preprocessed['review_sentence']=='Everything') | (df_preprocessed['review_sentence']=='everything') | (df_preprocessed['review_sentence']=='Nothing.') | (df_preprocessed['review_sentence']=='nothing.') | (df_preprocessed['review_sentence']=='Nothing') | (df_preprocessed['review_sentence']=='nothing')

In [25]:
# to do: Location, Staff, Breakfast, None, All, N/A, na, ...

In [26]:
df_preprocessed = df_preprocessed[~filter_remove_oneliner]

### Replace '\n','\r' with '.' from review sentences and header content

In [27]:
df_preprocessed['review_sentence'] = df_preprocessed['review_sentence'].apply(lambda x:x.replace('\n', '. ').replace('\r', '. '))
df_preprocessed['review_item_header_content'] = df_preprocessed['review_item_header_content'].apply(lambda x:str(x).replace('\n', '. ').replace('\r', '. '))

### Write out to csv file

In [29]:
df_preprocessed.to_csv('./datasets/df_preprocessed.csv',index=False)