In [1]:
# Import modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from comet_ml import Experiment
from sklearn import metrics

In [2]:
# Load the API keys from a local JSON file into the `secret` dict
import json

with open('data/secret.json') as f:
    secret = json.loads(f.read())

In [3]:
# Seed value chosen for the project; meant to be passed to every randomised step
random_state = 42

In [4]:
# Read the hotel reviews CSV into a DataFrame and preview its first 5 rows
hotels_df = pd.read_csv('data/hotels.csv')
hotels_df.head()

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,days_since_review,lat,lng
0,Stratton Street Mayfair Westminster Borough Lo...,581,2/19/2016,8.4,The May Fair Hotel,United Kingdom,Leaving,3,1994,Staff were amazing,4,7,10.0,"[' Leisure trip ', ' Couple ', ' Studio Suite ...",531 day,51.507894,-0.143671
1,130 134 Southampton Row Camden London WC1B 5AF...,299,1/12/2017,8.3,Mercure London Bloomsbury Hotel,United Kingdom,poor breakfast,3,1361,location,2,14,6.3,"[' Business trip ', ' Couple ', ' Standard Dou...",203 day,51.521009,-0.123097
2,151 bis Rue de Rennes 6th arr 75006 Paris France,32,10/18/2016,8.9,Legend Saint Germain by Elegancia,China,No kettle in room,6,406,No Positive,0,14,7.5,"[' Leisure trip ', ' Solo traveler ', ' Modern...",289 day,48.845377,2.325643
3,216 Avenue Jean Jaures 19th arr 75019 Paris Fr...,34,9/22/2015,7.5,Mercure Paris 19 Philharmonie La Villette,United Kingdom,No Negative,0,607,Friendly staff quiet comfortable room spotles...,11,8,10.0,"[' Leisure trip ', ' Solo traveler ', ' Standa...",681 day,48.888697,2.39454
4,Molenwerf 1 1014 AG Amsterdam Netherlands,914,3/5/2016,8.5,Golden Tulip Amsterdam West,Poland,Torn sheets,4,7586,The staff was very friendly and helpful Break...,20,10,9.6,"[' Business trip ', ' Couple ', ' Standard Dou...",516 day,52.385601,4.84706


In [5]:
# Report the frame's dimensions, then the dtype and non-null count of each column
print(f'Data frame size: {hotels_df.shape}')
print()
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only appended a stray "None" to the cell output
hotels_df.info()

Data frame size: (386803, 17)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386803 entries, 0 to 386802
Data columns (total 17 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   hotel_address                               386803 non-null  object 
 1   additional_number_of_scoring                386803 non-null  int64  
 2   review_date                                 386803 non-null  object 
 3   average_score                               386803 non-null  float64
 4   hotel_name                                  386803 non-null  object 
 5   reviewer_nationality                        386803 non-null  object 
 6   negative_review                             386803 non-null  object 
 7   review_total_negative_word_counts           386803 non-null  int64  
 8   total_number_of_reviews                     386803 non-null  int64  
 9   positive_review                        

None

In [6]:
# Percentage of missing values, reported separately for latitude and longitude
for coord_col in ('lat', 'lng'):
    null_share = hotels_df[coord_col].isnull().sum() / hotels_df.shape[0] * 100
    print(f"{coord_col} column have {round(null_share, 2)}% nulls")

lat column have 0.63% nulls
lng column have 0.63% nulls


In [7]:
# Count how many distinct hotel names occur in the data
unique_hotels = hotels_df['hotel_name'].nunique()
print(f"Number of unique hotle names: {unique_hotels}")

Number of unique hotle names: 1492


In [8]:
# Parse the month/day/year strings in review_date into datetime values
review_dates = pd.to_datetime(hotels_df['review_date'], format='%m/%d/%Y')
hotels_df['review_date'] = review_dates
# Report the period covered by the reviews
print(f'Last review was: {review_dates.max().date()}')
print(f'First review was: {review_dates.min().date()}')

Last review was: 2017-08-03
First review was: 2015-08-04


In [None]:
# # Checking and clean data from duplicates
# print(f'Duplicates in data: {hotels_df.duplicated().sum()}')

# hotels_df = hotels_df.drop_duplicates()

In [None]:
# Converting string with tags to list in dataframe
def get_tags_list(tags_string):
    """Turn the string form of a tag list into a list of clean tag names.

    The surrounding square brackets are stripped, the remainder is split on
    ", ", and quotes and spaces are trimmed from both ends of every piece.
    """
    inner = tags_string.strip("[]")
    return [piece.strip("' ") for piece in inner.split(", ")]

# Lower-case the raw tag strings, then convert each one into a list of tags
hotels_df['tags'] = hotels_df['tags'].str.lower().apply(get_tags_list)
# Gather every tag occurrence and the set of distinct tags
all_tags = [tag for row_tags in hotels_df['tags'] for tag in row_tags]
unique_tags = set(all_tags)

In [None]:
# Display the number of unique tags
print(f'Number of unique tags: {len(unique_tags)}')
# Display the tags that occur more than 10000 times, most frequent first
all_tags = pd.Series(all_tags)
tag_counts = all_tags.value_counts()
tag_counts[tag_counts > 10000]

Some tags contain the number of nights stayed. Let's extract that information into a new column and remove those tags from the tag lists.

In [None]:
# Finding number of stayed nights from tags
def get_night_numbers(tags):
    """Return the number of nights stayed recorded in a review's tags.

    A stay tag is one that starts with 'stayed' and has a number as its
    second word (e.g. 'stayed 3 nights'). This is the same prefix test that
    remove_stayed_night uses, so a tag that merely contains the word
    'stayed' is not mistaken for a stay tag.

    Args:
        tags: iterable of tag strings.

    Returns:
        The number of nights as an int, or None when no stay tag with a
        numeric second word is present.
    """
    for tag in tags:
        if not tag.startswith('stayed'):
            continue
        words = tag.split()
        # Skip malformed stay tags instead of raising from int()
        if len(words) > 1 and words[1].isdigit():
            return int(words[1])
    return None
        
# Extract the number of nights once and store it as a new column
# (the column name keeps its original spelling because later cells refer to it)
hotels_df['nights_satyed'] = hotels_df['tags'].apply(get_night_numbers)
# Check the extracted values
print(f"Unique recieved values: {hotels_df['nights_satyed'].unique()}")

# Cleaning tags from stayed nights information
def remove_stayed_night(tags):
    """Return a new list holding the tags that do not begin with 'stayed'."""
    kept = []
    for tag in tags:
        if tag.startswith('stayed'):
            continue
        kept.append(tag)
    return kept

# Replace every tag list with a copy that no longer contains the stay-length tag
hotels_df['tags'] = hotels_df['tags'].map(remove_stayed_night)

In [None]:
# Count the reviews for which no stay length could be extracted
missing_nights = hotels_df['nights_satyed'].isna().sum()
print(f"{missing_nights} nan's")

In [None]:
# Fill missing nights_satyed values with the most frequent value of the same hotel
def _most_common_nights(nights):
    """Return the mode of `nights`, or NaN when the group has no known value."""
    modes = nights.mode()  # mode() skips NaN, so it is empty for an all-NaN group
    return modes.iloc[0] if not modes.empty else float('nan')

# One pass over the groups instead of re-filtering the whole frame per hotel
hotel_mode = hotels_df.groupby('hotel_name')['nights_satyed'].transform(_most_common_nights)
hotels_df['nights_satyed'] = hotels_df['nights_satyed'].fillna(hotel_mode)

# A hotel with no recorded stay length at all previously raised on .mode()[0];
# such rows now fall back to the most frequent value of the whole column
overall_mode = hotels_df['nights_satyed'].mode()
if not overall_mode.empty:
    hotels_df['nights_satyed'] = hotels_df['nights_satyed'].fillna(overall_mode.iloc[0])

In [None]:
# Collect all remaining tags to find the most popular ones
all_tags = pd.Series([tag for row_tags in hotels_df['tags'] for tag in row_tags])
# Count once and reuse the result. NOTE: the slice keeps the 19 most frequent
# tags (not 20); the size is unchanged because later cells name these columns
top_tag_counts = all_tags.value_counts().head(19)
display(top_tag_counts)
# Names of the 19 most frequent tags
top_tags_list = top_tag_counts.index

In [None]:
# Add one 0/1 indicator column per popular tag
for tag in top_tags_list:
    hotels_df[tag] = hotels_df['tags'].map(lambda row_tags: int(tag in row_tags))

In [None]:
# Take the last word of the address as the country; addresses ending in
# "Kingdom" are relabelled as United_Kingdom
last_word = hotels_df['hotel_address'].str.split(' ').str[-1]
hotels_df['country'] = last_word.where(last_word != 'Kingdom', 'United_Kingdom')

In [None]:
# Keep only the leading number of values such as "531 day" and store it as an integer
hotels_df['days_since_review'] = (
    hotels_df['days_since_review'].str.split(' ').str[0].astype('int64')
)

In [None]:
# Month number (1-12) of the review, taken with the vectorised .dt accessor
# instead of a per-row Python lambda
hotels_df['review_month'] = hotels_df['review_date'].dt.month

In [None]:
# Fill NaNs in the lat and lng columns
# Import time library
import time
# Import OpenCageGeocode to fill nulls in lat and lng
from opencage.geocoder import OpenCageGeocode

key = secret['API_KEY_OPENCAGE']
geocoder = OpenCageGeocode(key)
# Look up latitude and longitude for every distinct address with a missing lat
for address in hotels_df[hotels_df['lat'].isnull()]['hotel_address'].unique():
    results = geocoder.geocode(address)
    # 1 second pause because the API accepts 1 request per second on the free subscription
    time.sleep(1)
    if not results:
        # No match: leave the coordinates missing instead of crashing on results[0]
        print(f'No geocoding result for address: {address}')
        continue
    geometry = results[0]['geometry']
    # Build the row mask once and reuse it for both coordinate columns
    address_rows = hotels_df['hotel_address'] == address
    hotels_df.loc[address_rows, 'lat'] = float(geometry['lat'])
    hotels_df.loc[address_rows, 'lng'] = float(geometry['lng'])

In [None]:
# Lower-case both review texts so the keyword checks below ignore case.
# negative_review has to be normalised as well: its raw "No Negative" marker and
# any capitalised keyword would otherwise never match the lower-case literals
hotels_df['positive_review'] = hotels_df['positive_review'].str.lower()
hotels_df['negative_review'] = hotels_df['negative_review'].str.lower()
# Get nothing positive
def get_positive_nothing(review):
    """Flag a positive-review text that states there was nothing positive.

    The text is lower-cased before comparison, so the raw 'No Positive'
    marker is recognised even if the caller has not normalised the case.
    The comparison is otherwise exact, including the surrounding spaces.

    Args:
        review: the positive review text.

    Returns:
        1 when the text is one of the "nothing" markers, otherwise 0.
    """
    nothing_markers = ('no positive', ' nothing ', ' n a', ' nothing', ' ')
    return int(review.lower() in nothing_markers)
    
# 1 where the positive review is just a "nothing" marker, else 0
hotels_df['positive_nothing'] = hotels_df['positive_review'].map(get_positive_nothing)

# Get nothing negative
def get_negative_nothing(review):
    """Flag a negative-review text that states there was nothing negative.

    The text is lower-cased before comparison, so the raw 'No Negative'
    marker is recognised even if the caller has not normalised the case.
    The comparison is otherwise exact, including the surrounding spaces.

    Args:
        review: the negative review text.

    Returns:
        1 when the text is one of the "nothing" markers, otherwise 0.
    """
    nothing_markers = ('no negative', ' nothing ', ' n a', ' nothing', ' ')
    return int(review.lower() in nothing_markers)

# 1 where the negative review is just a "nothing" marker, else 0
hotels_df['negative_nothing'] = hotels_df['negative_review'].map(get_negative_nothing)

# Get location from reviews
def get_review_location(review):
    """Return 1 if the review text contains the substring 'location', else 0."""
    return int('location' in review)
    
# Flag a mention of the location separately for the positive and the negative review
for review_side in ('positive', 'negative'):
    hotels_df[f'{review_side}_location'] = hotels_df[f'{review_side}_review'].map(get_review_location)

# Get everything from reviews
def get_everything_review(review):
    """Return 1 if the review text contains the substring 'everything', else 0."""
    return 1 if 'everything' in review else 0
    
# Flag the word "everything" separately for the positive and the negative review
for review_side in ('positive', 'negative'):
    hotels_df[f'{review_side}_everything'] = hotels_df[f'{review_side}_review'].map(get_everything_review)

# Get breakfast from reviews
def get_breakfast_reviews(review):
    """Return 1 if the review text mentions breakfast, else 0.

    Both 'breakfast' and the misspelling 'brekfast' count as a mention.
    """
    spellings = ('breakfast', 'brekfast')
    return int(any(spelling in review for spelling in spellings))
    
# Flag a breakfast mention separately for the positive and the negative review
for review_side in ('positive', 'negative'):
    hotels_df[f'{review_side}_breakfast'] = hotels_df[f'{review_side}_review'].map(get_breakfast_reviews)

In [None]:
# Drop the text, date and list columns in a single call
hotels_df = hotels_df.drop(columns=[
    'reviewer_nationality',
    'hotel_address', 'review_date', 'negative_review', 'positive_review', 'tags',
])

In [None]:
# One-hot encode country: the column is replaced by one country_<name> indicator column per value
hotels_df = pd.get_dummies(hotels_df, columns=['country'])

In [None]:
import category_encoders as ce

# Binary-encode hotel_name; later cells refer to the resulting columns as
# hotel_name_0 ... hotel_name_10
bin_encoder = ce.BinaryEncoder(cols=['hotel_name'])
hotel_bin = bin_encoder.fit_transform(hotels_df['hotel_name'])

# Swap the original column for its encoded columns, which are appended at the end
hotels_df = pd.concat([hotels_df.drop(columns=['hotel_name']), hotel_bin], axis=1)

In [None]:
# Annotated correlation heatmap over all columns of the prepared frame
corr_all = hotels_df.corr()
fig = plt.figure(figsize=(46, 16))
sns.heatmap(corr_all, cmap='coolwarm', annot=True, linewidths=1, linecolor='black')

In [None]:
# Working copy without lat, lng, 'business trip' and total_number_of_reviews
hotels_1_df = hotels_df.drop(
    columns=['lat', 'lng', 'business trip', 'total_number_of_reviews']
)

In [None]:
# Annotated correlation heatmap of the reduced frame
corr_reduced = hotels_1_df.corr()
fig = plt.figure(figsize=(46, 16))
sns.heatmap(corr_reduced, cmap='coolwarm', annot=True, linewidths=1, linecolor='black')

In [None]:
# Indicator-style feature columns, in order: tag flags, review keyword flags,
# country dummies, and the binary-encoded hotel name columns
cat_cols = [
    'leisure trip', 'submitted from a mobile device', 'couple', 'solo traveler',
    'group', 'family with young children', 'double room', 'standard double room',
    'superior double room', 'family with older children', 'deluxe double room',
    'double or twin room', 'standard double or twin room', 'classic double room',
    'superior double or twin room', '2 rooms', 'standard twin room', 'single room',
    'positive_nothing', 'negative_nothing',
    'positive_location', 'negative_location',
    'positive_everything', 'negative_everything',
    'positive_breakfast', 'negative_breakfast',
    'country_Austria', 'country_France', 'country_Italy',
    'country_Netherlands', 'country_Spain', 'country_United_Kingdom',
] + [f'hotel_name_{bit}' for bit in range(11)]

# Numeric feature columns
num_cols = [
    'additional_number_of_scoring',
    'average_score',
    'review_total_negative_word_counts',
    'review_total_positive_word_counts',
    'total_number_of_reviews_reviewer_has_given',
    'days_since_review',
    'nights_satyed',
]

In [None]:
# Separate the feature columns from the target (reviewer_score)
X = hotels_1_df.drop(['reviewer_score'], axis = 1)
y = hotels_1_df['reviewer_score']

# Load the dedicated splitting utility:
from sklearn.model_selection import train_test_split

# The "train" sets are used to fit the model, the "test" sets to evaluate it.
# 25 % of the original dataset is held out for testing.
# The project-wide random_state is used instead of repeating the literal seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_state)

In [None]:
# Cast the target to integers before it is passed to the feature-scoring calls below
y = y.astype(int)

...
from sklearn.feature_selection import chi2 # chi-squared test

# chi2 returns a pair of arrays; the first one (the test statistics) is kept,
# labelled by feature name and sorted in ascending order for the bar chart
chi2_stats, chi2_pvalues = chi2(X[cat_cols], y)
imp_cat = pd.Series(chi2_stats, index=cat_cols).sort_values()
fig = plt.figure(figsize=(16,10))
imp_cat.plot(kind = 'barh')

In [None]:
from sklearn.feature_selection import f_classif # anova

# f_classif returns a pair of arrays; the first one (the F statistics) is kept,
# labelled by feature name and sorted in ascending order for the bar chart
f_stats, f_pvalues = f_classif(X[num_cols], y)
imp_num = pd.Series(f_stats, index=num_cols).sort_values()
imp_num.plot(kind = 'barh')

In [None]:
# Reduced feature set. The drop list is grouped by kind of column, and the
# label 'family with older children', which was listed twice, now appears once
hotels_2_df = hotels_1_df.drop(columns=[
    # numeric columns
    'days_since_review', 'nights_satyed',
    # binary-encoded hotel name columns
    'hotel_name_0', 'hotel_name_2', 'hotel_name_3', 'hotel_name_4', 'hotel_name_5',
    'hotel_name_6', 'hotel_name_8', 'hotel_name_9', 'hotel_name_10',
    # tag indicator columns
    'deluxe double room', 'double or twin room', '2 rooms', 'family with older children',
    'group', 'superior double room', 'classic double room', 'standard double or twin room',
    'double room', 'family with young children',
    # country indicator columns
    'country_France', 'country_Italy', 'country_Netherlands',
])

In [None]:
# hotels_2_df['spring'] = hotels_2_df['review_month'].apply(lambda x: 1 if 3 <= x <= 5 else 0)
# hotels_2_df['summer'] = hotels_2_df['review_month'].apply(lambda x: 1 if 6 <= x <= 8 else 0)
# hotels_2_df['autumn'] = hotels_2_df['review_month'].apply(lambda x: 1 if 9 <= x <= 11 else 0)
# hotels_2_df['winter'] = hotels_2_df['review_month'].apply(lambda x: 1 if ((x == 12) or (1 <= x <= 2)) else 0)

In [None]:
# hotels_2_df = hotels_2_df.drop(columns=['review_month'])

In [None]:
# Show the (rows, columns) dimensions of the reduced frame
hotels_2_df.shape

In [None]:
# Separate the feature columns from the target (reviewer_score)
X = hotels_2_df.drop(['reviewer_score'], axis = 1)
y = hotels_2_df['reviewer_score']

# Load the dedicated splitting utility:
from sklearn.model_selection import train_test_split

# The "train" sets are used to fit the model, the "test" sets to evaluate it.
# 25 % of the original dataset is held out for testing.
# The project-wide random_state is used instead of repeating the literal seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_state)

In [None]:
from sklearn.ensemble import RandomForestRegressor # tool for creating and training the model
from sklearn import metrics # tools for evaluating model accuracy

# Create the model. It is seeded with the project-wide random_state so that
# repeated runs build the same forest and therefore report the same metric
regr = RandomForestRegressor(n_estimators=100, random_state=random_state)

# Train the model on the training set
regr.fit(X_train, y_train)

# Use the trained model to predict the reviewer score for the test set.
# The predicted values are stored in y_pred
y_pred = regr.predict(X_test)

In [None]:
# Mean absolute percentage error of the predictions on the held-out test set.
# NOTE(review): scikit-learn reports this value as a fraction (0.12 means 12 %) — confirm how it should be presented
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_pred))

In [None]:
# NOTE(review): this cell repeats the MAPE computation of the preceding cell with the same inputs
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_pred))

In [None]:
# Print the column names, dtypes and non-null counts of hotels_1_df
hotels_1_df.info()