# Проект №3: Exploratory Data Analysis and Feature Engineering #

In [19]:
import os

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Needed for Jupyter Notebook to work correctly (renders figures inline)
%matplotlib inline
# Use the default matplotlib style so plots display correctly in a dark theme
plt.style.use('default')

# Directory the input CSV files are read from
DATA_PATH = 'data/'
# Alternative input location, disabled (looks like a Kaggle dataset path):
# DATA_PATH = '/kaggle/input/sf-booking/'
# Prefix prepended to the submission file name when it is written
OUTPUT_PATH = 'output/'
# Alternative, disabled: write into the current working directory
# OUTPUT_PATH = ''
# Seed passed to train_test_split and RandomForestRegressor for reproducibility
RANDOM_STATE = 42

## Сбор и предварительный анализ данных (Data Collection) ##

In [52]:
# Read the training set, the test set and the submission template from DATA_PATH
train_ini, test_ini, submission_df = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ('hotels_train.csv', 'hotels_test.csv', 'submission.csv')
)

In [62]:
# From here on, work on copies so the frames as loaded stay untouched
train_df, test_df = train_ini.copy(), test_ini.copy()

In [22]:
# Preview the first three rows of the training frame
train_df.head(3)

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,days_since_review,lat,lng
0,Stratton Street Mayfair Westminster Borough Lo...,581,2/19/2016,8.4,The May Fair Hotel,United Kingdom,Leaving,3,1994,Staff were amazing,4,7,10.0,"[' Leisure trip ', ' Couple ', ' Studio Suite ...",531 day,51.507894,-0.143671
1,130 134 Southampton Row Camden London WC1B 5AF...,299,1/12/2017,8.3,Mercure London Bloomsbury Hotel,United Kingdom,poor breakfast,3,1361,location,2,14,6.3,"[' Business trip ', ' Couple ', ' Standard Dou...",203 day,51.521009,-0.123097
2,151 bis Rue de Rennes 6th arr 75006 Paris France,32,10/18/2016,8.9,Legend Saint Germain by Elegancia,China,No kettle in room,6,406,No Positive,0,14,7.5,"[' Leisure trip ', ' Solo traveler ', ' Modern...",289 day,48.845377,2.325643


In [36]:
# Column names, non-null counts and dtypes of the training frame
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386803 entries, 0 to 386802
Data columns (total 17 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   hotel_address                               386803 non-null  object 
 1   additional_number_of_scoring                386803 non-null  int64  
 2   review_date                                 386803 non-null  object 
 3   average_score                               386803 non-null  float64
 4   hotel_name                                  386803 non-null  object 
 5   reviewer_nationality                        386803 non-null  object 
 6   negative_review                             386803 non-null  object 
 7   review_total_negative_word_counts           386803 non-null  int64  
 8   total_number_of_reviews                     386803 non-null  int64  
 9   positive_review                             386803 non-null  object 
 

In [37]:
# Column names, non-null counts and dtypes of the test frame
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128935 entries, 0 to 128934
Data columns (total 16 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   hotel_address                               128935 non-null  object 
 1   additional_number_of_scoring                128935 non-null  int64  
 2   review_date                                 128935 non-null  object 
 3   average_score                               128935 non-null  float64
 4   hotel_name                                  128935 non-null  object 
 5   reviewer_nationality                        128935 non-null  object 
 6   negative_review                             128935 non-null  object 
 7   review_total_negative_word_counts           128935 non-null  int64  
 8   total_number_of_reviews                     128935 non-null  int64  
 9   positive_review                             128935 non-null  object 
 

### Создание признаков ###

Извлечение из **`tags`** числа проведённых ночей и создание числового признака **`stayed_nights`**

In [63]:
def get_stayed_nights(tags_str: str) -> int:
    """Extract the number of nights stayed from a raw 'tags' cell.

    The cell holds the string form of a list of tags, for example
    "[' Leisure trip ', ' Couple ', ' Stayed 3 nights ']". The string is
    split into individual tags, and the first tag of the form
    'Stayed <N> ...' with a numeric <N> determines the result.

    Arguments:
        tags_str [str] -- Contents of a 'tags' cell
    Returns:
        [int] -- Number of nights spent in the hotel; 0 when no usable
                 'Stayed <N> ...' tag is present (this includes an empty
                 string, an empty tag list and a non-numeric <N>)
    """
    tag_list = tags_str.lstrip('[').rstrip(']').split(', ')
    for tag in tag_list:
        # Drop the quotes and padding spaces that surround each tag
        words = tag.strip("' ").split()
        # An empty tag (e.g. from "[]" or "") produces no words; checking the
        # length first avoids the IndexError that words[0] would raise.
        if len(words) >= 2 and words[0] == 'Stayed' and words[1].isdecimal():
            return int(words[1])
    return 0

# Derive the numeric 'stayed_nights' feature from 'tags' in both frames
for reviews_df in (train_df, test_df):
    reviews_df['stayed_nights'] = reviews_df['tags'].apply(get_stayed_nights)

### Преобразование признаков ###

Перевод в числовой вид признака **`days_since_review`**

In [64]:
# Turn values such as '531 day' / '3 days' into integers by pulling out the
# leading run of digits. str.rstrip() takes a set of characters rather than a
# suffix, so the digits are extracted explicitly instead. Casting to str first
# keeps the cell safe to re-run after the column has already been converted.
for reviews_df in (train_df, test_df):
    reviews_df['days_since_review'] = (
        reviews_df['days_since_review']
        .astype(str)
        .str.extract(r'(\d+)', expand=False)
        .astype('int64')
    )

In [65]:
# Parse 'review_date' into datetime values in both frames
for reviews_df in (train_df, test_df):
    reviews_df['review_date'] = pd.to_datetime(reviews_df['review_date'])

### Отбор признаков ###

In [66]:
# Columns removed before modelling: the text and date columns, which this
# notebook does not encode numerically, plus the hotel coordinates.
# 'days_since_review' is deliberately not listed, so it stays in as a feature.
drop_features_list = [
    'hotel_address', 'review_date', 'hotel_name', 'reviewer_nationality',
    'negative_review', 'positive_review', 'tags',
    'lat', 'lng',
]
train_df = train_df.drop(columns=drop_features_list)
test_df = test_df.drop(columns=drop_features_list)

### Обучение модели ###

In [67]:
# Separate the target from the features, then hold out 25% of the rows
# for validation with a fixed seed.
y = train_df['reviewer_score']
X = train_df.drop(columns=['reviewer_score'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_STATE,
)

In [68]:
# Fit a 100-tree random forest on the training split and report the mean
# absolute percentage error on the held-out split.
regressor = RandomForestRegressor(
    n_estimators=100,
    # criterion='absolute_error',
    n_jobs=4,
    random_state=RANDOM_STATE,
)
regressor.fit(X_train, y_train)
y_predictor = regressor.predict(X_test)
mape = metrics.mean_absolute_percentage_error(y_test, y_predictor)
print('MAPE:', mape)

MAPE: 0.13755783975356978


### Формирование предсказаний ###

In [69]:
# Predict scores for the test frame and store them in the submission template
predictions = regressor.predict(test_df)
submission_df['reviewer_score'] = predictions
submission_df.head()

Unnamed: 0,reviewer_score,id
0,8.744,488440
1,7.097,274649
2,8.349,374688
3,9.56,404352
4,9.856,451596


In [70]:
# Check the (rows, columns) size of the submission frame before saving it
submission_df.shape

(128935, 2)

In [71]:
# Make sure the target directory exists before writing: to_csv fails when
# asked to save into a directory that does not exist. An empty OUTPUT_PATH
# means the current working directory, which needs no creation.
if OUTPUT_PATH:
    os.makedirs(OUTPUT_PATH, exist_ok=True)
submission_df.to_csv(OUTPUT_PATH + 'submission.csv', index=False)