# Проект: Exploratory Data Analysis and Feature Engineering #

In [16]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

# Needed for plots to render correctly inside Jupyter Notebook
%matplotlib inline
# Use the default matplotlib style so that plots display correctly in a dark theme
plt.style.use('default')

In [17]:
# Load the three raw inputs in one pass: the training set, the test set and
# the submission template. The *_ini frames are kept untouched so that later
# cells can always derive their data from the originals.
train_ini, test_ini, submission_ini = map(
    pd.read_csv,
    (
        'data/hotels_train.csv',
        'data/hotels_test.csv',
        'data/submission.csv',
    ),
)

In [18]:
# Column overview of the training set: non-null counts and dtypes per column.
train_ini.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386803 entries, 0 to 386802
Data columns (total 17 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   hotel_address                               386803 non-null  object 
 1   additional_number_of_scoring                386803 non-null  int64  
 2   review_date                                 386803 non-null  object 
 3   average_score                               386803 non-null  float64
 4   hotel_name                                  386803 non-null  object 
 5   reviewer_nationality                        386803 non-null  object 
 6   negative_review                             386803 non-null  object 
 7   review_total_negative_word_counts           386803 non-null  int64  
 8   total_number_of_reviews                     386803 non-null  int64  
 9   positive_review                             386803 non-null  object 
 

In [19]:
# Same overview for the test set, to compare its columns with the training set.
test_ini.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128935 entries, 0 to 128934
Data columns (total 16 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   hotel_address                               128935 non-null  object 
 1   additional_number_of_scoring                128935 non-null  int64  
 2   review_date                                 128935 non-null  object 
 3   average_score                               128935 non-null  float64
 4   hotel_name                                  128935 non-null  object 
 5   reviewer_nationality                        128935 non-null  object 
 6   negative_review                             128935 non-null  object 
 7   review_total_negative_word_counts           128935 non-null  int64  
 8   total_number_of_reviews                     128935 non-null  int64  
 9   positive_review                             128935 non-null  object 
 

In [20]:
# Overview of the submission template (columns: reviewer_score and id).
submission_ini.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128935 entries, 0 to 128934
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   reviewer_score  128935 non-null  int64
 1   id              128935 non-null  int64
dtypes: int64(2)
memory usage: 2.0 MB


In [24]:
# Baseline feature matrix: whatever remains of the training frame once the
# target and the columns listed below are removed.
X = train_ini.drop(
    [
        # Target variable: must not be part of the features.
        'reviewer_score',
        # Reported as object dtype by train_ini.info(); not fed to the model as-is.
        'hotel_address',
        'review_date',
        'hotel_name',
        'reviewer_nationality',
        'negative_review',
        'positive_review',
        # NOTE(review): the dtypes of these columns are not visible in the
        # info() output above; presumably they need preprocessing before
        # they can be used as features. Confirm.
        'tags',
        'days_since_review',
        'lat',
        'lng',
    ],
    axis='columns',
)
# Regression target.
y = train_ini['reviewer_score']

In [25]:
# Hold out 25% of the labelled data for validation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [26]:
# Baseline model: a 100-tree random forest scored on the hold-out split.
# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, the MAPE printed below and the final submission are reproducible on
# Restart & Run All; without it every run produces a different model.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train)
y_predictor = regressor.predict(X_test)
# scikit-learn returns MAPE as a fraction, not a percentage.
# Plain string literal: the previous f-string had no placeholders.
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_predictor))

MAPE: 0.14137823019247045


In [27]:
# Number of hold-out predictions (one per row of X_test).
y_predictor.shape

(96701,)

In [28]:
# Shape of the submission template, for comparison with the prediction arrays.
submission_ini.shape

(128935, 2)

In [30]:
# Build the submission features by selecting exactly the columns the model was
# trained on (the columns of X), in the same order. This replaces a second,
# hand-maintained drop list that could silently drift from the training one;
# a training feature missing from the test file now fails here with a KeyError.
X_submit = test_ini[X.columns]
# Predict with the forest fitted on the training split.
y_submit = regressor.predict(X_submit)

In [31]:
# Should contain one prediction per row of the test set / submission template.
y_submit.shape

(128935,)

In [32]:
# Derive the final submission from the template without mutating it:
# assign() returns a new frame in which the placeholder reviewer_score column
# is replaced by the model predictions, while id stays as provided.
# NOTE(review): this relies on test_ini rows being in the same order as the
# template's id column. Confirm against the data description.
submission_fin = submission_ini.assign(reviewer_score=y_submit)
submission_fin.head()

Unnamed: 0,reviewer_score,id
0,8.525,488440
1,7.558,274649
2,8.279,374688
3,9.697,404352
4,9.50393,451596


In [33]:
# Write only the data columns. Without index=False pandas prepends the
# RangeIndex as an extra unnamed column, so the file would no longer match the
# two-column (reviewer_score, id) layout of the submission template.
# The 'output' directory must already exist; to_csv does not create it.
submission_fin.to_csv('output/submission.csv', index=False)