# Загрузка Pandas и очистка данных

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

# Загружаем специальный удобный инструмент для разделения датасета:
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('main_task.csv')

In [3]:
df.head(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,id_1535,Stockholm,,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781
3,id_3456,Berlin,,3458.0,5.0,,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price Range        26114 non-null  object 
 6   Number of Reviews  37457 non-null  float64
 7   Reviews            40000 non-null  object 
 8   URL_TA             40000 non-null  object 
 9   ID_TA              40000 non-null  object 
dtypes: float64(3), object(7)
memory usage: 3.1+ MB


In [5]:
# Для примера я возьму столбец Number of Reviews
df['Number_of_Reviews_isNAN'] = pd.isna(df['Number of Reviews']).astype('uint8')

In [6]:
df['Number_of_Reviews_isNAN']

0        0
1        0
2        0
3        0
4        0
        ..
39995    0
39996    0
39997    0
39998    0
39999    0
Name: Number_of_Reviews_isNAN, Length: 40000, dtype: uint8

In [7]:
# Далее заполняем пропуски 0, вы можете попробовать заполнением средним или средним по городу и тд...
df['Number of Reviews'].fillna(0, inplace=True)

df['Price Range'].fillna(df['Price Range'].mode()[0], inplace=True)
df['Cuisine Style'] = df['Cuisine Style'].fillna("['Other']")

In [8]:
df.nunique(dropna=False)

Restaurant_id              11909
City                          31
Cuisine Style               9008
Ranking                    11936
Rating                         9
Price Range                    3
Number of Reviews           1460
Reviews                    33516
URL_TA                     39980
ID_TA                      39980
Number_of_Reviews_isNAN        2
dtype: int64

In [9]:
# для One-Hot Encoding в pandas есть готовая функция - get_dummies. Особенно радует параметр dummy_na
df = pd.get_dummies(df, columns=[ 'City',], dummy_na=True)

In [10]:
df.head(5)

Unnamed: 0,Restaurant_id,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Number_of_Reviews_isNAN,...,City_Oporto,City_Oslo,City_Paris,City_Prague,City_Rome,City_Stockholm,City_Vienna,City_Warsaw,City_Zurich,City_nan
0,id_5569,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643,0,...,0,0,1,0,0,0,0,0,0,0
1,id_1535,['Other'],1537.0,4.0,$$ - $$$,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032,0,...,0,0,0,0,0,1,0,0,0,0
2,id_352,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781,0,...,0,0,0,0,0,0,0,0,0,0
3,id_3456,['Other'],3458.0,5.0,$$ - $$$,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776,0,...,0,0,0,0,0,0,0,0,0,0
4,id_615,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# обработки признака Price Range
df['Price Range'].value_counts()

$$ - $$$    32298
$            6279
$$$$         1423
Name: Price Range, dtype: int64

In [12]:
# Присваиваем числовой формат категориям
prices = { '$$$$':3,
            '$$ - $$$':2, 
            '$':1}
df['Price Range'] = df['Price Range'].replace(to_replace = prices)

In [13]:
df['Price Range']

0        2
1        2
2        3
3        2
4        2
        ..
39995    2
39996    2
39997    2
39998    2
39999    2
Name: Price Range, Length: 40000, dtype: int64

In [14]:
# обработка признака по отзывам
df['Reviews'][1]

"[['Unique cuisine', 'Delicious Nepalese food'], ['07/06/2017', '06/19/2016']]"

In [15]:
# находим пропущенные
df['Reviews'].isna().sum()

0

In [16]:
# паттерн для поиска дат
pattern = re.compile('\'\d+\/\d+\/\d+\'')
dates = df['Reviews'].apply(pattern.findall)
# врeменные признаки
df['date1'] = pd.to_datetime(dates.apply(lambda x: x[0] if len(x) > 0 else None))
df['date2'] = pd.to_datetime(dates.apply(lambda x: x[1] if len(x) > 1 else None))

display(df.loc[:, ['date1', 'date2']].sample(5))

Unnamed: 0,date1,date2
10111,2017-07-27,2017-07-18
29237,2017-11-25,2017-10-26
8909,2017-08-10,2017-06-21
5405,2017-08-19,2017-04-18
14864,2017-12-30,2017-11-18


In [17]:
# Создаем столбец с количеством дней между отзывами и заполняем
df['Between review'] = df['date1'] - df['date2']
df['Between review'] = df['Between review'].apply(lambda x: abs(x.days))

# заполняем пропуски
df['Between review'].fillna(df['Between review'].median(), inplace=True)

In [18]:
# Создаем столбец с количеством дней после последнего отзыва и заполняем
from datetime import date
df['Last review'] = datetime.today() - df['date1']
df['Last review'] = df['Last review'].apply(lambda x: x.days)

df['Last review'].fillna(df['Last review'].median(), inplace=True)

In [19]:
# check
display(df)

Unnamed: 0,Restaurant_id,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Number_of_Reviews_isNAN,...,City_Rome,City_Stockholm,City_Vienna,City_Warsaw,City_Zurich,City_nan,date1,date2,Between review,Last review
0,id_5569,"['European', 'French', 'International']",5570.0,3.5,2,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643,0,...,0,0,0,0,0,0,2017-12-31,2017-11-20,41.0,1208.0
1,id_1535,['Other'],1537.0,4.0,2,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032,0,...,0,1,0,0,0,0,2017-07-06,2016-06-19,382.0,1386.0
2,id_352,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,3,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781,0,...,0,0,0,0,0,0,2018-01-08,2018-01-06,2.0,1200.0
3,id_3456,['Other'],3458.0,5.0,2,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776,0,...,0,0,0,0,0,0,NaT,NaT,67.0,1318.0
4,id_615,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,2,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963,0,...,0,0,0,0,0,0,2017-11-18,2017-02-19,272.0,1251.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,id_499,"['Italian', 'Vegetarian Friendly', 'Vegan Opti...",500.0,4.5,2,79.0,"[['The real Italian experience!', 'Wonderful f...",/Restaurant_Review-g187849-d2104414-Reviews-Ro...,d2104414,0,...,0,0,0,0,0,0,2017-12-16,2017-11-12,34.0,1223.0
39996,id_6340,"['French', 'American', 'Bar', 'European', 'Veg...",6341.0,3.5,2,542.0,"[['Parisian atmosphere', 'Bit pricey but inter...",/Restaurant_Review-g187147-d1800036-Reviews-La...,d1800036,0,...,0,0,0,0,0,0,2017-12-21,2017-12-12,9.0,1218.0
39997,id_1649,"['Japanese', 'Sushi']",1652.0,4.5,2,4.0,"[['Good by swedish standards', 'A hidden jewel...",/Restaurant_Review-g189852-d947615-Reviews-Sus...,d947615,0,...,0,1,0,0,0,0,2016-11-03,2008-04-12,3127.0,1631.0
39998,id_640,"['Polish', 'European', 'Eastern European', 'Ce...",641.0,4.0,2,70.0,"[['Underground restaurant', 'Oldest Restaurant...",/Restaurant_Review-g274856-d1100838-Reviews-Ho...,d1100838,0,...,0,0,0,1,0,0,2017-07-11,2017-06-18,23.0,1381.0


# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели

In [20]:
# Х - данные с информацией о ресторанах, у - целевая переменная (рейтинги ресторанов)
X = df.drop(['Restaurant_id','Rating','Cuisine Style'], axis = 1)
y = df['Rating'].values

# Загружаем специальный инструмент для разбивки:
from sklearn.model_selection import train_test_split

# Наборы данных с меткой "train" будут использоваться для обучения модели, "test" - для тестирования.
# Для тестирования мы будем использовать 25% от исходного датасета.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Создаём, обучаем и тестируем модель

In [21]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

# Создаём модель
regr = RandomForestRegressor(n_estimators=100)

# Обучаем модель на тестовом наборе данных
regr.fit(X_train, y_train)

# Используем обученную модель для предсказания рейтинга ресторанов в тестовой выборке.
# Предсказанные значения записываем в переменную y_pred
y_pred = regr.predict(X_test)

# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они в среднем отличаются
# Метрика называется Mean Absolute Error (MAE) и показывает среднее отклонение предсказанных значений от фактических.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

ValueError: could not convert string to float: "[['Chills and Breath taking'], ['01/10/2016']]"