# Модель предсказания рейтингов европейских ресторанов

## 0. Подключение библиотек для работы с данными

In [None]:
!pip install validators pandarallel

In [None]:
import math
import numbers
import datetime as dt
import regex
import validators

import numpy as np
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel
import multiprocessing

import urllib3
import requests

import matplotlib.pyplot as plt

In [None]:
!pip freeze > requirements.txt

### 0.1 Оформление

In [None]:
# Notebook-wide pandas display configuration.
pd.set_option('display.encoding', "UTF-8")
# None removes the limit on the number of rows / columns shown.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Options of the HTML representation of DataFrames.
pd.set_option('display.html.border', 2)
pd.set_option('display.html.table_schema', True)

In [None]:
from IPython.core.display import display, HTML


def css_border(x):
    """Return one left-border CSS declaration per element of ``x``.

    Every element receives the same rule, so its position in ``x`` does not
    influence the result.
    """
    border_rule = "border-left: 2px solid black"
    return [border_rule for _ in x]


def pd_html(df: pd.DataFrame):
    """Render ``df`` as an IPython ``HTML`` object with bordered cells.

    The frame is styled row by row with ``css_border``. ``Styler.render()``
    was removed in pandas 2.0, so ``Styler.to_html()`` is used whenever the
    installed pandas provides it; older versions fall back to ``render()``.
    """
    styler = df.style.apply(css_border, axis=1)
    to_html = getattr(styler, "to_html", None)
    markup = to_html() if callable(to_html) else styler.render()
    return HTML(markup)


def display_pd_html(df: pd.DataFrame):
    """Show ``df`` in the notebook output using the ``pd_html`` rendering."""
    rendered_table = pd_html(df)
    return display(rendered_table)

### 0.2 Инициализация

In [None]:
# Enable the tqdm integration for pandas objects.
tqdm.pandas()
# Initialise pandarallel with its progress bar switched off.
pandarallel.initialize(progress_bar=False)

# Report the CPU count returned by multiprocessing for this machine.
CPU_COUNT = multiprocessing.cpu_count()
print(f"Number of CPUs: {CPU_COUNT}")

### 0.3 Логирование событий
Фрагмент кода ниже позаимствован по ссылке: [Colour Logging - Works in Jupyter Lab/Notebook](https://gist.github.com/joshbode/58fac7ababc700f51e2a9ecdebe563ad)

In [None]:
import sys
import logging
from typing import Optional, Dict

from colorama import Fore, Back, Style


class ColoredFormatter(logging.Formatter):
    """Log formatter that exposes per-level colour codes to the format string."""

    def __init__(self, *args, colors: Optional[Dict[str, str]]=None, **kwargs) -> None:
        """Set up the base formatter and remember the level-to-colour mapping.

        ``colors`` maps a level name (for example ``"INFO"``) to the text made
        available to the format string as ``color``. A missing or empty
        mapping results in no colouring.
        """
        super().__init__(*args, **kwargs)
        self.colors = colors or {}

    def format(self, record) -> str:
        """Attach the ``color`` and ``reset`` fields to ``record`` and format it."""
        record.reset = Style.RESET_ALL
        record.color = self.colors.get(record.levelname, '')
        return super().format(record)


# Formatter used by the notebook logger: timestamp, coloured level name,
# logger name and message. The colour of the level is looked up by level name.
formatter = ColoredFormatter(
    '{asctime} |{color} {levelname:8} {reset}| {name} | {message}',
    style='{', datefmt='%Y-%m-%d %H:%M:%S',
    colors={
        'DEBUG': Fore.CYAN,
        'INFO': Fore.GREEN,
        'WARNING': Fore.YELLOW,
        'ERROR': Fore.RED,
        'CRITICAL': Fore.RED + Back.WHITE + Style.BRIGHT,
    }
)

# Log records are written to standard output.
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)

# Configure the ROOT logger: drop every handler installed so far and leave
# only the coloured stdout handler, with DEBUG as the threshold.
LOGGER = logging.getLogger()
LOGGER.handlers[:] = []
LOGGER.addHandler(handler)
LOGGER.setLevel(logging.DEBUG)

# Smoke test of the configuration above.
LOGGER.info("This is a test log message.")

## 1. Загрузка сырых данных

### 1.1 Загрузка данных о рейтингах ресторанов

In [None]:
train_data = pd.read_csv('/kaggle/input/sf-dst-restaurant-rating/main_task.csv')

In [None]:
display_pd_html(train_data.sample(5))

### 1.2 Загрузка тестовых данных и данных для kaggle submission

In [None]:
test_data = pd.read_csv('/kaggle/input/sf-dst-restaurant-rating/kaggle_task.csv')
sample_submission = pd.read_csv('/kaggle/input/sf-dst-restaurant-rating/sample_submission.csv')

### 1.3 Разметка данных для обучения и для теста

In [None]:
# Mark the origin of every row: 1 = rows loaded from the training file,
# 0 = rows loaded from the Kaggle test file.
train_data["sample"] = 1
test_data["sample"] = 0
# Placeholder target for the test rows (presumably the test file has no
# "Rating" column — verify against the input data).
test_data["Rating"] = 0

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat produces the same stacked frame on every pandas version.
data = pd.concat([test_data, train_data], sort=False).reset_index(drop=True)

In [None]:
# The `null_counts` keyword was renamed to `show_counts` (pandas 1.2) and
# removed in pandas 2.0. Try the current name first and fall back to the old
# one so the cell works on both old and new pandas versions.
try:
    data.info(verbose=True, show_counts=True)
except TypeError:
    data.info(verbose=True, null_counts=True)

In [None]:
display_pd_html(data.sample(5))

### 1.4 Загрузка данных о городах мира

In [None]:
raw_world_cities = pd.read_csv('/kaggle/input/world-cities/worldcities.csv')

In [None]:
raw_world_cities2 = pd.read_csv('/kaggle/input/world-cities-database/worldcitiespop.csv', low_memory=False)

## 2. Функции для преобразования записей в колонках

### 2.1 Преобразование строки с оценкой интервала цены чека за заказ в ресторане

#### 2.1.1 Регулярные выражения

In [None]:
# Raw strings keep the regex escapes (\$, \s) intact without relying on Python
# passing unknown escape sequences through, which is deprecated behaviour.
# The resulting string values are identical to the previous ones.
# A range such as "$$ - $$$": one to four dollar signs on each side of a dash.
STRING_REGEX_PRICE_RANGE = r"\${1,4}\s*-\s*\${1,4}"
# Left / right part of such a range (lookahead / lookbehind on the dash).
STRING_REGEX_PRICE_LEFT_RANGE_VALUE = r"^\${1,4}(?=\s*-)"
STRING_REGEX_PRICE_RIGHT_RANGE_VALUE = r"(?<=-\s*)\${1,4}$"

# A single price level: one to four dollar signs.
STRING_REGEX_PRICE_SINGLE = r"\${1,4}"

# Compiled once with the third-party `regex` module. The right-range pattern
# contains a variable-width lookbehind, which the standard `re` module rejects.
REGEX_PRICE_RANGE = regex.compile(STRING_REGEX_PRICE_RANGE)
REGEX_PRICE_LEFT_RANGE_VALUE = regex.compile(STRING_REGEX_PRICE_LEFT_RANGE_VALUE)
REGEX_PRICE_RIGHT_RANGE_VALUE = regex.compile(STRING_REGEX_PRICE_RIGHT_RANGE_VALUE)
REGEX_PRICE_SINGLE = regex.compile(STRING_REGEX_PRICE_SINGLE)

#### 2.1.2 Функции/словари для преобразования значения ячейки оценки ценника в ресторане

In [None]:
# Maps the textual "Price Range" values to an ordinal price level (1..3).
# The missing-value markers (None and both NaN objects) all map to NaN.
# Only the three spellings listed here are known; any other string is not a key.
DICTIONARY_PRICE_RANGE_TO_INTEGER = {
    None:         np.nan,
    math.nan:     np.nan,
    np.nan:       np.nan,
    "$":          1,
    "$$ - $$$":   2,
    "$$$$":       3
}

### 2.2 Преобразование строковых данных в массивы/списки

#### 2.2.1 Регулярные выражения

In [None]:
# Building blocks of the list-parsing patterns (all compiled with `regex`).
# Item separator: a comma or a semicolon.
STRING_REGEX_SEPARATOR = "[,;]"
# Same as above, plus the closing square bracket that ends a list.
STRING_REGEX_SEPARATOR_SQUARE = "[,;\]]"
# A single or a double quote character.
STRING_REGEX_QUOTES = """['"]"""
# Body of a character class (it is always embedded in [...]) with the
# characters allowed inside a list item; quotes are deliberately not included.
# NOTE(review): "p{Enclosing_Mark}" has no leading backslash, so inside the
# class it contributes the literal characters p, {, }, _ and the letters of
# "Enclosing_Mark" instead of a Unicode property. Adding the backslash would
# stop "{" and "}" from being accepted — confirm against the data before fixing.
STRING_REGEX_SYMBOLS = """\p{IsLatin}\p{Other_Letter}\p{Mark}\p{Digit}\p{Currency_Symbol}\p{Math_Symbol}\p{Initial_Punctuation}\p{Final_Punctuation}\p{Dash_Punctuation}\p{Connector_Punctuation}\p{Other_Symbol}\p{Separator}\p{Space_Separator}\p{Line_Separator}p{Enclosing_Mark}&\.!\?\\\/\s"""
# The same character set extended with both quote characters.
STRING_REGEX_COMMON_SYMBOLS = f"""{STRING_REGEX_SYMBOLS}\"\'"""

# A (possibly empty) run of allowed characters, and the same run in quotes.
STRING_REGEX_LIST_ITEM = f"""[{STRING_REGEX_COMMON_SYMBOLS}]*"""
STRING_REGEX_LIST_ITEM_QUOTED = f"""{STRING_REGEX_QUOTES}{STRING_REGEX_LIST_ITEM}{STRING_REGEX_QUOTES}"""
# "[ 'a', 'b' ]": quoted items separated by , or ; inside square brackets.
STRING_REGEX_LIST_COMMA_WITH_BRACKETS_SQUARE = f"""\s*\[\s*({STRING_REGEX_LIST_ITEM_QUOTED}\s*{STRING_REGEX_SEPARATOR}\s*)*({STRING_REGEX_LIST_ITEM_QUOTED})?\s*\]\s*"""
# "[ [...], [...] ]": bracketed lists nested in an outer pair of brackets.
STRING_REGEX_LIST_OF_LISTS_COMMA_WITH_BRACKETS_SQUARE = f"""\s*\[\s*({STRING_REGEX_LIST_COMMA_WITH_BRACKETS_SQUARE}\s*{STRING_REGEX_SEPARATOR}\s*)*({STRING_REGEX_LIST_COMMA_WITH_BRACKETS_SQUARE})?\s*\]\s*"""

# Validation pattern for a flat list and the pattern that extracts its items:
# an item is the text between a quote and a quote followed by , ; or ].
REGEX_LIST_COMMA_WITH_BRACKETS_SQUARE = regex.compile(STRING_REGEX_LIST_COMMA_WITH_BRACKETS_SQUARE)
REGEX_LIST_ITEM_COMMA_WITH_BRACKETS_SQUARE = regex.compile(f"""(?<={STRING_REGEX_QUOTES}){STRING_REGEX_LIST_ITEM}(?={STRING_REGEX_QUOTES}\s*{STRING_REGEX_SEPARATOR_SQUARE})""")
# Validation pattern for a list of lists and the pattern that extracts each
# inner "[...]" chunk.
REGEX_LIST_OF_LISTS_COMMA_WITH_BRACKETS_SQUARE = regex.compile(STRING_REGEX_LIST_OF_LISTS_COMMA_WITH_BRACKETS_SQUARE)
REGEX_LIST_ITEM_OF_LISTS_COMMA_WITH_BRACKETS_SQUARE = regex.compile(f"""\[[{STRING_REGEX_COMMON_SYMBOLS},;]*\]""")

# Additional regular expressions to clean the values up
# A separator with allowed (non-quote) characters directly on both sides,
# i.e. presumably a comma/semicolon that belongs to the item text itself.
STRING_REGEX_SEPARATOR_INSIDE = f"""(?<=[{STRING_REGEX_SYMBOLS}]+){STRING_REGEX_SEPARATOR}(?=[{STRING_REGEX_SYMBOLS}]+)"""
# Two or more identical quote characters in a row.
STRING_REGEX_QUOTES_DOUBLED = f"""\"{{2,}}|\'{{2,}}"""
# Quotes and whitespace at the very beginning or end of a value.
STRING_REGEX_QUOTES_SPACE_SIDE = f"""(^['"\s]+|['"\s]+$)"""
REGEX_SEPARATOR_INSIDE = regex.compile(STRING_REGEX_SEPARATOR_INSIDE)
REGEX_QUOTES_DOUBLED = regex.compile(STRING_REGEX_QUOTES_DOUBLED)
REGEX_QUOTES_SPACE_SIDE = regex.compile(STRING_REGEX_QUOTES_SPACE_SIDE)

#### 2.2.2 Функции для преобразования значений строковых ячеек со списками или списками списков в соответствующие объекты списков в оперативной памяти

In [None]:
def string_to_list_of_strings(string_list):
    """Parse the text of a square-bracketed list into a list of strings.

    A value that already is a list is returned unchanged. An empty string,
    a NaN number and a list without non-blank items all give ``None``.

    Raises:
        Exception: if the value is a string that is not a well-formed list,
            a number other than NaN, or a value of any other type.
    """
    if isinstance(string_list, list):
        return string_list

    if not isinstance(string_list, str):
        if isinstance(string_list, numbers.Number) and np.isnan(string_list):
            return None
        raise Exception(f"Wrong string list: \"{string_list}\"")

    if not string_list:
        return None
    if not REGEX_LIST_COMMA_WITH_BRACKETS_SQUARE.match(string_list):
        raise Exception(f"Wrong string list: \"{string_list}\"")

    # Clean-up before item extraction: drop separators embedded in item
    # text, then collapse repeated quotes into a single double quote.
    without_inner_separators = regex.sub(REGEX_SEPARATOR_INSIDE, "", string_list)
    single_quoted = regex.sub(REGEX_QUOTES_DOUBLED, "\"", without_inner_separators)

    items = [
        regex.sub(REGEX_QUOTES_SPACE_SIDE, "", raw_item)
        for raw_item in REGEX_LIST_ITEM_COMMA_WITH_BRACKETS_SQUARE.findall(single_quoted)
        if raw_item.strip()
    ]
    return items or None


def string_to_list_of_lists(string_list):
    """Parse the text of a bracketed list of lists into nested Python lists.

    Every inner ``[...]`` chunk is converted with
    ``string_to_list_of_strings``. An empty string, a NaN number, a string
    that does not look like a list of lists and a string without inner lists
    all give ``None``.

    Raises:
        Exception: if the value is a number other than NaN or is neither a
            string nor a number.
    """
    if not isinstance(string_list, str):
        if isinstance(string_list, numbers.Number) and np.isnan(string_list):
            return None
        raise Exception(f"Wrong string list: \"{string_list}\"")

    if not string_list:
        return None
    if not REGEX_LIST_OF_LISTS_COMMA_WITH_BRACKETS_SQUARE.match(string_list):
        # A malformed string is not reported as an error here.
        return None

    inner_lists = [
        string_to_list_of_strings(inner_text)
        for inner_text in REGEX_LIST_ITEM_OF_LISTS_COMMA_WITH_BRACKETS_SQUARE.findall(string_list)
    ]
    return inner_lists or None


def interpret_list_of_lists_as_comments(list_of_lists: list):
    """Combine parsed review texts and review dates into ``(text, date)`` pairs.

    Args:
        list_of_lists: ``[texts, dates]`` where ``dates`` holds strings in the
            ``MM/DD/YYYY`` format. Either inner list may be empty or ``None``.

    Returns:
        A list of ``(text, datetime)`` tuples, or ``None`` when there is
        nothing to return. If only one of the two lists has values, the
        missing half of every tuple is ``None``.

    Raises:
        AssertionError: if the outer list does not have exactly two elements
            or both inner lists are filled but differ in length.
        ValueError: if a date string does not match the expected format.
    """
    if not list_of_lists:
        return None
    assert len(list_of_lists) == 2, "The list cannot be interpreted as a list of comments: " + str(list_of_lists)
    list_comments, list_dates = list_of_lists

    date_format = "%m/%d/%Y"
    texts = list_comments or []
    raw_dates = list_dates or []
    if texts and raw_dates:
        assert len(texts) == len(raw_dates), (
            f"Got {len(texts)} comment texts but {len(raw_dates)} comment dates: {list_of_lists}"
        )

    dates = [dt.datetime.strptime(raw_date, date_format) for raw_date in raw_dates]

    # Pad the missing half with None so both sequences can be zipped.
    if not dates:
        dates = [None] * len(texts)
    if not texts:
        texts = [None] * len(dates)

    comments = list(zip(texts, dates))
    return comments or None

### 2.3 Функция преобразования относительных URIs ресторанов на сайте TripAdvisor в абсолютные URIs/URLs

In [None]:
URL_WEB_SITE = "https://www.tripadvisor.com"


def relative_uri_to_full_url(uri_relative: str, url_web_site: str):
    """Join a site URL and a relative URI into a parsed absolute URL.

    Args:
        uri_relative: path on the site, with or without a leading slash.
        url_web_site: scheme and host of the site, e.g. ``URL_WEB_SITE``.

    Returns:
        The result of ``urllib3.util.parse_url`` for the combined URL.

    Raises:
        AssertionError: if either argument is empty or ``None``, or if the
            combined string is not accepted by ``validators.url``.
    """
    assert uri_relative, f"Got empty or None relative URI: {uri_relative}"
    assert url_web_site, f"Got empty or None website URL: {url_web_site}"
    path = uri_relative if uri_relative.startswith('/') else '/' + uri_relative
    full_url_string = f"{url_web_site}{path}"
    # The message is an f-string so that the offending URL is actually shown.
    assert validators.url(full_url_string), f"Invalid URL {full_url_string}, check the domain and the relative path."
    return urllib3.util.parse_url(full_url_string)

## 3. Преобразование колонок в исходных сырых данных

### 3.1 Создание копии сырых данных

In [None]:
# pd.DataFrame(data) does not copy the underlying data, so later in-place
# edits of pre_data could leak into the raw frame. copy() creates the
# independent copy this section is meant to produce.
pre_data = data.copy()

### 3.2 Создание/преобразование простых признаков

#### 3.2.1 Преобразование колонки "Price Range"

In [None]:
pre_data.replace({"Price Range": DICTIONARY_PRICE_RANGE_TO_INTEGER}, inplace=True)

In [None]:
display_pd_html(pre_data.sample(5))

### 3.3 Создание/преобразование сложных данных

#### 3.3.1 Преобразование строковой колонки "Cuisine Style" в колонку списка

In [None]:
pre_data["Cuisine Style"] = pre_data["Cuisine Style"].apply(string_to_list_of_strings)

#### 3.3.2 Преобразование строковой колонки "Reviews" в колонку списков со списками

In [None]:
def convert_restaurant_comments_to_list(restaurant_row: pd.Series):
    """Return a copy of the row whose "Reviews" text is parsed into comments.

    The raw string is first split into a list of lists and then interpreted
    as ``(text, date)`` pairs; the input row itself is left untouched.
    """
    parsed_reviews = string_to_list_of_lists(restaurant_row["Reviews"])
    converted_row = restaurant_row.copy()
    converted_row["Reviews"] = interpret_list_of_lists_as_comments(parsed_reviews)
    return converted_row


# Apply the conversion to every row of the "Reviews" column.
pre_data[["Reviews"]] = pre_data[["Reviews"]].apply(convert_restaurant_comments_to_list, axis=1)

In [None]:
pre_data.sample(2)

#### 3.3.3 Преобразование строковой колонки URL_TA с относительными URLs ресторанов в полные URLs типа `urllib3.util.url.Url`

In [None]:
def fill_restaurant_urls(restaurant_row: pd.Series):
    """Return a copy of the row with "URL_TA" turned into a full URL object.

    A string value is resolved against ``URL_WEB_SITE``; a NaN value stays
    NaN.

    Raises:
        Exception: if "URL_TA" is a number other than NaN or a value that is
            neither a string nor a number.
    """
    restaurant = restaurant_row.copy()
    raw_url = restaurant["URL_TA"]

    if isinstance(raw_url, str):
        restaurant["URL_TA"] = relative_uri_to_full_url(raw_url, URL_WEB_SITE)
        return restaurant

    if isinstance(raw_url, numbers.Number) and np.isnan(raw_url):
        restaurant["URL_TA"] = np.nan
        return restaurant

    raise Exception(f"Wrong URL_TA: {raw_url}")


# Convert every "URL_TA" value row by row with fill_restaurant_urls.
pre_data[["URL_TA"]] = pre_data[["URL_TA"]].apply(fill_restaurant_urls, axis=1)
# Move the "URL_TA" column to the last position of the frame.
restaurant_url_column = pre_data.pop("URL_TA")
pre_data.insert(len(pre_data.columns), "URL_TA", restaurant_url_column)

In [None]:
pre_data.sample(2)

## 4. Удаление ненужных колонок

### 4.1 Удаление колонки "ID_TA"

In [None]:
pre_data.drop(["ID_TA"], axis=1, inplace=True)

## 5. Перестановка колонок

In [None]:
# Column reordering. Every target position is computed from the column layout
# left by the previous statement (after the pop), so the order of these
# statements matters.
# "Reviews" becomes the second-to-last column.
pre_data.insert(len(pre_data.columns) - 1, "Reviews", pre_data.pop("Reviews"))
# "City", "Cuisine Style" and "URL_TA" are inserted at offsets +1, +2 and +3
# from the position of "Number of Reviews".
pre_data.insert(pre_data.columns.get_loc("Number of Reviews") + 1, "City", pre_data.pop("City"))
pre_data.insert(pre_data.columns.get_loc("Number of Reviews") + 2, "Cuisine Style", pre_data.pop("Cuisine Style"))
pre_data.insert(pre_data.columns.get_loc("Number of Reviews") + 3, "URL_TA", pre_data.pop("URL_TA"))

# "Rating" is inserted one position after "Restaurant_id".
pre_data.insert(pre_data.columns.get_loc("Restaurant_id") + 1, "Rating", pre_data.pop("Rating"))

In [None]:
pre_data.sample(2)

## 6. Проверка на пропуски

In [None]:
# Table with the number of missing values in each inspected column.
gaps = pd.DataFrame(data={
    "Колонка": ["Rating", "Ranking", "Price Range", "Number of Reviews", "City", "Cuisine Style", "Reviews", "URL_TA"]
})
gaps["Число пропусков"] = [
    pre_data[column_name].isna().sum() for column_name in gaps["Колонка"]
]
display_pd_html(gaps)

## 7. Создание новых признаков

### 7.1 Колонки "Last Review Year", "Last Review Season", "Last Review Month", "Last Review Day", "Last Review Is Weekend"
Создаётся из колонки Reviews. Сама колонка Reviews удаляется.

In [None]:
# Season number (1..4) of every calendar month, indexed by ``month - 1``:
# December-February -> 1, March-May -> 2, June-August -> 3, September-November -> 4.
MONTH_SEASONS = [(month % 12) // 3 + 1 for month in range(1, 13)]


def month_to_season(month: numbers.Number):
    """Return the season number (1..4) of the calendar month ``month`` (1..12).

    Raises:
        AssertionError: if ``month`` is outside the 1..12 range.
    """
    assert 1 <= month <= 12, f"Invalid month number: {month}"
    season_index = month - 1
    return MONTH_SEASONS[season_index]


def day_to_weekend(day: numbers.Number):
    """Return 1.0 when the weekday number ``day`` is 5 or 6, otherwise 0.0."""
    if day == 5 or day == 6:
        return 1.0
    return 0.0


def last_review_date_columns(restaurant_row: pd.Series):
    """Return a copy of the row with the "Last Review ..." columns filled in.

    The newest date among the ``(text, date)`` pairs stored in "Reviews" is
    split into year, season, month, day and a weekend flag. If the row has no
    reviews, or none of the reviews has a date, all five columns are NaN.
    """
    restaurant_reviews = restaurant_row["Reviews"]
    # A missing value may arrive as None or as NaN (which is truthy), so only
    # real sequences are iterated. Reviews without a date are ignored because
    # None cannot be compared by max().
    if isinstance(restaurant_reviews, (list, tuple)):
        review_dates = [review[1] for review in restaurant_reviews if review[1] is not None]
    else:
        review_dates = []
    last_review_date = max(review_dates, default=None)

    restaurant = restaurant_row.copy()
    if last_review_date is not None:
        restaurant["Last Review Year"] = last_review_date.year
        restaurant["Last Review Season"] = month_to_season(last_review_date.month)
        restaurant["Last Review Month"] = last_review_date.month
        restaurant["Last Review Day"] = last_review_date.day
        restaurant["Last Review Is Weekend"] = day_to_weekend(last_review_date.weekday())
    else:
        restaurant["Last Review Year"] = np.nan
        restaurant["Last Review Season"] = np.nan
        restaurant["Last Review Month"] = np.nan
        restaurant["Last Review Day"] = np.nan
        restaurant["Last Review Is Weekend"] = np.nan

    return restaurant


# "Reviews" plus the features derived from the date of the most recent review.
review_columns = ["Reviews", "Last Review Year", "Last Review Season", "Last Review Month", "Last Review Day", "Last Review Is Weekend"]

# Create the new feature columns, filled with NaN, directly after
# "Number of Reviews" and in the order of the list above.
for column_offset, feature_name in enumerate(review_columns[1:], start=1):
    pre_data.insert(pre_data.columns.get_loc("Number of Reviews") + column_offset, feature_name, np.nan)

# Fill the features row by row; the raw "Reviews" column is dropped afterwards.
pre_data[review_columns] = pre_data[review_columns].apply(last_review_date_columns, axis=1)
pre_data.drop(columns=["Reviews"], inplace=True)

In [None]:
display_pd_html(pre_data.sample(5))

### 7.2 Перенос колонки "URL_TA"

In [None]:
pre_data.insert(pre_data.columns.get_loc("Last Review Is Weekend") + 1, "URL_TA", pre_data.pop("URL_TA"))
display_pd_html(pre_data.sample(2))

### 7.3 Создание dummy признаков из колонки "Last Review Season"

In [None]:
# Season number -> season name; the missing-value markers map to NaN.
SEASON_DICTIONARY = {
    None: np.nan,
    np.nan: np.nan,
    math.nan: np.nan,
    **dict(enumerate(["Winter", "Spring", "Summer", "Autumn"], start=1)),
}

# Season names in dictionary order, and the names of the indicator columns
# built from them (the last one marks rows without a season).
SEASON_NAMES = [name for name in SEASON_DICTIONARY.values() if isinstance(name, str)]
SEASON_COLUMN_PREFIX = "LR_Season"
SEASON_COLUMN_NAMES = [
    *(f"{SEASON_COLUMN_PREFIX}_{season}" for season in SEASON_NAMES),
    f"{SEASON_COLUMN_PREFIX}_nan",
]

# Replace the season numbers by the values of SEASON_DICTIONARY, so that the
# indicator columns created below are named after the seasons.
pre_data.replace({ "Last Review Season": SEASON_DICTIONARY }, inplace=True)
# Append one indicator column per season value; dummy_na=True adds
# "LR_Season_nan" for the rows without a season. The source column is dropped.
pre_data = pd.concat([pre_data, pd.get_dummies(pre_data["Last Review Season"], prefix=SEASON_COLUMN_PREFIX, dummy_na=True)], axis=1)
pre_data.drop(["Last Review Season"], axis=1, inplace=True)

# Move the indicator columns so that they follow "Last Review Is Weekend" in
# the order of SEASON_COLUMN_NAMES.
for season_index, season_column in enumerate(SEASON_COLUMN_NAMES):
    pre_data.insert(pre_data.columns.get_loc("Last Review Is Weekend") + (season_index + 1), season_column, pre_data.pop(season_column))

In [None]:
display_pd_html(pre_data.sample(2))

### 7.4 Создание dummy признаков из колонки "City"

In [None]:
# Distinct city names in sorted order and the names of the indicator columns
# that get_dummies creates for them below.
CITY_NAMES = sorted(pre_data["City"].unique())
CITY_COLUMN_PREFIX = "City"
CITY_COLUMN_NAMES = [f"{CITY_COLUMN_PREFIX}_{city_name}" for city_name in CITY_NAMES]

# Append one indicator column per city and drop the original "City" column.
pre_data = pd.concat([pre_data, pd.get_dummies(pre_data["City"], prefix=CITY_COLUMN_PREFIX)], axis=1)
pre_data.drop(["City"], axis=1, inplace=True)

In [None]:
display_pd_html(pre_data.sample(2))

## 8 Работа с пропусками и создание новых признаков

### 8.0 Импорт библиотек для парсинга сайтов

In [None]:
import requests
from bs4 import BeautifulSoup
from lxml import html
from lxml.etree import tostring

### 8.1 Ещё раз отображение информации о пропусках

In [None]:
# Table with the number of gaps per column. "LR_Season_nan" is an indicator
# column, so its gaps are counted as the sum of the indicator; every other
# column is counted with isna().
gaps = pd.DataFrame(data={
    "Колонка": ["Price Range", "Number of Reviews",
                "Last Review Year", "Last Review Month", "Last Review Day", "Last Review Is Weekend",
                "LR_Season_Winter", "LR_Season_Spring", "LR_Season_Summer", "LR_Season_Autumn", "LR_Season_nan",
                "Cuisine Style"]
})
gaps["Число пропусков"] = [
    pre_data[column_name].sum() if column_name == "LR_Season_nan" else pre_data[column_name].isna().sum()
    for column_name in gaps["Колонка"]
]
display_pd_html(gaps)
print(f"Общее число записей: {len(pre_data)}")

### 8.1 Функции для парсинга HTML

In [None]:
LOGGER.setLevel(logging.INFO)


def _fetch_html_tree(url: urllib3.util.Url, timeout: float):
    """Download ``url`` (English content requested) and parse the body with lxml."""
    # Without a timeout requests may wait forever for an unresponsive server.
    response = requests.get(url.url, headers={"Accept-Language": "en"}, timeout=timeout)
    return html.fromstring(response.content)


def get_element_text_by_xpath(xpath_string: str, url: urllib3.util.Url, timeout: float = 30.0):
    """Evaluate one XPath expression against the page at ``url``.

    Args:
        xpath_string: XPath expression to evaluate.
        url: parsed URL of the page to download.
        timeout: seconds to wait for the server before ``requests`` gives up.

    Returns:
        Whatever ``lxml`` returns for the expression (a list of matches for
        node-set expressions).

    Raises:
        requests.exceptions.RequestException: on network errors and timeouts.
    """
    return _fetch_html_tree(url, timeout).xpath(xpath_string)


def get_contents_text_by_xpath(xpath_strings: dict, url: urllib3.util.Url, timeout: float = 30.0):
    """Evaluate several named XPath expressions against one downloaded page.

    Args:
        xpath_strings: mapping of a result name to an XPath expression.
        url: parsed URL of the page to download (fetched only once).
        timeout: seconds to wait for the server before ``requests`` gives up.

    Returns:
        A dict with the same keys as ``xpath_strings`` whose values are the
        ``lxml`` results of the corresponding expressions.

    Raises:
        requests.exceptions.RequestException: on network errors and timeouts.
    """
    tree = _fetch_html_tree(url, timeout)
    return {name: tree.xpath(expression) for name, expression in xpath_strings.items()}

### 8.2 Заполнение пропусков в колонках по данным на сайте TripAdvisor:
Колонки с пропусками или данные для их устранения:
- "Number of Reviews";
- "Price Range";
- "Cuisine Style";
- "Last Review Date";

<div style="color: gray">

Новые признаки для дополнения набора данных:
- "Center Distance" - примерное расстояние до условного центра города;
- "Travelers' Choice" - пометка на сайте "Выбор путешественников";
- "MICHELIN" - наличие в списке Мишлен (наличие звезды Мишлен).

</div>
☝️<br />
<span style="color: red">По факту пока использовать такой детальный парсинг не удалось.<br />
Время отработки 50000 строк таблицы для каждого ресторана оказывается большим.<br />
Либо это означает, что надо использовать не XPath выражения,
либо отладить индикацию прогресса и ждать положенное время.<br />
Возможно, что сайт начинает замечать активность, или на некоторые страницы долго отвечает.<br />
Надо подождать больше времени и потом отправить это второй версией вне обучения.</span>

In [None]:
# CUISINE_COLUMN_NAMES = list()
# CUISINE_NAMES = list()


# CUISINE_NAMES_SET = set()
# for index, entry_cuisine_list in pre_data["Cuisine Style"].iteritems():
#     for cuisine in entry_cuisine_list:
#         CUISINE_NAMES_SET.add(cuisine)
# CUISINE_NAMES = sorted(cuisine_style_set)
# print(f"Cuisine names: {CUISINE_NAMES}")


# def fill_cuisine_style_columns_each_row(restaurant_row: pd.Series):
#     restaurant = restaurant_row.copy()
#     for cuisine_style in restaurant["Cuisine Style"]:
#         restaurant[f"Cuisine_{cuisine_style}"] = 1
#     return restaurant

In [None]:
# from tqdm import tqdm

# tqdm.pandas()

# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=False)

# -----------

# Pages on which at least one of the gap fields could not be found.
# NOTE(review): when the function below runs through pandarallel's
# parallel_apply, the appends presumably happen in worker processes and are
# not visible in the notebook process — verify before relying on this list.
FAILED_GAPS = list()

# XPath expressions of the fields that may contain gaps in the data set.
_GAP_XPATHS = {
    "Number of Reviews": """//div[@data-tab = "TABS_OVERVIEW"]/div[1]/div[1]/div[1]/div[1]/div[1]/a[@href = "#REVIEWS"]/text()""",
    "Price Range": """//div[@data-test-target = "restaurant-detail-info"]/div[2]/span[3]/a[1]/text()""",
    "Cuisine Style": """//div[h2[text() = "Details"]]/following-sibling::div[1]/div/div[text() = "CUISINES"]/following-sibling::div/text()""",
    "Last Review Date": """(//span[@class="ratingDate"]/@title)[1]""",
}

# XPath expressions of the additional features collected for every restaurant.
_EXTRA_XPATHS = {
    "Center Distance": """//div[@data-tab = "TABS_OVERVIEW"]/div[1]/div[3]/div[1]/div[1]/div[2]/span[2]/div[2]/b[1]/text()""",
    "Travelers' Choice": """//span[contains(text(), "Travelers' Choice")]/text()""",
    "MICHELIN": """(//div[contains(text(), "MICHELIN")]/text())[1]""",
}


def _is_nan(value) -> bool:
    """Return True when ``value`` is None or a NaN number."""
    if value is None:
        return True
    return isinstance(value, numbers.Number) and bool(np.isnan(value))


def _is_gap(value) -> bool:
    """Return True when ``value`` is None, NaN, zero or an empty container."""
    return _is_nan(value) or not value


def _parse_review_count(text: str) -> int:
    """Parse the leading number of a string such as "1,234 reviews"."""
    return int(text.strip().split(" ")[0].replace(",", ""))


def _web_cuisines_to_list(text: str):
    """Turn the comma-separated cuisine text of the page into a list of names."""
    quoted = f"""[\"{text}\"]""".replace(",", "\",\"")
    return string_to_list_of_strings(quoted)


def _fill_extra_features(restaurant: pd.Series, website_data: dict) -> None:
    """Write the additional features found on the page into ``restaurant``."""
    # In miles
    if website_data["Center Distance"]:
        restaurant["Center Distance"] = float(website_data["Center Distance"][0].split(" ")[0])
    if website_data["Travelers' Choice"]:
        restaurant["Travelers' Choice"] = 1.0 if website_data["Travelers' Choice"][0] else 0.0
    if website_data["MICHELIN"]:
        restaurant["MICHELIN"] = 1.0 if website_data["MICHELIN"][0] else 0.0


def fill_restaurant_data_from_Internet(restaurant_row: pd.Series):
    """Return a copy of the row completed with data from the restaurant page.

    The page behind "URL_TA" is downloaded once. Missing values of
    "Number of Reviews", "Price Range", "Cuisine Style" and the
    "Last Review ..." columns are filled when the page provides them, and the
    features "Center Distance", "Travelers' Choice" and "MICHELIN" are added
    when present. A row without a usable URL is returned unchanged.

    Pages that lack any of the gap fields are recorded in ``FAILED_GAPS``.

    Raises:
        requests.exceptions.RequestException: on network errors.
        ValueError: if a number or date on the page has an unexpected format.
    """
    number_of_reviews = restaurant_row["Number of Reviews"]
    price_range = restaurant_row["Price Range"]
    cuisine_style = restaurant_row["Cuisine Style"]
    last_review_year = restaurant_row["Last Review Year"]
    restaurant_url = restaurant_row["URL_TA"]

    restaurant = restaurant_row.copy()
    if not hasattr(restaurant_url, "url"):
        # The URL is missing (e.g. NaN): there is nothing to download.
        return restaurant

    cuisine_missing = _is_gap(cuisine_style)
    has_gaps = (
        _is_gap(number_of_reviews)
        or _is_gap(price_range)
        or cuisine_missing
        or _is_gap(last_review_year)
    )

    if not has_gaps:
        website_data = get_contents_text_by_xpath(dict(_EXTRA_XPATHS), restaurant_url)
        _fill_extra_features(restaurant, website_data)
        return restaurant

    website_data = get_contents_text_by_xpath({**_GAP_XPATHS, **_EXTRA_XPATHS}, restaurant_url)
    web_reviews = website_data["Number of Reviews"]
    web_price = website_data["Price Range"]
    web_cuisine = website_data["Cuisine Style"]
    web_date = website_data["Last Review Date"]

    if not all([web_reviews, web_price, web_cuisine, web_date]):
        FAILED_GAPS.append({
            "URL": restaurant_url.url,
            "Number of Reviews": web_reviews,
            "Price Range": web_price,
            "Cuisine Style": web_cuisine,
            "Last Review Date": web_date
        })

    if _is_nan(number_of_reviews) and web_reviews:
        restaurant["Number of Reviews"] = _parse_review_count(web_reviews[0])

    if _is_nan(price_range) and web_price:
        # Only the spellings known to the dictionary are accepted; anything
        # else leaves the gap in place instead of raising KeyError.
        price_level = DICTIONARY_PRICE_RANGE_TO_INTEGER.get(web_price[0].strip())
        if not _is_nan(price_level):
            restaurant["Price Range"] = price_level

    if cuisine_missing and web_cuisine:
        restaurant["Cuisine Style"] = _web_cuisines_to_list(web_cuisine[0])

    if (_is_nan(last_review_year) or _is_nan(number_of_reviews)) and web_date:
        web_last_review_date = dt.datetime.strptime(web_date[0], "%B %d, %Y")
        restaurant["Last Review Year"] = web_last_review_date.year
        restaurant["Last Review Month"] = web_last_review_date.month
        restaurant["Last Review Day"] = web_last_review_date.day
        # datetime objects expose weekday(); `dayofweek` exists only on
        # pandas timestamps.
        restaurant["Last Review Is Weekend"] = day_to_weekend(web_last_review_date.weekday())

        # Reset every season indicator before setting the matching one, so
        # that a previously set season cannot remain next to the new one.
        season = SEASON_DICTIONARY[month_to_season(web_last_review_date.month)]
        for season_column in SEASON_COLUMN_NAMES:
            restaurant[season_column] = 0
        restaurant[f"{SEASON_COLUMN_PREFIX}_{season}"] = 1

        # The review date was refreshed from the page, so refresh the count
        # from the same page too, if the page shows one.
        if web_reviews:
            restaurant["Number of Reviews"] = _parse_review_count(web_reviews[0])

    _fill_extra_features(restaurant, website_data)
    return restaurant

In [None]:
# from tqdm import tqdm

# tqdm.pandas()

# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=False)

# if not ("Center Distance" in pre_data.columns):
#     pre_data.insert(pre_data.columns.get_loc("Number of Reviews") + 1, "Center Distance", np.nan)
# if not ("Travelers' Choice" in pre_data.columns):
#     pre_data.insert(pre_data.columns.get_loc("Number of Reviews") + 2, "Travelers' Choice", np.nan)
# if not ("MICHELIN" in pre_data.columns):
#     pre_data.insert(pre_data.columns.get_loc("Number of Reviews") + 3, "MICHELIN", np.nan)

# data_columns = [
#     "Number of Reviews",
#     "Price Range",
#     "Cuisine Style",
#     "Last Review Year",
#     "Last Review Month",
#     "Last Review Day",
#     "Last Review Is Weekend",
#     "LR_Season_Winter", "LR_Season_Spring", "LR_Season_Summer", "LR_Season_Autumn", "LR_Season_nan",
#     "Center Distance",
#     "Travelers' Choice",
#     "MICHELIN",
#     "URL_TA"
# ]

# pre_data[data_columns] = pre_data[data_columns].parallel_apply(fill_restaurant_data_from_Internet, axis=1)

In [None]:
# pre_data.to_csv("kaggle/output/intermediate_data/pre_data.csv")

In [None]:
# # pre_data_sample = pd.DataFrame(pre_data[pre_data["Restaurant_id"] == "id_13936"])
# pre_data_sample = pd.DataFrame(pre_data.sample(100))

# if not ("Center Distance" in pre_data_sample.columns):
#     pre_data_sample.insert(pre_data_sample.columns.get_loc("Number of Reviews") + 1, "Center Distance", np.nan)
# if not ("Travelers' Choice" in pre_data.columns):
#     pre_data_sample.insert(pre_data_sample.columns.get_loc("Number of Reviews") + 2, "Travelers' Choice", np.nan)
# if not ("MICHELIN" in pre_data.columns):
#     pre_data_sample.insert(pre_data_sample.columns.get_loc("Number of Reviews") + 3, "MICHELIN", np.nan)

# data_columns = [
#     "Number of Reviews",
#     "Price Range",
#     "Cuisine Style",
#     "Last Review Year",
#     "Last Review Month",
#     "Last Review Day",
#     "Last Review Is Weekend",
#     "LR_Season_Winter", "LR_Season_Spring", "LR_Season_Summer", "LR_Season_Autumn", "LR_Season_nan",
#     "Center Distance",
#     "Travelers' Choice",
#     "MICHELIN",
#     "URL_TA"
# ]

# pre_data_sample[data_columns] = pre_data_sample[data_columns].apply(fill_restaurant_data_from_Internet, axis=1)
# pre_data_sample

In [None]:
display_pd_html(pre_data.sample(5))

### 8.3 Устранение пропусков в "Number of Reviews"

In [None]:
# Series.mean() skips NaN values, so no explicit filtering is required.
number_of_reviews_typical = pre_data["Number of Reviews"].mean()
# Assign the result back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which newer pandas versions warn about and
# which does not modify the frame when copy-on-write is enabled.
pre_data["Number of Reviews"] = pre_data["Number of Reviews"].fillna(number_of_reviews_typical)
# Cell output: number of gaps left in the column.
pre_data["Number of Reviews"].isna().sum()

### 8.4 Устранение пропусков в "Price Range"

In [None]:
# Series.mean() skips NaN values, so no explicit filtering is required.
price_range_typical = pre_data["Price Range"].mean()
# Assign the result back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which newer pandas versions warn about and
# which does not modify the frame when copy-on-write is enabled.
pre_data["Price Range"] = pre_data["Price Range"].fillna(price_range_typical)
# Cell output: number of gaps left in the column.
pre_data["Price Range"].isna().sum()

### 8.5 Удаление пропусков в колонках "Last Review Year", "Last Review Month", "Last Review Day", "Last Review Is Weekend", "LR_Season_Winter", "LR_Season_Spring", "LR_Season_Summer", "LR_Season_Autumn", "LR_Season_nan"
Заполним пропуски в колонках "Last Review Year", "Last Review Season", "Last Review Month", "Last Review Day" и "Last Review Is Weekend"<br />
данными согласно тому году, месяцу и дню, когда о ресторанах в городе, в котором находится этот ресторан, оставляли больше всего отзывов.<br />
Если ресторан в городе всего один, то мы смотрим<br />
наиболее частые посещения европейских ресторанов по всему набору данных в целом, то есть считаем отзывы по региону "Европа".<br />
Позднее результаты можно будет улучшить, скорректировав данные в наборе по страницам ресторанов на сайте TripAdvisor.<br />

(Можно было бы ещё попытаться, помимо города, подбирать рестораны с похожей кухней (по длине вектора dummy-признаков типов кухонь).)

In [None]:
# If "Last Review Year" is NaN in a row, the other last-review columns of that
# row ("Last Review Month", "Last Review Day", "Last Review Is Weekend" and the
# season dummies) are NaN as well.
STRING_DATE_FORMAT = "%Y/%m/%d"


def _most_frequent_review_date(restaurants: pd.DataFrame) -> pd.Timestamp:
    """Return the most frequent last-review date among ``restaurants``.

    The most frequent year is chosen first, then the most frequent month
    within that year, then the most frequent day within that year and month.
    ``restaurants`` must contain at least one row with a known review date.
    """
    year = restaurants["Last Review Year"].value_counts().index[0]
    same_year = restaurants[restaurants["Last Review Year"] == year]
    month = same_year["Last Review Month"].value_counts().index[0]
    same_month = same_year[same_year["Last Review Month"] == month]
    day = same_month["Last Review Day"].value_counts().index[0]
    return pd.to_datetime(dt.datetime(int(year), int(month), int(day)))


# Restaurants with a known review date: the reference set for the whole region
# and, filtered by city, for each city.
restaurants_region = pre_data[~pre_data["Last Review Year"].isna()]
region_max_review_date = _most_frequent_review_date(restaurants_region)

LOGGER.info(f"Most probable date of a review of a restaurant visitor in the region of Europe: {region_max_review_date.strftime(STRING_DATE_FORMAT)}")

# Cache of the date chosen per city column so it is computed only once.
city_max_review_dates = dict()

for index, restaurant in pre_data[pre_data["Last Review Year"].isna()].iterrows():
    # The city of the restaurant is the one-hot city column that equals 1.
    restaurant_city_data = restaurant[CITY_COLUMN_NAMES]
    restaurant_city = restaurant_city_data[restaurant_city_data == 1].index[0]

    city_max_review_date = city_max_review_dates.get(restaurant_city)
    if city_max_review_date is None:
        restaurants_same_city = restaurants_region[restaurants_region[restaurant_city] == 1]
        if len(restaurants_same_city) > 0:
            city_max_review_date = _most_frequent_review_date(restaurants_same_city)
        else:
            # No dated restaurant in this city: fall back to the region-wide
            # date and keep going. The original code had a `break` here, which
            # left this and every following restaurant unfilled.
            LOGGER.error(f"The restaurant with ID {restaurant['Restaurant_id']} in the city {restaurant_city} has no other sibling restaurants. Will use the common European date from the dataset: {region_max_review_date.strftime(STRING_DATE_FORMAT)}")
            city_max_review_date = region_max_review_date
        city_max_review_dates[restaurant_city] = city_max_review_date

    pre_data.at[index, "Review Last Date"] = city_max_review_date
    pre_data.at[index, "Last Review Year"] = city_max_review_date.year
    pre_data.at[index, "Last Review Month"] = city_max_review_date.month
    pre_data.at[index, "Last Review Day"] = city_max_review_date.day
    pre_data.at[index, "Last Review Is Weekend"] = day_to_weekend(city_max_review_date.dayofweek)

    # Set the matching season dummy and clear the "season unknown" flag.
    season = SEASON_DICTIONARY[month_to_season(int(city_max_review_date.month))]
    season_column = f"{SEASON_COLUMN_PREFIX}_{season}"
    pre_data.at[index, season_column] = 1.0
    pre_data.at[index, f"{SEASON_COLUMN_PREFIX}_nan"] = 0.0

In [None]:
# Summarise how many gaps remain in every column treated so far.
nan_counted_columns = [
    "Price Range",
    "Number of Reviews",
    "Last Review Year",
    "Last Review Month",
    "Last Review Day",
    "Last Review Is Weekend",
    "LR_Season_Winter",
    "LR_Season_Spring",
    "LR_Season_Summer",
    "LR_Season_Autumn",
]
gap_counts = {column: pre_data[column].isna().sum() for column in nan_counted_columns}
# For "LR_Season_nan" the column sum is reported rather than its NaN count
# (presumably a 0/1 "season unknown" indicator — verify).
gap_counts["LR_Season_nan"] = pre_data["LR_Season_nan"].sum()
gap_counts["Cuisine Style"] = pre_data["Cuisine Style"].isna().sum()

gaps = pd.DataFrame(data={
    "Колонка": list(gap_counts.keys()),
    "Число пропусков": list(gap_counts.values()),
})
display_pd_html(gaps)
print(f"Общее число записей: {len(pre_data)}")

### 8.6 Устранение пропусков в "Cuisine Style"

In [None]:
# Number of cuisine lists scraped so far by this process (used for logging only).
CUISINE_COUNTER = 0


def fill_restaurant_cuisines_from_Internet(restaurant_row: pd.Series):
    """Fill a missing "Cuisine Style" of one restaurant from its web page.

    Args:
        restaurant_row: Row holding at least "Cuisine Style" and "URL_TA".

    Returns:
        The unchanged row when a cuisine list is already present; otherwise a
        copy of the row whose "Cuisine Style" is replaced by the scraped list
        when the page provides one.
    """
    global CUISINE_COUNTER
    cuisine_style = restaurant_row["Cuisine Style"]
    # NaN is truthy and never equal to None, so it needs its own check to be
    # recognised as a missing value.
    is_nan = isinstance(cuisine_style, numbers.Number) and np.isnan(cuisine_style)
    is_missing = cuisine_style is None or is_nan or not cuisine_style
    if not is_missing:
        return restaurant_row

    website_data = get_contents_text_by_xpath(
        {
           "Cuisine Style": """//div[h2[text() = "Details"]]/following-sibling::div[1]/div/div[text() = "CUISINES"]/following-sibling::div/text()""",
        },
        restaurant_row["URL_TA"]
    )
    restaurant = restaurant_row.copy()
    if website_data["Cuisine Style"]:
        # Turn the comma-separated text into a quoted list literal and parse
        # it with the same helper used for the dataset's own values.
        web_cuisine_style = website_data["Cuisine Style"][0]
        web_cuisine_style = f'["{web_cuisine_style}"]'.replace(",", '","')
        web_cuisine_style_list = string_to_list_of_strings(web_cuisine_style)
        CUISINE_COUNTER = CUISINE_COUNTER + 1
        # The counter is multiplied by CPU_COUNT for the log label (presumably
        # an estimate of the total across parallel workers — verify).
        LOGGER.info(f"Cuisines-{CUISINE_COUNTER*CPU_COUNT}: {web_cuisine_style_list}")
        restaurant["Cuisine Style"] = web_cuisine_style_list
    return restaurant

In [None]:
# Whether to scrape the restaurants' web pages (this takes a long time)
DO_WEB_SCRAPPING = False

In [None]:
# Only the cuisine list and the page URL are handed to the scraper.
data_columns = ["Cuisine Style", "URL_TA"]

if DO_WEB_SCRAPPING:
    scraped_columns = pre_data[data_columns].parallel_apply(
        fill_restaurant_cuisines_from_Internet,
        axis=1,
    )
    pre_data[data_columns] = scraped_columns

In [None]:
# Persist the scraped frame to the same CSV path that is read back afterwards.
# NOTE(review): this writes under /kaggle/input, which is presumably a
# read-only dataset mount on Kaggle — confirm the intended output location.
if DO_WEB_SCRAPPING:
    pre_data.to_csv("/kaggle/input/pre-datarestaurant-ratingsweb-scrapping-cuisines/pre_data-restaurant_ratings-web_scrapping_cuisines.csv", index=False)

#### 8.6.1 Прогрузка предобработанных данных из своего сформированного веб-скрапингом единого набора данных

In [None]:
pre_data = pd.read_csv('/kaggle/input/pre-datarestaurant-ratingsweb-scrapping-cuisines/pre_data-restaurant_ratings-web_scrapping_cuisines.csv')

In [None]:
# Count the cuisine gaps in the freshly loaded data.
cuisine_gap_count = pre_data["Cuisine Style"].isna().sum()
gaps = pd.DataFrame(data={
    "Колонка": ["Cuisine Style"],
    "Число пропусков": [cuisine_gap_count],
})
display_pd_html(gaps)
print(f"Общее число записей: {len(pre_data)}")

In [None]:
pre_data.sample(5)

In [None]:
# Add the "cuisine unknown" indicator column right before "Cuisine Style".
if "Cuisine_nan" not in pre_data.columns:
    cuisine_style_position = pre_data.columns.get_loc("Cuisine Style")
    pre_data.insert(cuisine_style_position, "Cuisine_nan", 0.0)


def fillna_cuisines(restaurant_row: pd.Series):
    """Flag a restaurant without cuisine data and give it an empty cuisine list.

    Args:
        restaurant_row: Row holding "Cuisine_nan" and "Cuisine Style".

    Returns:
        A copy of the row with "Cuisine_nan" set to 1.0 and "Cuisine Style" set
        to an empty list when the cuisine value is falsy or NaN; otherwise the
        row itself, unchanged. A row is always returned (the original fell
        through and returned None for a numeric value that was not NaN).
    """
    restaurant_cuisine = restaurant_row["Cuisine Style"]
    # NaN is truthy, so it must be detected separately from empty values.
    is_nan = isinstance(restaurant_cuisine, numbers.Number) and np.isnan(restaurant_cuisine)
    if not restaurant_cuisine or is_nan:
        restaurant = restaurant_row.copy()
        restaurant["Cuisine_nan"] = 1.0
        restaurant["Cuisine Style"] = []
        return restaurant
    return restaurant_row

# Run the gap flagging row by row over the two affected columns.
data_columns = ["Cuisine_nan", "Cuisine Style"]
flagged_cuisines = pre_data[data_columns].apply(fillna_cuisines, axis=1)
pre_data[data_columns] = flagged_cuisines

In [None]:
# Re-count the cuisine gaps after the fill step.
remaining_cuisine_gaps = pre_data["Cuisine Style"].isna().sum()
gaps = pd.DataFrame(data={
    "Колонка": ["Cuisine Style"],
    "Число пропусков": [remaining_cuisine_gaps],
})
display_pd_html(gaps)
print(f"Общее число записей: {len(pre_data)}")

#### 8.6.2 Разбиваем признак "Cuisine Style" на соответствующее число уникальных dummy признаков

Придётся заново распарсить списки кухонь из строковых значений.

In [None]:
pre_data["Cuisine Style"] = pre_data["Cuisine Style"].apply(string_to_list_of_strings)

In [None]:
# Collect every distinct cuisine name that occurs in any restaurant's list.
# Iterating the Series directly avoids Series.iteritems(), which was removed
# in pandas 2.0.
CUISINE_NAME_SET = {
    cuisine
    for entry_cuisine_list in pre_data["Cuisine Style"]
    for cuisine in entry_cuisine_list
}

# Sorted so that the order of the derived dummy columns is deterministic.
CUISINE_NAMES = sorted(CUISINE_NAME_SET)
print(f"Unique cuisine names: {CUISINE_NAMES}")

In [None]:
CUISINE_COLUMN_PREFIX = "Cuisine"
# One dummy column name per cuisine, in the order of CUISINE_NAMES.
CUISINE_COLUMN_NAMES = [
    f"{CUISINE_COLUMN_PREFIX}_{cuisine_name}" for cuisine_name in CUISINE_NAMES
]

# Insert the missing dummy columns (initialised to 0.0) after "Cuisine Style".
for offset, cuisine_column_name in enumerate(CUISINE_COLUMN_NAMES, start=1):
    if cuisine_column_name not in pre_data.columns:
        insert_position = pre_data.columns.get_loc("Cuisine Style") + offset
        pre_data.insert(insert_position, cuisine_column_name, 0.0)


def fill_cuisine_style_columns_each_row(restaurant_row: pd.Series):
    """Return a copy of the row with 1.0 in the dummy column of each listed cuisine."""
    flagged_row = restaurant_row.copy()
    served_columns = [
        f"{CUISINE_COLUMN_PREFIX}_{cuisine_style}"
        for cuisine_style in flagged_row["Cuisine Style"]
    ]
    for served_column in served_columns:
        flagged_row[served_column] = 1.0
    return flagged_row


# Fill the cuisine dummy columns row by row from each restaurant's cuisine list.
pre_data[["Cuisine Style"] + CUISINE_COLUMN_NAMES] = pre_data[["Cuisine Style"] + CUISINE_COLUMN_NAMES].apply(fill_cuisine_style_columns_each_row, axis=1)
# Manual spot check: print two fixed rows together with a few dummy columns so
# the flags can be compared with the cuisine list by eye.
print("------------------------------")
print("Visual Test:")
print(pre_data.iloc[12][["Restaurant_id", "Cuisine Style", "Cuisine_Greek", "Cuisine_Xinjiang"]])
print("---------------")
print(pre_data.iloc[80][["Restaurant_id", "Cuisine Style", 'Cuisine_French', 'Cuisine_Asian', 'Cuisine_European', 'Cuisine_Gluten Free Options', 'Cuisine_Vegetarian Friendly', 'Cuisine_Vegan Options', "Cuisine_Xinjiang"]])
print("------------------------------")

In [None]:
# Remove the two source columns if they are still present;
# errors="ignore" makes the drop a no-op for absent columns.
pre_data.drop(
    columns=["Review Last Date", "Cuisine Style"],
    errors="ignore",
    inplace=True,
)
display_pd_html(pre_data.sample(5))

### 8.7 Добавление информации о численности населения в городе и о статусе столицы

In [None]:
# Keep only the ASCII city name (renamed to "city") and the columns that
# survive the drop below.
unused_columns = ["city", "lat", "lng", "iso2", "iso3", "admin_name", "id"]
world_cities = (
    raw_world_cities
    .drop(unused_columns, axis=1)
    .rename(columns={"city_ascii": "city"})
)
# Binary capital flag: 1 only for the "primary" status.
world_cities["capital"] = (world_cities["capital"] == "primary").astype(int)

is_capital = world_cities["capital"] == 1
display_pd_html(world_cities[is_capital].sample(3))
display_pd_html(world_cities[world_cities["capital"] == 0].sample(3))

In [None]:
# Restaurant cities that are absent from the first world-cities dataset.
# The set of known names is built once, instead of recomputing unique() and
# scanning the array for every city.
known_city_names = set(world_cities["city"].unique())
not_found_cities_set1 = [city for city in CITY_NAMES if city not in known_city_names]
not_found_cities_set1

In [None]:
# Second city dataset: drop unused columns and rename the rest to match
# the column names of world_cities.
world_cities2 = (
    raw_world_cities2
    .drop(["Latitude", "Longitude", "Region", "City"], axis=1)
    .rename(columns={
        "AccentCity": "city",
        "Country": "country",
        "Population": "population",
    })
)
display_pd_html(world_cities2.sample(3))

In [None]:
# Countries excluded when matching city names against the world-cities data.
not_european_country = [
    "Venezuela",
    "Canada",
    "United States"
]

restaurant_cities = world_cities[
    (world_cities["city"].isin(CITY_NAMES))&(~world_cities["country"].isin(not_european_country))
].sort_values(by=["country", "city"])

# Cities missing from the first dataset are taken from the second one. The
# rows are collected and concatenated once, because DataFrame.append was
# removed in pandas 2.0.
fallback_city_rows = []
for not_found_city in not_found_cities_set1:
    city_matches = world_cities2[world_cities2["city"] == not_found_city]
    if city_matches.empty:
        # Same exception type as the original `.iloc[0]`, but naming the city.
        raise IndexError(f"City {not_found_city!r} was not found in either world cities dataset")
    city_entry = city_matches.iloc[0]
    fallback_city_rows.append({
        "city": city_entry["city"],
        "country": city_entry["country"],
        "capital": 0,
        "population": city_entry["population"]
    })
if fallback_city_rows:
    restaurant_cities = pd.concat(
        [restaurant_cities, pd.DataFrame(fallback_city_rows)],
        ignore_index=True,
    )

# Manual corrections for two of the cities.
restaurant_cities.loc[
    restaurant_cities["city"] == "Copenhagen", ["country", "capital"]
] = ["Denmark", 1]
restaurant_cities.loc[
    restaurant_cities["city"] == "Oporto", ["country", "capital", "population"]
] = ["Portugal", 0, 237559.0]
restaurant_cities = restaurant_cities.sort_values(by=["country", "city"])
restaurant_cities

In [None]:
# Add the two city-level feature columns right after "Number of Reviews".
if "City Is a Capital" not in pre_data.columns:
    pre_data.insert(pre_data.columns.get_loc("Number of Reviews") + 1, "City Is a Capital", np.nan)
if "City Population" not in pre_data.columns:
    pre_data.insert(pre_data.columns.get_loc("City Is a Capital") + 1, "City Population", np.nan)


def apply_city_data(restaurant_row: pd.Series):
    """Return a copy of the row with the capital flag and population of its city.

    The city is the one-hot city column equal to 1; its name is looked up in
    ``restaurant_cities``. When several entries share the name, the first one
    is used. Raises IndexError when the city has no entry.
    """
    restaurant_city_data = restaurant_row[CITY_COLUMN_NAMES]
    restaurant_city = restaurant_city_data[restaurant_city_data == 1].index[0]
    restaurant_city = restaurant_city.replace(f"{CITY_COLUMN_PREFIX}_", "")

    restaurant = restaurant_row.copy()
    # Take the first matching row explicitly: float() on a Series is deprecated
    # and raises TypeError when more than one row matches.
    restaurant_city_entry = restaurant_cities[restaurant_cities["city"] == restaurant_city].iloc[0]
    restaurant["City Is a Capital"] = int(restaurant_city_entry["capital"])
    restaurant["City Population"] = float(restaurant_city_entry["population"])
    return restaurant


data_columns = ["City Is a Capital", "City Population"] + CITY_COLUMN_NAMES
pre_data[data_columns] = pre_data[data_columns].parallel_apply(apply_city_data, axis=1)

In [None]:
display_pd_html(pre_data.sample(5))

### 8.8 Признак количества кухонь в ресторане

In [None]:
# Place the cuisine counter right after "Ranking".
if "Cuisine Number" not in pre_data.columns:
    ranking_position = pre_data.columns.get_loc("Ranking")
    pre_data.insert(ranking_position + 1, "Cuisine Number", 0)


def fill_cuisine_number(restaurant_row: pd.Series):
    """Return a copy of the row whose "Cuisine Number" is the sum of its cuisine dummies."""
    counted_row = restaurant_row.copy()
    counted_row["Cuisine Number"] = restaurant_row[CUISINE_COLUMN_NAMES].sum()
    return counted_row


data_columns = ["Cuisine Number", *CUISINE_COLUMN_NAMES]
pre_data[data_columns] = pre_data[data_columns].parallel_apply(fill_cuisine_number, axis=1)

In [None]:
display_pd_html(pre_data.sample(5))

### 8.9 Признаки различных категорий местного ранга ресторана, основанные на колонке Ranking

In [None]:
np.sqrt(pre_data["Ranking"][(pre_data["Rating"] > 4)&(pre_data["Rating"] <= 5)]).hist(bins=100)

In [None]:
np.sqrt(pre_data["Ranking"][(pre_data["Rating"] > 3)&(pre_data["Rating"] <= 4)]).hist(bins=100)

In [None]:
np.sqrt(pre_data["Ranking"][(pre_data["Rating"] > 2)&(pre_data["Rating"] <= 3)]).hist(bins=100)

In [None]:
np.sqrt(pre_data["Ranking"][(pre_data["Rating"] > 1)&(pre_data["Rating"] <= 2)]).hist(bins=100)

In [None]:
np.sqrt(pre_data["Ranking"][pre_data["Rating"] <= 1]).hist(bins=100)

Предыдущие построения гистограммы квадратного корня местного ранга ресторана для ресторанов с не экстремально высокими или низкими рейтингами от 1 до 4 показали наличие примерно 3-х отдельных распределений с границами в районе 65 и 95 по значениям квадратного корня местного ранга "Ranking".

In [None]:
# Replace "Ranking" by its square root, kept at the position right after
# the original column.
if "sqrt(Ranking)" not in pre_data.columns:
    ranking_position = pre_data.columns.get_loc("Ranking")
    pre_data.insert(ranking_position + 1, "sqrt(Ranking)", np.nan)

ranking_root = pre_data["Ranking"].apply(np.sqrt)
pre_data["sqrt(Ranking)"] = ranking_root
pre_data.drop(columns=["Ranking"], inplace=True)

In [None]:
pre_data.sample(3)

In [None]:
# Add the three rank-bucket indicator columns (initialised to 0) directly
# after "sqrt(Ranking)", in order.
rank_flag_columns = ("sqrt_Ranking_1", "sqrt_Ranking_2", "sqrt_Ranking_3")
for offset, rank_flag_column in enumerate(rank_flag_columns, start=1):
    if rank_flag_column not in pre_data.columns:
        pre_data.insert(pre_data.columns.get_loc("sqrt(Ranking)") + offset, rank_flag_column, 0)


def fill_rank_parameters(restaurant_row: pd.Series):
    """Return a copy of the row with its three rank-bucket flags set.

    The buckets split "sqrt(Ranking)" at 65 and 95: below 65, from 65 up to
    (not including) 95, and 95 or above. The matching flag becomes 1.0 and the
    others 0.0; a NaN rank matches no bucket.
    """
    updated_row = restaurant_row.copy()
    rank_root = restaurant_row["sqrt(Ranking)"]
    bucket_membership = {
        "sqrt_Ranking_1": rank_root < 65,
        "sqrt_Ranking_2": 65 <= rank_root < 95,
        "sqrt_Ranking_3": rank_root >= 95,
    }
    for flag_column, in_bucket in bucket_membership.items():
        if in_bucket:
            updated_row[flag_column] = 1.0
        else:
            updated_row[flag_column] = 0.0
    return updated_row


# Columns read and written by the rank bucketing: the root plus its three flags.
data_columns = ["sqrt(Ranking)"] + [f"sqrt_Ranking_{bucket}" for bucket in (1, 2, 3)]
bucketed_rows = pre_data[data_columns].apply(fill_rank_parameters, axis=1)
pre_data[data_columns] = bucketed_rows

In [None]:
pre_data.sample(3)

### 8.10 Логарифмирование признака населения города - log10

In [None]:
pre_data["City Population"].hist(bins=100)

In [None]:
np.log10(pre_data["City Population"]).hist(bins=100)

In [None]:
len(pre_data["City Population"][pre_data["City Population"] <= 0])

In [None]:
pre_data.sample(2)

In [None]:
# Replace "City Population" by its base-10 logarithm, kept at the position
# right after the original column.
if "log10(City Population)" not in pre_data.columns:
    population_position = pre_data.columns.get_loc("City Population")
    pre_data.insert(population_position + 1, "log10(City Population)", np.nan)

population_log = pre_data["City Population"].apply(np.log10)
pre_data["log10(City Population)"] = population_log
pre_data.drop(columns=["City Population"], inplace=True)

In [None]:
pre_data.sample(2)

### 8.11 Произведение признака "sqrt(Ranking)" и признака "log10(City Population)"

In [None]:
pre_data[[
    "Rating",
    "sqrt(Ranking)",
    "Cuisine Number",
    "Price Range",
    "Number of Reviews",
    "City Is a Capital",
    "log10(City Population)",
    "Last Review Year",
    "Last Review Month",
    "Last Review Is Weekend"
]].corr()

In [None]:
from sklearn.preprocessing import PolynomialFeatures

pf = PolynomialFeatures(2)
ranking_population = pf.fit_transform(pre_data[["sqrt(Ranking)", "log10(City Population)"]])
# For two inputs a and b the degree-2 expansion is [1, a, b, a^2, a*b, b^2];
# index 4 is the interaction term a*b.
ranking_population_list = [features[4] for features in ranking_population]

# A single name is used for the check, the insert and the assignment. The
# original check had an extra ")" in the name, so it never matched and
# re-running the cell made insert() fail on the existing column.
ranking_population_column = "sqrt(Ranking)_x_log10(City Population)"
if ranking_population_column not in pre_data.columns:
    pre_data.insert(pre_data.columns.get_loc("sqrt(Ranking)") + 1, ranking_population_column, np.nan)

pre_data[ranking_population_column] = ranking_population_list

In [None]:
pre_data.sample(5)

In [None]:
pre_data[[
    "Rating",
    "sqrt(Ranking)",
    "sqrt(Ranking)_x_log10(City Population)",
    "Cuisine Number",
    "Price Range",
    "Number of Reviews",
    "City Is a Capital",
    "log10(City Population)",
    "Last Review Year",
    "Last Review Month",
    "Last Review Is Weekend"
]].corr()

## 9. Итоговый набор данных

### 9.1 Датасет

In [None]:
# Final modelling frame: the "sample" marker, the "Rating" target, the
# engineered numeric features, and all city and cuisine dummy columns.
dataset = pre_data[[
    "sample",
    "Rating",
    "sqrt(Ranking)",
    "sqrt_Ranking_1",
    "sqrt_Ranking_2",
    "sqrt_Ranking_3",
    "sqrt(Ranking)_x_log10(City Population)",
    "Cuisine Number",
    "Price Range",
    "Number of Reviews",
    "City Is a Capital",
    "log10(City Population)",
    "Last Review Year",
    "Last Review Month",
    "Last Review Is Weekend"
] + CITY_COLUMN_NAMES + CUISINE_COLUMN_NAMES]

In [None]:
display_pd_html(dataset.sample(10))

In [None]:
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts` is its replacement (requires pandas >= 1.2).
dataset.info(verbose=True, show_counts=True)

### 9.2 Обучение и результаты

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Rows marked sample == 1 form the training part; the marker column itself
# is not a feature.
is_train_row = dataset["sample"] == 1
dataset_train = dataset[is_train_row].drop(columns=["sample"])

# Target vector and feature matrix.
y = dataset_train["Rating"].values
X = dataset_train.drop(columns=["Rating"])

RANDOM_SEED = 42

In [None]:
dataset_train.shape, X.shape, y.shape,

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [None]:
dataset_train.shape, X.shape, y.shape, X_train.shape, X_test.shape

In [None]:
# Random forest of 100 trees using all CPU cores (n_jobs=-1); the fixed seed
# keeps the run reproducible.
model = RandomForestRegressor(n_estimators=100, verbose=1, n_jobs=-1, random_state=RANDOM_SEED)

# Fit on the training split only; the hold-out split is used for evaluation.
model.fit(X_train, y_train)

y_predicted = model.predict(X_test)

In [None]:
print("Mean Absolute Error (MAE):", metrics.mean_absolute_error(y_test, y_predicted))

In [None]:
plt.rcParams["figure.figsize"] = (10, 5)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(20).plot(kind="barh")

## 10. Выводы
Можно сделать вывод о том, что следующие признаки оказывают наибольшее влияние на построение модели рейтинга европейского ресторана методом RandomForestRegressor:<br />
1. `sqrt(Ranking)_x_log10(City Population)` - Произведение квадратного корня Ranking и десятичного логарифма населения города.
2. `sqrt(Ranking)` - Квадратный корень Ranking (местный ранг ресторана). Возможно его стоит исключить из модели, поскольку он сильно коррелирует с `sqrt(Ranking)_x_log10(City Population)`, но на практике это исключение не уменьшает MAE.
3. `log10(City Population)` - Десятичный логарифм населения города. Показывает каков порядок числа людей в городе.
4. `Number of Reviews` - Число обзоров, которое оставили люди про каждый ресторан. Возможно, web-scraping этих данных даст дополнительное улучшение модели в будущем, вместо замены пропусков на среднее значение.

Необходим более полный web-scraping данных о ресторанах и дальнейший анализ распределений признаков.
На настоящий момент парсинг страниц ресторанов сделан только для заполнения пропусков по типам кухонь ресторана.
Возможно, стоит в дальнейшем ввести страны/регионы для городов и более крупные группы для различных типов кухонь, чтобы уменьшить количество переменных и улучшить категоризацию различных ресторанов: некоторые типы кухонь похожи и коррелируют между собой,
например "Italian"-"Pizza", "Japanese"-"Sushi" и т.д.

## 11. Submission

In [None]:
# Rows marked sample == 0 are the submission part; both the marker and the
# target column are removed before prediction.
is_submission_row = dataset["sample"] == 0
test_data = dataset[is_submission_row].drop(columns=["sample", "Rating"])

In [None]:
display_pd_html(test_data.sample(10))

In [None]:
test_data.shape

In [None]:
predict_submission = model.predict(test_data)

In [None]:
sample_submission["Rating"] = predict_submission
sample_submission.to_csv("submission.csv", index=False)
sample_submission.head()