<a href="https://colab.research.google.com/github/psaw/hse-ai24-ml/blob/main/Chocolate_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline-решение

## Импорт библиотек, загрузка данных

In [1]:
import pandas as pd
import numpy as np


In [2]:
TRAIN = "https://raw.githubusercontent.com/evgpat/edu_stepik_practical_ml/main/datasets/chocolate_train.csv"
TEST = "https://raw.githubusercontent.com/evgpat/edu_stepik_practical_ml/main/datasets/chocolate_test_new.csv"

In [3]:
train_df = pd.read_csv(TRAIN)

## Обзор данных, подготовка к обучению

In [4]:
train_df.head()

Unnamed: 0,Company,Specific Bean Origin,REF,Review,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,Willie's Cacao,Rio Caribe,457,2009,72%,U.K.,3.25,Trinitario,Venezuela
1,Beschle (Felchlin),"Ocumare, Premier Cru, Quizas No. 2",508,2010,72%,Switzerland,3.5,,Venezuela
2,Dark Forest,Tanzania,1554,2015,70%,U.S.A.,3.0,,Tanzania
3,Brasstown aka It's Chocolate,Cooproagro,1125,2013,72%,U.S.A.,3.0,Trinitario,Dominican Republic
4,Pralus,"Java, Indonesie",32,2006,75%,France,3.5,Criollo,Indonesia


Удаляем все строки с пропусками

In [5]:
train_df.dropna(axis=0, inplace=True)

Выделяем матрицу объект-признак и вектор с целевой переменной

In [6]:
X_train = train_df.drop('Rating', axis=1)
y_train = train_df['Rating']

## Преобразование данных

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1253 entries, 0 to 1254
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Company               1253 non-null   object
 1   Specific Bean Origin  1253 non-null   object
 2   REF                   1253 non-null   int64 
 3   Review                1253 non-null   int64 
 4   Cocoa Percent         1253 non-null   object
 5   Company Location      1253 non-null   object
 6   Bean Type             1253 non-null   object
 7   Broad Bean Origin     1253 non-null   object
dtypes: int64(2), object(6)
memory usage: 88.1+ KB


In [8]:
def preprocess_cocoa_percent(df, drop_original=False):
    df['Cocoa_Percent_Numeric'] = df['Cocoa Percent'].str.rstrip('%').astype(float) / 100.0
    if drop_original:
        df.drop('Cocoa Percent', axis=1, inplace=True)
    return df


def preprocess_bean_type(df, drop_original=False):
    df['Bean_Type_cat'] = df['Bean Type'].str.split(r'[ ,(]', expand=True)[0].replace('\xa0', 'Unknown')
    if drop_original:
        df.drop('Bean Type', axis=1, inplace=True)
    return df


def preprocess_bean_origin(df, drop_original=False):
    df['Bean_Origin_cat'] = df['Broad Bean Origin'].str.split(r'[,]', expand=True)[0].replace('\xa0', 'Unknown')
    if drop_original:
        df.drop('Broad Bean Origin', axis=1, inplace=True)
    return df


def preprocess_specific_bean_origin(df, drop_original=False):
    df['Specific_Bean_Origin_cat'] = df['Specific Bean Origin'].str.split(r'[,]', expand=True)[0].replace('\xa0', 'Unknown')
    if drop_original:
        df.drop('Specific Bean Origin', axis=1, inplace=True)
    return df


def drop_unused_columns(df):
    df.drop(['REF'], axis=1, inplace=True)
    return df


In [9]:
preprocess_cocoa_percent(X_train, drop_original=True)
preprocess_bean_type(X_train, drop_original=True)
preprocess_bean_origin(X_train, drop_original=True)
preprocess_specific_bean_origin(X_train, drop_original=True)
drop_unused_columns(X_train)

Unnamed: 0,Company,Review,Company Location,Cocoa_Percent_Numeric,Bean_Type_cat,Bean_Origin_cat,Specific_Bean_Origin_cat
0,Willie's Cacao,2009,U.K.,0.72,Trinitario,Venezuela,Rio Caribe
1,Beschle (Felchlin),2010,Switzerland,0.72,Unknown,Venezuela,Ocumare
2,Dark Forest,2015,U.S.A.,0.70,Unknown,Tanzania,Tanzania
3,Brasstown aka It's Chocolate,2013,U.S.A.,0.72,Trinitario,Dominican Republic,Cooproagro
4,Pralus,2006,France,0.75,Criollo,Indonesia,Java
...,...,...,...,...,...,...,...
1250,Artisan du Chocolat,2009,U.K.,0.80,Criollo,Madagascar,Madagascar
1251,Marana,2016,Peru,0.70,Unknown,Peru,Cusco
1252,Arete,2015,U.S.A.,0.68,Forastero,Peru,Nacional
1253,Fresco,2011,U.S.A.,0.72,Unknown,Dominican Republic,Conacado


In [10]:
X_train.head()


Unnamed: 0,Company,Review,Company Location,Cocoa_Percent_Numeric,Bean_Type_cat,Bean_Origin_cat,Specific_Bean_Origin_cat
0,Willie's Cacao,2009,U.K.,0.72,Trinitario,Venezuela,Rio Caribe
1,Beschle (Felchlin),2010,Switzerland,0.72,Unknown,Venezuela,Ocumare
2,Dark Forest,2015,U.S.A.,0.7,Unknown,Tanzania,Tanzania
3,Brasstown aka It's Chocolate,2013,U.S.A.,0.72,Trinitario,Dominican Republic,Cooproagro
4,Pralus,2006,France,0.75,Criollo,Indonesia,Java


## Обучение модели

Устанавливаем катбуст и обучаем его с гиперпараметрами по умолчанию на всех данных

In [11]:
!pip install catboost -q

In [12]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1253 entries, 0 to 1254
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Company                   1253 non-null   object 
 1   Review                    1253 non-null   int64  
 2   Company Location          1253 non-null   object 
 3   Cocoa_Percent_Numeric     1253 non-null   float64
 4   Bean_Type_cat             1253 non-null   object 
 5   Bean_Origin_cat           1253 non-null   object 
 6   Specific_Bean_Origin_cat  1253 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 78.3+ KB


In [13]:
cat_features = [0, 2, 4, 5, 6]

In [14]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(cat_features=cat_features)

In [15]:
model.fit(X_train, y_train)

Learning rate set to 0.042428
0:	learn: 0.4758827	total: 57.2ms	remaining: 57.2s
1:	learn: 0.4733871	total: 59.5ms	remaining: 29.7s
2:	learn: 0.4711737	total: 61.6ms	remaining: 20.5s
3:	learn: 0.4690777	total: 64.9ms	remaining: 16.2s
4:	learn: 0.4672465	total: 66.5ms	remaining: 13.2s
5:	learn: 0.4652624	total: 68ms	remaining: 11.3s
6:	learn: 0.4638599	total: 69.8ms	remaining: 9.89s
7:	learn: 0.4621881	total: 72.1ms	remaining: 8.94s
8:	learn: 0.4604140	total: 74ms	remaining: 8.14s
9:	learn: 0.4584217	total: 76ms	remaining: 7.52s
10:	learn: 0.4572265	total: 76.7ms	remaining: 6.89s
11:	learn: 0.4551530	total: 78ms	remaining: 6.42s
12:	learn: 0.4536983	total: 80.1ms	remaining: 6.08s
13:	learn: 0.4523029	total: 81.3ms	remaining: 5.73s
14:	learn: 0.4508743	total: 82.4ms	remaining: 5.41s
15:	learn: 0.4496384	total: 83.8ms	remaining: 5.15s
16:	learn: 0.4485255	total: 85.3ms	remaining: 4.93s
17:	learn: 0.4467099	total: 86.3ms	remaining: 4.71s
18:	learn: 0.4455784	total: 87.4ms	remaining: 4.51s


<catboost.core.CatBoostRegressor at 0x1449a7ad0>

In [16]:
model.score(X_train, y_train)

0.5341076465447727

In [None]:
# TODO: что можно сделать для улучшения модели:
# 1. Масштабирование признаков
# 2. Заполнение пропусков 
# 3. Удаление выбросов
# 4. Подбор гиперпараметров

## Предсказание на тестовых данных

Загружаем тестовые данные

In [23]:
test_df = pd.read_csv(TEST)

In [24]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Company               540 non-null    object
 1   Specific Bean Origin  540 non-null    object
 2   REF                   540 non-null    int64 
 3   Review                540 non-null    int64 
 4   Cocoa Percent         540 non-null    object
 5   Company Location      540 non-null    object
 6   Bean Type             540 non-null    object
 7   Broad Bean Origin     540 non-null    object
dtypes: int64(2), object(6)
memory usage: 33.9+ KB


In [25]:
preprocess_cocoa_percent(test_df, drop_original=True)
preprocess_bean_type(test_df, drop_original=True)
preprocess_bean_origin(test_df, drop_original=True)
preprocess_specific_bean_origin(test_df, drop_original=True)
drop_unused_columns(test_df)
test_df.head()

Unnamed: 0,Company,Review,Company Location,Cocoa_Percent_Numeric,Bean_Type_cat,Bean_Origin_cat,Specific_Bean_Origin_cat
0,Ohiyo,2015,U.S.A.,0.7,Trinitario,Trinidad,San Juan Estate
1,Blanxart,2009,Spain,0.72,Unknown,Unknown,Organic Dark
2,Brazen,2015,U.S.A.,0.8,Trinitario,Belize,Maya Mountain
3,Patric,2009,U.S.A.,0.67,Trinitario,Madagascar,Madagascar
4,Potomac,2010,U.S.A.,0.82,Matina,Costa Rica,Upala


Делаем предсказание

In [26]:
pred = model.predict(test_df)

Сохраняем предсказание в файл в нужном формате

In [27]:
test_df['id'] = np.arange(len(test_df))
test_df['Rating'] = pred

test_df[['id','Rating']].to_csv("submission_2.csv", index=False)