## Загружаем либы

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing

## Импортируем csv

In [21]:
data = pd.read_csv('train_dataset.csv', sep=';')

## Предобрабатываем данные

выкидываем повторяющиеся колонки

In [22]:
data.drop(columns=['id', 'ticket_id', 'entrance_id', 'station_id', 'line_id'], inplace=True)

In [23]:
def sin_transformer(period):
	return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))

def cos_transformer(period):
	return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

переводим pass_dttm к datetime

In [24]:
data['pass_dttm'] = pd.to_datetime(data['pass_dttm'])

так как у нас всего 8 дней, то можем эти дни заэнкодить

In [25]:
len(set(data['pass_dttm'].dt.day))

8

In [26]:
data = pd.concat([data, pd.DataFrame(data=pd.get_dummies(data['pass_dttm'].dt.day))], axis=1)

создаем новую фичу минуты, которую потом апроскимируем с помощью синусоиды и косинусоиды, делаем так потому что луюди посещают метро в течение дня в виде волнобразной линии с пиком днем

In [27]:
data['minutes'] = data['pass_dttm'].dt.hour * 60 + data['pass_dttm'].dt.minute + data['pass_dttm'].dt.day * 1440

In [28]:
data['minutes_sin'] = sin_transformer(1440).fit_transform(data['minutes'])
data['minutes_cos'] = cos_transformer(1440).fit_transform(data['minutes'])

In [29]:
data.drop(columns=['minutes', 'pass_dttm'], inplace=True)

делим на тест и трайн

In [30]:
X, y_time, y_label = data.drop(columns=['time_to_under', 'label']), data['time_to_under'], data['label']
X_train, X_test, y_time_train, y_time_test, y_label_train, y_label_test = train_test_split(X, y_time, y_label, test_size=0.25, random_state=0)

обучаем регрессор, catboost взят за регресорную модель потому что хорошо работает с категориальными фичами


In [31]:
grid = {'learning_rate': [0.03, 0.1],
        'depth': [4, 6],
        'l2_leaf_reg': [1, 3]}

In [32]:
cat_features = ['ticket_type_nm', 'entrance_nm', 'station_nm', 'line_nm']

In [33]:
X_train_pool, X_test_pool = Pool(data=X_train, label=y_time_train, cat_features=cat_features), Pool(data=X_test, cat_features=cat_features)

In [None]:
model = CatBoostRegressor(task_type="GPU",
                           devices='0:1',
                          verbose=0)

In [19]:
grid_search_result = model.grid_search(grid, 
                                       X=X_train_pool)

bestTest = 158.7350558
bestIteration = 999
0:	loss: 158.7350558	best: 158.7350558 (0)	total: 46.7s	remaining: 5m 26s
bestTest = 158.4401265
bestIteration = 999
1:	loss: 158.4401265	best: 158.4401265 (1)	total: 1m 31s	remaining: 4m 33s
bestTest = 158.7559957
bestIteration = 999
2:	loss: 158.7559957	best: 158.4401265 (1)	total: 2m 14s	remaining: 3m 44s
bestTest = 158.4076853
bestIteration = 996
3:	loss: 158.4076853	best: 158.4076853 (3)	total: 2m 58s	remaining: 2m 58s
bestTest = 158.4502214
bestIteration = 999
4:	loss: 158.4502214	best: 158.4076853 (3)	total: 4m 8s	remaining: 2m 28s
bestTest = 158.1961437
bestIteration = 999
5:	loss: 158.1961437	best: 158.1961437 (5)	total: 5m 6s	remaining: 1m 42s
bestTest = 158.4413902
bestIteration = 997
6:	loss: 158.4413902	best: 158.1961437 (5)	total: 6m 14s	remaining: 53.5s
bestTest = 158.2151033
bestIteration = 980
7:	loss: 158.2151033	best: 158.1961437 (5)	total: 7m 15s	remaining: 0us
Estimating final quality...
Training on fold [0/3]
bestTest = 1

In [35]:
grid_search_result['params']

{'depth': 6, 'l2_leaf_reg': 1, 'learning_rate': 0.1}

In [36]:
model = CatBoostRegressor(task_type="GPU",
                           devices='0:1',
                          verbose=0,
                          **grid_search_result['params'])

In [37]:
model.fit(X_train_pool)

<catboost.core.CatBoostRegressor at 0x7f64279dab50>

In [38]:
predict_y_time = model.predict(X_test_pool)

In [39]:
X_train_pool, X_test_pool = Pool(data=X_train, label=y_label_train, cat_features=cat_features), Pool(data=X_test, cat_features=cat_features)

обучаем классификатор

In [66]:
continius_X_train = X_train.copy()
continius_X_test = X_test.copy()

In [67]:
for x in cat_features:
  continius_X_train[x] = preprocessing.LabelEncoder().fit_transform(continius_X_train[x])
  continius_X_test[x] = preprocessing.LabelEncoder().fit_transform(continius_X_test[x])

выбран knn, потому что очень много лейблов, и довольно небольшая размерность, так что от проклятия размерности не страдаем

In [68]:
model = KNeighborsClassifier(n_neighbors=100)
model.fit(continius_X_train, y_label_train)
predict_y_label = model.predict(continius_X_test)



считаем скор

In [62]:
result = 0.5*r2_score(y_time_test, predict_y_time) + 0.5*recall_score(y_label_test, predict_y_label, average='micro')
print(result)

0.3961848320237662
