In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

Мы будем работать с данными агрегатора такси [Sigma Cabs](https://www.kaggle.com/datasets/arashnic/taxi-pricing-with-mobility-analytics). В зависимости от характеристик поездки требуется предсказать один из трех типов повышенного ценообразования: [1, 2, 3]. Таким образом, это поможет компании оптимально мэтчить такси и клиентов. 

In [13]:
# Read the trips table from a CSV file in the working directory and display its (rows, columns) shape.
df = pd.read_csv('sigma_cabs.csv')
df.shape

(131662, 14)

In [14]:
# Use the Trip_ID column as the row index
df = df.set_index('Trip_ID')
df.head()

Unnamed: 0_level_0,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
Trip_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
T0005689460,6.77,B,1.0,2.42769,A,A,3.905,0,40.0,46,60,Female,2
T0005689461,29.47,B,10.0,2.78245,B,A,3.45,0,38.0,56,78,Male,2
T0005689464,41.58,,10.0,,,E,3.50125,2,,56,77,Male,2
T0005689465,61.56,C,10.0,,,A,3.45375,0,,52,74,Male,3
T0005689467,54.95,C,10.0,3.03453,B,A,3.4025,4,51.0,49,102,Male,2


Описание признаков:

1. **Trip_ID**: ID for TRIP
2. **Trip_Distance**: The distance for the trip requested by the customer
3. **Type_of_Cab**: Category of the cab requested by the customer
4. **Customer_Since_Months**: Customer using cab services since n months; 0 month means current month
5. **Life_Style_Index**: Proprietary index created by Sigma Cabs showing lifestyle of the customer based on their behaviour
6. **Confidence_Life_Style_Index**: Category showing confidence on the index mentioned above
7. **Destination_Type**: Sigma Cabs divides any destination in one of the 14 categories.
8. **Customer_Rating**: Average of life time ratings of the customer till date
9. **Cancellation_Last_1Month**: Number of trips cancelled by the customer in last 1 month
10. **Var1**, **Var2** and **Var3**: Continuous variables masked by the company. Can be used for modelling purposes
11. **Gender**: Gender of the customer

**Surge_Pricing_Type**: Target (can be of 3 types)


### EDA 
Заполните пропуски в вещественных признаках медианой, а в категориальных - самым популярным классом. Изобразите матрицу корреляций и выведите топ-5 пар самых коррелированных признаков.

Так как в сумме уникальных значений различных категориальных признаков окажется не супер-много, примените `One-Hot-Encoding` для них. Не забудьте в методе `pd.get_dummies` указать параметр `drop_first=True`.

In [15]:
### Your code is here

# Split the columns by dtype. `select_dtypes` replaces the comparison against
# the `np.object` alias, which was deprecated in NumPy 1.20 and removed in
# NumPy 1.24 (where it raises AttributeError).
numeric_columns = df.select_dtypes(include='number').columns
categorical_columns = df.select_dtypes(exclude='number').columns

# Numeric gaps are filled with the column median.
for col in numeric_columns:
    df[col] = df[col].fillna(df[col].median())

# Categorical gaps are filled with the most frequent category (the mode).
for col in categorical_columns:
    most_frequent = df[col].mode().iloc[0]
    df[col] = df[col].fillna(most_frequent)
    

def get_redundant_pairs(df):
    """Return the set of column-label pairs on or below the diagonal.

    For columns c0, c1, ... the result holds every (ci, cj) with j <= i,
    i.e. the self-pairs and one ordering of each pair of distinct columns.
    """
    names = df.columns
    return {
        (names[row], names[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }

def get_top_abs_correlations(df, n=5):
    """Return the `n` largest absolute pairwise correlations of `df`'s columns.

    The absolute correlation matrix is flattened to a Series indexed by
    (column, column) pairs; the pairs reported by `get_redundant_pairs`
    (self-pairs and mirrored duplicates) are dropped before sorting in
    descending order.
    """
    abs_corr = df.corr().abs().unstack()
    duplicates = get_redundant_pairs(df)
    ranked = abs_corr.drop(labels=duplicates).sort_values(ascending=False)
    return ranked[:n]

# Draw the correlation matrix of the numeric columns as an annotated heatmap;
# the exercise asks for the matrix itself in addition to the ranked pairs.
corr_matrix = df[numeric_columns].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm",
            vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation matrix of numeric features")
plt.show()

print("Top Absolute Correlations")
print(get_top_abs_correlations(df[numeric_columns], 10))

Top Absolute Correlations
Var2                      Var3                  0.683437
Trip_Distance             Life_Style_Index      0.468332
Life_Style_Index          Var3                  0.303324
Customer_Rating           Var2                  0.302968
Trip_Distance             Var3                  0.231706
Customer_Rating           Var3                  0.227531
Life_Style_Index          Var2                  0.215944
Trip_Distance             Var2                  0.200456
Life_Style_Index          Customer_Rating       0.189165
Cancellation_Last_1Month  Surge_Pricing_Type    0.185646
dtype: float64


In [16]:
# One-hot encode every categorical column in a single call. The encoded
# columns are appended after the remaining ones, prefixed with the source
# column name, and the first level of each feature is dropped.
df = pd.get_dummies(df, columns=list(categorical_columns), drop_first=True)

df.shape

(131662, 29)

In [20]:
# List the column names after one-hot encoding.
df.columns

Index(['Trip_Distance', 'Customer_Since_Months', 'Life_Style_Index',
       'Customer_Rating', 'Cancellation_Last_1Month', 'Var1', 'Var2', 'Var3',
       'Surge_Pricing_Type', 'Type_of_Cab_B', 'Type_of_Cab_C', 'Type_of_Cab_D',
       'Type_of_Cab_E', 'Confidence_Life_Style_Index_B',
       'Confidence_Life_Style_Index_C', 'Destination_Type_B',
       'Destination_Type_C', 'Destination_Type_D', 'Destination_Type_E',
       'Destination_Type_F', 'Destination_Type_G', 'Destination_Type_H',
       'Destination_Type_I', 'Destination_Type_J', 'Destination_Type_K',
       'Destination_Type_L', 'Destination_Type_M', 'Destination_Type_N',
       'Gender_Male'],
      dtype='object')

### Training

In [17]:
# Seed NumPy's global random generator so repeated runs start from the same state.
np.random.seed(2022)

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [22]:
# Separate the target from the features, then hold out 20% of the rows for testing.
y = df['Surge_Pricing_Type']
X = df.drop(columns='Surge_Pricing_Type')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=2022
)

**Задание 1.** Обучите One-vs-Rest Logreg. Не забудьте добавить в пайплайн шаг стандартизации данных (через `StandardScaler`). Посчитайте precision, recall, f1-score и усредните по всем классам с помощью micro, macro и weighted avg. Здесь и далее округляйте до 3 знака после запятой.

Чтобы отдельно и долго не вычислять метрики, можно воспользоваться `classification_report` из `sklearn.metrics`!

In [32]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

#SGDClassifier(loss='log') ### max(0; 1-M)

# One-vs-Rest logistic regression on standardized features.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("one_vs_all", OneVsRestClassifier(LogisticRegression())),
])
pipe.fit(X_train, y_train)

from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

# Per-class precision / recall / F1 and their averages on the hold-out set.
ovr_test_pred = pipe.predict(X_test)
print(classification_report(y_test, ovr_test_pred, digits=3))

              precision    recall  f1-score   support

           1      0.723     0.542     0.619      5372
           2      0.636     0.834     0.722     11349
           3      0.741     0.571     0.645      9612

    accuracy                          0.679     26333
   macro avg      0.700     0.649     0.662     26333
weighted avg      0.692     0.679     0.673     26333



Подберите оптимальные гиперпараметры модели с помощью `GridSearchCV()` из предложенных. Для лучшего набора гиперпараметров посчитайте те же самые метрики. Валидировать параметры необходимо по `accuracy`. В этот раз проведем настоящую процедуру Кросс-Валидации! 

Для этого в метод `fit` передадим тренировочную часть наших данных, в параметр `cv` ничего не будем передавать (по дефолту 5-fold Кросс-Валидация будет проведена), а итоговые метрики замерим на тесте!

In [33]:
# Hyper-parameter grid for the LogisticRegression inside the One-vs-Rest step.
#
# The grid is split into sub-grids because the penalties need different
# solvers: LogisticRegression's default solver handles only the 'l2' penalty,
# so in a single flat grid the 'l1' and 'elasticnet' candidates fail to fit
# and are scored NaN (the warning about it is hidden by the global warning
# filter). 'saga' supports all three penalties; 'elasticnet' additionally
# requires an explicit `l1_ratio`. The 'l2' sub-grid keeps the default solver,
# so those candidates are evaluated exactly as before.
_C_VALUES = [0.001, 0.01, 0.1, 1]
param_grid = [
    {'one_vs_all__estimator__penalty': ['l2'],
     'one_vs_all__estimator__C': _C_VALUES},
    {'one_vs_all__estimator__penalty': ['l1'],
     'one_vs_all__estimator__solver': ['saga'],
     'one_vs_all__estimator__C': _C_VALUES},
    {'one_vs_all__estimator__penalty': ['elasticnet'],
     'one_vs_all__estimator__solver': ['saga'],
     'one_vs_all__estimator__l1_ratio': [0.5],
     'one_vs_all__estimator__C': _C_VALUES},
]

In [36]:
### Your code is here
from sklearn.model_selection import GridSearchCV

# Cross-validated search over `param_grid` on the training part only.
# The selection metric is stated explicitly: the task requires accuracy, and
# this matches the One-vs-One search later in the notebook.
search = GridSearchCV(pipe, param_grid, scoring='accuracy')

search.fit(X_train, y_train)

print(f"Best parameter (CV score={search.best_score_:.5f}):")
print(search.best_params_)

Best parameter (CV score=0.68062):
{'one_vs_all__estimator__C': 0.001, 'one_vs_all__estimator__penalty': 'l2'}


Изобразите три калибровочные кривые для Logistic Classifier: 0-vs-rest, 1-vs-rest, 2-vs-rest. Хорошо ли откалиброван обученный классификатор? 

Заметьте, что `predict_proba` возвращает список из вероятностей для всех наших классов!

In [37]:
### Your code is here
# Take the winning hyper-parameters directly from the fitted search instead
# of copying them by hand, so this cell stays correct if the search result
# changes on a re-run.
pipe.set_params(**search.best_params_)


Pipeline(steps=[('scaler', StandardScaler()),
                ('one_vs_all',
                 OneVsRestClassifier(estimator=LogisticRegression(C=0.001)))])

In [40]:
from sklearn.model_selection import cross_validate

# Cross-validated accuracy (train and validation folds) of the pipeline.
cv_result_pipe = cross_validate(
    pipe, X_train, y_train,
    scoring='accuracy',
    return_train_score=True,
)

# Hold-out metrics of the model selected by the grid search.
print(classification_report(y_test, search.predict(X_test), digits=3))

              precision    recall  f1-score   support

           1      0.742     0.534     0.621      5372
           2      0.635     0.839     0.723     11349
           3      0.742     0.576     0.649      9612

    accuracy                          0.681     26333
   macro avg      0.706     0.650     0.664     26333
weighted avg      0.696     0.681     0.675     26333



**Задание 2.** Обучите логистическую регрессию с гиперпараметрами из первого задания на полиномиальных признаках до 4 степени. Сравните метрики с первым заданием.


Пример: Пусть у нас был единственный признак 

$$
d_j = [1, 2, 3, 4]
$$

Тогда полиномиальные признаки до 4 степени от такого будут иметь вид:

$$
d_j^1 = [1, 2, 3, 4]
$$

$$
d_j^2 = [1, 4, 9, 16]
$$

$$
d_j^3 = [1, 8, 27, 64]
$$

$$
d_j^4 = [1, 16, 81, 256]
$$

P.S. Бинарные колонки нет смысла возводить в какие-то степени, поэтому возьмем исключительно вещественные из базовых. 

Для этого можно воспользоваться классическим циклом (или уроком из занятия про `Sberbank Housing Market`). Положите модифицированный датасет в переменную `X_polinomial`!

P.P.S. Зачастую, создавая полиномиальные фичи, еще учитывают "пересечения" признаков, то есть, например, из векторов признаков $d_j, d_i$ генерируют не просто новые степени $d_j^2, d_i^2, d_j^3, d_i^3...$, а еще и признаки вида $d_j \cdot d_i, d_j^2 \cdot d_i, d_j \cdot d_i^2...$, но здесь ограничьтесь просто степенями!

In [43]:
# Summary statistics of every feature column.
X.describe()

Unnamed: 0,Trip_Distance,Customer_Since_Months,Life_Style_Index,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Type_of_Cab_B,Type_of_Cab_C,...,Destination_Type_F,Destination_Type_G,Destination_Type_H,Destination_Type_I,Destination_Type_J,Destination_Type_K,Destination_Type_L,Destination_Type_M,Destination_Type_N,Gender_Male
count,131662.0,131662.0,131662.0,131662.0,131662.0,131662.0,131662.0,131662.0,131662.0,131662.0,...,131662.0,131662.0,131662.0,131662.0,131662.0,131662.0,131662.0,131662.0,131662.0,131662.0
mean,44.200909,6.015912,2.801448,2.849458,0.782838,62.474883,51.2028,75.099019,0.389983,0.213592,...,0.014811,0.011309,0.00957,0.006175,0.005279,0.005165,0.004884,0.000706,0.000744,0.71319
std,25.522882,3.544411,0.207765,0.980675,1.037559,14.893324,4.986142,11.578278,0.487748,0.409844,...,0.120795,0.105742,0.097357,0.078338,0.072463,0.071681,0.069713,0.026568,0.027272,0.452274
min,0.31,0.0,1.59638,0.00125,0.0,30.0,40.0,52.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,24.58,3.0,2.687952,2.1525,0.0,61.0,48.0,67.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,38.2,6.0,2.79805,2.895,0.0,61.0,50.0,74.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,60.73,10.0,2.912815,3.5825,1.0,61.0,54.0,82.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,109.23,10.0,4.87511,5.0,8.0,210.0,124.0,206.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [62]:
# Keep the columns with more than two distinct values; these are the ones
# that will be raised to powers.
spisok_col = [col for col in X.columns if X[col].nunique() > 2]
spisok_col

['Trip_Distance',
 'Customer_Since_Months',
 'Life_Style_Index',
 'Customer_Rating',
 'Cancellation_Last_1Month',
 'Var1',
 'Var2',
 'Var3']

In [70]:
# Shape of the base feature matrix (before polynomial features are added).
X.shape

(131662, 28)

In [71]:
# NOTE(review): `X_polinomial` is first assigned in a cell that appears further
# down this notebook, so on a fresh top-to-bottom run this line raises
# NameError. Move this check below the cell that builds the polynomial features.
X_polinomial.shape

(131662, 52)

In [66]:
### Build polynomial features

X_polinomial = X.copy()

# Powers 2, 3 and 4 of every selected column, named "<column>_<power>" and
# appended after the original columns in one concatenation.
powered_columns = [
    (X_polinomial[col] ** power).rename(f"{col}_{power}")
    for col in spisok_col
    for power in [2, 3, 4]
]
X_polinomial = pd.concat([X_polinomial, *powered_columns], axis=1)

In [67]:
# Same 80/20 split settings and seed as for the base features, applied to the
# polynomial feature matrix.
X_pol_train, X_pol_test, y_train, y_test = train_test_split(
    X_polinomial, y, test_size=0.2, shuffle=True, random_state=2022
)

In [76]:
### Your code is here

from sklearn.model_selection import cross_validate

# One-vs-Rest logistic regression (C=0.001, L2 penalty) on the standardized
# polynomial features.
pipe = Pipeline(steps=[
    ("scaler2", StandardScaler()),
    ("one_vs_all", OneVsRestClassifier(LogisticRegression(C=0.001, penalty='l2'))),
])
pipe.fit(X_pol_train, y_train)

pol_test_pred = pipe.predict(X_pol_test)
print(classification_report(y_test, pol_test_pred, digits=3))

              precision    recall  f1-score   support

           1      0.748     0.532     0.622      5372
           2      0.636     0.837     0.723     11349
           3      0.741     0.584     0.653      9612

    accuracy                          0.682     26333
   macro avg      0.708     0.651     0.666     26333
weighted avg      0.697     0.682     0.677     26333



По аналогии с первым заданием изобразите три калибровочные кривые. Стало ли лучше?

In [11]:
### Your code is here

### Your code is here
# `CalibrationDisplay` is not imported anywhere else in the notebook.
from sklearn.calibration import CalibrationDisplay

# Class probabilities of the polynomial-feature pipeline on its own test set.
# All three curves use `pipe`: the grid-search object was fitted on the base
# features and cannot score the wider polynomial matrix.
proba_pol_test = pipe.predict_proba(X_pol_test)

# One calibration curve per class, "this class vs the rest". The target is
# encoded as 0/1 so the positive label is unambiguous (the raw labels 2 and 3
# are not valid positive labels by default).
for class_idx, class_label in enumerate(pipe.classes_):
    is_class = (y_test == class_label).astype(int)
    display = CalibrationDisplay.from_predictions(
        is_class,
        proba_pol_test[:, class_idx],
        n_bins=15,
        name=f"class {class_label} vs rest",
    )
    display.ax_.set_title(f"Calibration curve: class {class_label} vs rest")

plt.show()

**Задание 3.** Обучите на датасете без полиномиальных признаков One-vs-One `SGDClassifier` из `sklearn.linear_model`, который использует стохастический градиентный спуск (узнаете о нем позже) и может обучать как `SVM`, так и, например, `LogReg`, если указать в качестве параметра `loss` либо `hinge`, либо `log` соответственно!

Посчитайте precision, recall, f1-score и усредните по всем классам с помощью micro, macro и weighted avg.

In [77]:
# Re-create the 80/20 split on the base (non-polynomial) features with the
# same seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=2022
)

In [81]:
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsOneClassifier


### Your code is here


# One-vs-One SGD classifier (default settings) on standardized features.
pipe_all_all = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("one_vs_one", OneVsOneClassifier(SGDClassifier())),
])
pipe_all_all.fit(X_train, y_train)

from sklearn.metrics import classification_report

ovo_test_pred = pipe_all_all.predict(X_test)
print(classification_report(y_test, ovo_test_pred, labels=[1, 2, 3], digits=3))


              precision    recall  f1-score   support

           1      0.737     0.524     0.612      5372
           2      0.625     0.870     0.728     11349
           3      0.758     0.531     0.625      9612

    accuracy                          0.676     26333
   macro avg      0.707     0.642     0.655     26333
weighted avg      0.697     0.676     0.667     26333



Подберите оптимальные гиперпараметры модели с помощью `GridSearchCV()`. При этом переберите всевозможные функции потерь. Таким образом, при `loss = 'hinge'`, мы обучим SVM, при `loss = 'log'` мы обучим логистическую регрессию и т.д.

Используйте прием с Кросс-Валидацией при подборе параметров, как ранее, а также замерьте метрики на тесте.

In [82]:
# Search space for the SGD estimator wrapped by the One-vs-One step:
# loss function, regularization type and regularization strength.
# NOTE(review): newer scikit-learn releases appear to spell the logistic loss
# 'log_loss' instead of 'log' - confirm against the installed version.
param_grid = {
    'one_vs_one__estimator__loss': ['hinge', 'log', 'modified_huber'],
    'one_vs_one__estimator__penalty': ['l1', 'l2'],
    'one_vs_one__estimator__alpha': [0.001, 0.01, 0.1],
}

In [83]:
### Your code is here

from sklearn.model_selection import GridSearchCV

# Cross-validated search over the SGD grid, selecting by accuracy.
search = GridSearchCV(
    estimator=pipe_all_all,
    param_grid=param_grid,
    scoring='accuracy',
)
search.fit(X_train, y_train)

print(f"Best parameter (CV score={search.best_score_:.5f}):")
print(search.best_params_)

Best parameter (CV score=0.68120):
{'one_vs_one__estimator__alpha': 0.1, 'one_vs_one__estimator__loss': 'modified_huber', 'one_vs_one__estimator__penalty': 'l2'}


In [85]:
# Take the winning hyper-parameters directly from the fitted search instead
# of copying them by hand, so this cell stays correct if the search result
# changes on a re-run.
pipe_all_all.set_params(**search.best_params_)

Pipeline(steps=[('scaler', StandardScaler()),
                ('one_vs_one',
                 OneVsOneClassifier(estimator=SGDClassifier(alpha=0.1,
                                                            loss='modified_huber')))])

In [86]:
from sklearn.model_selection import cross_validate

# Cross-validated accuracy (train and validation folds) of the One-vs-One pipeline.
cv_result_pipe = cross_validate(
    pipe_all_all, X_train, y_train,
    scoring='accuracy',
    return_train_score=True,
)

# Hold-out metrics of the model selected by the grid search.
print(classification_report(y_test, search.predict(X_test), digits=3))

              precision    recall  f1-score   support

           1      0.747     0.530     0.620      5372
           2      0.631     0.846     0.723     11349
           3      0.745     0.567     0.644      9612

    accuracy                          0.680     26333
   macro avg      0.708     0.648     0.662     26333
weighted avg      0.697     0.680     0.673     26333



Можно ли однозначно сказать, какой подход оказался лучше: One-vs-Rest или One-vs-One?