Практическое задание:
- взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
- сделать обзорный анализ выбранного датасета
- сделать feature engineering
- обучить любой классификатор (какой вам нравится)
- разобраться с SHAP и построить важности признаков для:
    - всего тестового набора данных (summary_plot - дать интерпретацию)
    - для топ 10%
- для отдельных наблюдений вывести force_plot и попытаться проинтерпретировать результат


In [95]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import recall_score 
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import r2_score

import itertools
import xgboost


import matplotlib.pyplot as plt

%matplotlib inline

In [96]:
df = pd.read_csv("Shill Bidding Dataset.csv")
df.head(3)

Unnamed: 0,Record_ID,Auction_ID,Bidder_ID,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration,Class
0,1,732,_***i,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,5,0
1,2,732,g***r,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,5,0
2,3,732,t***p,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,5,0


In [97]:
#соберем наш простой pipeline, но нам понадобится написать класс для выбора нужного поля
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [112]:
target = ['Class']
features = df.columns.drop(target)


#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], random_state=0)

In [133]:
# categorical_columns = [n for n in df.columns if df[n].dtype.name == 'object']
numerical_columns = [n for n in df.columns if df[n].dtype.name != 'object']
numerical_columns.remove('Class')
continuous_columns = numerical_columns

final_transformers = list()
    
for num_col in numerical_columns:
    num_transformer = Pipeline([
                ('selector', NumberSelector(num_col)),
                ('scaler', StandardScaler())
            ])
    final_transformers.append((num_col, num_transformer))
    
# for cat_col in categorical_columns:
#     cat_transformer = Pipeline([
#                 ('selector', NumberSelector(key=cat_col)),
#                 ('scaler', OneHotEncoder())
#             ])
#     final_transformers.append((cat_col, cat_transformer))
    
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

continuous_columns

['Record_ID',
 'Auction_ID',
 'Bidder_Tendency',
 'Bidding_Ratio',
 'Successive_Outbidding',
 'Last_Bidding',
 'Auction_Bids',
 'Starting_Price_Average',
 'Early_Bidding',
 'Winning_Ratio',
 'Auction_Duration']

In [134]:
regressor = Pipeline([
    ('features',feats),
    ('classifier', LinearRegression()),
])

In [135]:
y_train.head()

Unnamed: 0,Class
3753,0
1003,0
463,0
541,0
4507,0


In [136]:
regressor.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Record_ID',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='Record_ID')),
                                                                 ('scaler',
                                                                  StandardScaler())])),
                                                ('Auction_ID',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='Auction_ID')),
                                                                 ('scaler',
                                                                  StandardScaler())])),
                                                ('Bidder_Tendency',
                                                 Pipeline(steps=[('selector',
               

In [142]:
predictions =  regressor.predict(X_test)
 
mean_absolute_error(y_pred=predictions, y_true=y_test), r2_score(y_pred=predictions, y_true=y_test)

(0.04625806111417499, 0.8090937756716468)

In [149]:
intercept = regressor[1].intercept_
features = pd.DataFrame(continuous_columns, 
                        regressor[1].coef_.reshape(),
                        columns=['coefficient'])
features.head()

ValueError: Shape of passed values is (11, 1), indices imply (1, 1)

In [148]:
continuous_columns

['Record_ID',
 'Auction_ID',
 'Bidder_Tendency',
 'Bidding_Ratio',
 'Successive_Outbidding',
 'Last_Bidding',
 'Auction_Bids',
 'Starting_Price_Average',
 'Early_Bidding',
 'Winning_Ratio',
 'Auction_Duration']