In [1]:
# import the required libraries, the train_test_split() function
# and the StandardScaler, OneHotEncoder,
# TransformerMixin, LogisticRegression and Pipeline classes
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [2]:
# read the semicolon-separated CSV file into a DataFrame
data = pd.read_csv('Data/StateFarm_missing.csv', sep=';')

In [3]:
# split the data into training and test parts (70% / 30%), stratified on
# the 'Response' label; the call returns, in order: training features,
# test features, training labels, test labels
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='Response'),
    data['Response'],
    test_size=0.3,
    stratify=data['Response'],
    random_state=42,
)

In [4]:
# preview the first rows of the training features
X_train.head()

Unnamed: 0,Customer Lifetime Value,Coverage,Education,EmploymentStatus,Gender,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies
4065,5470.060561,Extended,Bachelor,Employed,F,54507.0,138.0,20.0,11.0,0.0,1.0
1258,7611.212764,Premium,High School or Below,Unemployed,F,0.0,103.0,33.0,54.0,0.0,3.0
2963,8262.879764,Extended,College,Unemployed,F,0.0,76.0,4.0,51.0,1.0,2.0
3339,21480.81781,Basic,Master,Medical Leave,F,15505.0,68.0,24.0,2.0,0.0,2.0
6986,3300.607083,Extended,Bachelor,Employed,F,59592.0,84.0,8.0,49.0,0.0,1.0


In [5]:
class DFImputer(TransformerMixin):
    """Impute missing values in a pandas DataFrame and return a DataFrame.

    ``fit`` learns one fill value per column: the most frequent value for
    object-dtype columns and the median for every other column.
    ``transform`` replaces missing entries with those learned values.
    """

    def __init__(self):
        # pd.Series of fill values indexed by column name; populated by fit()
        self.fill = None

    def fit(self, X, y=None):
        """Learn the per-column fill values from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame the fill values are computed from.
        y : ignored
            Not used by this transformer.

        Returns
        -------
        DFImputer
            The fitted instance itself.
        """
        fill_values = []
        for col in X:
            if X[col].dtype == np.dtype('O'):
                counts = X[col].value_counts()
                # value_counts() skips NaN, so a column without any observed
                # value has no most-frequent entry; record NaN (fillna then
                # leaves that column untouched) instead of raising IndexError
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(X[col].median())
        self.fill = pd.Series(fill_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced by the learned fills.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to impute; columns are matched to fill values by name.
        y : ignored
            Not used by this transformer.

        Returns
        -------
        pandas.DataFrame
            New frame; ``X`` itself is not modified.
        """
        return X.fillna(self.fill)

In [6]:
class DFStandardScaler(TransformerMixin):
    """Standardize the non-object columns of a DataFrame and return a DataFrame.

    A ``StandardScaler`` is fitted on every column whose dtype is not
    ``object``; object columns are passed through unchanged. The split into
    the two column groups is decided once in ``fit`` and reused in
    ``transform``, so the scaler always receives the columns it was fitted
    on, in the order it was fitted on.

    Attributes
    ----------
    ss : StandardScaler or None
        The underlying scaler (``None`` before ``fit``).
    mean_, scale_ : pandas.Series or None
        The scaler's ``mean_`` / ``scale_`` indexed by column name.
    num_cols_, cat_cols_ : pandas.Index
        Scaled and passed-through column names, set by ``fit``.
    """

    def __init__(self):
        self.ss = None
        self.mean_ = None
        self.scale_ = None

    def fit(self, X, y=None):
        """Fit the scaler on the non-object columns of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
        y : ignored
            Not used by this transformer.

        Returns
        -------
        DFStandardScaler
            The fitted instance itself.
        """
        dtypes = X.dtypes
        self.num_cols_ = dtypes[dtypes != 'object'].index
        self.cat_cols_ = dtypes[dtypes == 'object'].index
        self.ss = StandardScaler()
        if len(self.num_cols_) > 0:
            self.ss.fit(X[self.num_cols_])
            self.mean_ = pd.Series(self.ss.mean_, index=self.num_cols_)
            self.scale_ = pd.Series(self.ss.scale_, index=self.num_cols_)
        else:
            # nothing to scale: StandardScaler cannot be fitted on zero
            # columns, so keep empty statistics and pass the frame through
            self.mean_ = pd.Series(dtype='float64')
            self.scale_ = pd.Series(dtype='float64')
        return self

    def transform(self, X):
        """Scale the columns learned in ``fit``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column seen in ``fit``.

        Returns
        -------
        pandas.DataFrame
            The object columns first, followed by the scaled columns, on the
            index of ``X``.
        """
        # select by the names stored in fit(), not by the dtypes of this
        # frame, so a differently ordered frame is still scaled column by
        # column with the matching mean/scale
        passthrough = X[self.cat_cols_]
        if len(self.num_cols_) == 0:
            return passthrough.copy()
        scaled_values = self.ss.transform(X[self.num_cols_])
        scaled = pd.DataFrame(scaled_values, index=X.index,
                              columns=self.num_cols_)
        return pd.concat([passthrough, scaled], axis=1)

In [7]:
class DFOneHotEncoder(TransformerMixin):
    """One-hot encode the object columns of a DataFrame and return a DataFrame.

    Object-dtype columns are replaced by dense dummy columns produced by a
    ``OneHotEncoder(handle_unknown='ignore')``; all other columns are passed
    through unchanged. The split into the two column groups is decided once
    in ``fit`` and reused in ``transform``.

    Dummy columns are named ``x<i>_<category>``, where ``i`` is the position
    of the source column among the encoded columns.

    Attributes
    ----------
    ohe : OneHotEncoder or None
        The underlying encoder (``None`` before ``fit``).
    cat_cols_, num_cols_ : pandas.Index
        Encoded and passed-through column names, set by ``fit``.
    """

    def __init__(self):
        self.ohe = None

    @staticmethod
    def _make_encoder():
        """Build a dense-output encoder on both old and new scikit-learn."""
        try:
            # scikit-learn >= 1.2 spells the dense-output switch this way
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # older releases only know the original `sparse` keyword
            return OneHotEncoder(sparse=False, handle_unknown='ignore')

    def fit(self, X, y=None):
        """Fit the encoder on the object columns of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
        y : ignored
            Not used by this transformer.

        Returns
        -------
        DFOneHotEncoder
            The fitted instance itself.
        """
        dtypes = X.dtypes
        self.cat_cols_ = dtypes[dtypes == 'object'].index
        self.num_cols_ = dtypes[dtypes != 'object'].index
        self.ohe = self._make_encoder()
        # the encoder cannot be fitted on zero columns; with nothing to
        # encode, transform() simply passes the frame through
        if len(self.cat_cols_) > 0:
            self.ohe.fit(X[self.cat_cols_])
        return self

    def transform(self, X):
        """Encode the columns learned in ``fit``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column seen in ``fit``.

        Returns
        -------
        pandas.DataFrame
            The passed-through columns first, followed by the dummy columns,
            on the index of ``X``. Categories not seen in ``fit`` yield
            all-zero dummies (``handle_unknown='ignore'``).
        """
        passthrough = X[self.num_cols_]
        if len(self.cat_cols_) == 0:
            return passthrough.copy()
        encoded = self.ohe.transform(X[self.cat_cols_])
        # build the names from categories_ rather than from
        # get_feature_names()/get_feature_names_out(), whose availability
        # and naming differ between scikit-learn releases
        dummy_names = ['x' + str(pos) + '_' + str(category)
                       for pos, categories in enumerate(self.ohe.categories_)
                       for category in categories]
        dummies = pd.DataFrame(encoded, index=X.index, columns=dummy_names)
        return pd.concat([passthrough, dummies], axis=1)

In [8]:
# load the toy datasets used to test the classes
toy_train = pd.read_csv('Data/toy_train.csv', sep=';')
toy_test = pd.read_csv('Data/toy_test.csv', sep=';')

In [9]:
# show the raw toy training set (it contains missing values)
toy_train

Unnamed: 0,age,income,region
0,,,MSK
1,23.0,4560.55,MSK
2,24.0,,
3,30.0,,EKAT
4,,7888.1,
5,55.0,9000.5,SPB
6,37.0,,SPB


In [10]:
# show the raw toy test set (it contains missing values)
toy_test

Unnamed: 0,age,income,region
0,89.0,903.33,MSK
1,23.0,4560.55,NSK
2,24.0,,MSK
3,55.0,6700.0,MSK
4,,8999.0,EKAT
5,,5430.0,SPB
6,37.0,,


In [11]:
# reference statistics of the raw toy training set: medians of the numeric
# columns and the mode(s) of the categorical column
print(
    toy_train['age'].median(),
    toy_train['income'].median(),
    toy_train['region'].mode(),
    sep='\n',
)

30.0
7888.1
0    MSK
1    SPB
dtype: object


In [12]:
# learn the fill values on the toy training set only, then apply them to
# both sets
# NOTE: toy_train / toy_test are rebound to the transformed frames, so
# re-running this cell operates on already-imputed data
imp = DFImputer().fit(toy_train)
toy_train, toy_test = imp.transform(toy_train), imp.transform(toy_test)

In [13]:
# toy training set after imputation
toy_train

Unnamed: 0,age,income,region
0,30.0,7888.1,MSK
1,23.0,4560.55,MSK
2,24.0,7888.1,MSK
3,30.0,7888.1,EKAT
4,30.0,7888.1,MSK
5,55.0,9000.5,SPB
6,37.0,7888.1,SPB


In [14]:
# toy test set after imputation (filled with the values learned on the training set)
toy_test

Unnamed: 0,age,income,region
0,89.0,903.33,MSK
1,23.0,4560.55,NSK
2,24.0,7888.1,MSK
3,55.0,6700.0,MSK
4,30.0,8999.0,EKAT
5,30.0,5430.0,SPB
6,37.0,7888.1,MSK


In [15]:
# learn the scaling statistics on the toy training set only, then apply
# them to both sets
# NOTE: toy_train / toy_test are rebound to the transformed frames, so
# re-running this cell would scale already-scaled data
scaler = DFStandardScaler().fit(toy_train)
toy_train, toy_test = scaler.transform(toy_train), scaler.transform(toy_test)

In [16]:
# toy training set after scaling the numeric columns
toy_train

Unnamed: 0,region,age,income
0,MSK,-0.27,0.245729
1,MSK,-0.966315,-2.33817
2,MSK,-0.866841,0.245729
3,EKAT,-0.27,0.245729
4,MSK,-0.27,0.245729
5,SPB,2.21684,1.109526
6,SPB,0.426315,0.245729


In [17]:
# toy test set after scaling with the statistics learned on the training set
toy_test

Unnamed: 0,region,age,income
0,MSK,5.598941,-5.178063
1,NSK,-0.966315,-2.33817
2,MSK,-0.866841,0.245729
3,MSK,2.21684,-0.676851
4,EKAT,-0.27,1.108361
5,SPB,-0.27,-1.663027
6,MSK,0.426315,0.245729


In [18]:
# learn the categories on the toy training set only, then encode both sets
# NOTE: toy_train / toy_test are rebound to the encoded frames, which no
# longer contain object columns, so this cell cannot simply be re-run
ohe = DFOneHotEncoder().fit(toy_train)
toy_train, toy_test = ohe.transform(toy_train), ohe.transform(toy_test)

In [19]:
# toy training set after one-hot encoding the categorical column
toy_train

Unnamed: 0,age,income,x0_EKAT,x0_MSK,x0_SPB
0,-0.27,0.245729,0.0,1.0,0.0
1,-0.966315,-2.33817,0.0,1.0,0.0
2,-0.866841,0.245729,0.0,1.0,0.0
3,-0.27,0.245729,1.0,0.0,0.0
4,-0.27,0.245729,0.0,1.0,0.0
5,2.21684,1.109526,0.0,0.0,1.0
6,0.426315,0.245729,0.0,0.0,1.0


In [20]:
# toy test set after one-hot encoding; a category unseen during fit is
# encoded as all zeros because the encoder uses handle_unknown='ignore'
toy_test

Unnamed: 0,age,income,x0_EKAT,x0_MSK,x0_SPB
0,5.598941,-5.178063,0.0,1.0,0.0
1,-0.966315,-2.33817,0.0,0.0,0.0
2,-0.866841,0.245729,0.0,1.0,0.0
3,2.21684,-0.676851,0.0,1.0,0.0
4,-0.27,1.108361,1.0,0.0,0.0
5,-0.27,-1.663027,0.0,0.0,1.0
6,0.426315,0.245729,0.0,1.0,0.0


In [21]:
# build the pipeline: impute -> scale -> one-hot encode -> logistic regression
ml_pipe = Pipeline(steps=[
    ('impute', DFImputer()),
    ('scaler', DFStandardScaler()),
    ('ohe', DFOneHotEncoder()),
    ('logreg', LogisticRegression(solver='lbfgs', max_iter=200)),
])

In [22]:
# fit the final pipeline
ml_pipe.fit(X_train, y_train)
# evaluate the model on the training data
# (the printed label reads "Accuracy on the training set")
print('Правильность на обучающей выборке: {:.3f}'.format(
    ml_pipe.score(X_train, y_train)))
# evaluate the model on the test data
# (the printed label reads "Accuracy on the test set")
print('Правильность на тестовой выборке: {:.3f}'.format(
    ml_pipe.score(X_test, y_test)))

Правильность на обучающей выборке: 0.900
Правильность на тестовой выборке: 0.899
