# Импорт библиотек

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold, SelectFromModel, RFECV
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.linear_model import (ElasticNet, Lasso, LinearRegression,
                                  LogisticRegression, Ridge, SGDClassifier)
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     cross_val_score, cross_validate,
                                     train_test_split)
from sklearn.model_selection import ParameterSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (MinMaxScaler, OneHotEncoder, OrdinalEncoder,
                                   PolynomialFeatures, StandardScaler)
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")

# Загрузка данных

In [2]:
# Read both competition files, using the `id` column as the row index.
train = pd.read_csv('porto/train.csv', index_col='id')
test = pd.read_csv('porto/test.csv', index_col='id')

# Separate the training frame into the feature matrix and the label column.
X = train.drop(columns='target')
y = train['target']

In [3]:
# Sanity check of the loaded training frame: print its (rows, columns) shape,
# then display the first rows as the cell output.
print(train.shape)
train.head()

(595212, 58)


Unnamed: 0_level_0,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0,2,2,5,1,0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
9,0,1,1,7,0,0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
13,0,5,4,9,1,0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
16,0,0,1,2,0,0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
17,0,0,2,0,1,0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


In [4]:
# Same check for the test frame: print its (rows, columns) shape,
# then display the first rows as the cell output.
print(test.shape)
test.head()

(892816, 57)


Unnamed: 0_level_0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,8,1,0,0,1,0,0,0,...,1,1,1,12,0,1,1,0,0,1
1,4,2,5,1,0,0,0,0,1,0,...,2,0,3,10,0,0,1,1,0,1
2,5,1,3,0,0,0,0,0,1,0,...,4,0,2,4,0,0,0,0,0,0
3,0,1,6,0,0,1,0,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,5,1,7,0,0,0,0,0,1,0,...,4,0,0,4,0,1,1,0,0,1


# Функции

In [5]:
# Code for calculating Normalized gini coefficient
# https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703
def gini(actual, pred, cmpcol=0, sortcol=1):
    """Return the (unnormalized) Gini coefficient of `pred` against `actual`.

    Rows are ordered by descending prediction (ties keep their original
    order) and the cumulative sum of `actual` along that ordering is turned
    into a single score.

    Parameters
    ----------
    actual : sequence of numbers
        Ground-truth values; expected to be one-dimensional.
    pred : sequence of numbers
        Predicted scores, same length as `actual`.
    cmpcol, sortcol : int
        Not used by the implementation; kept so existing callers that pass
        them keep working.

    Returns
    -------
    float
        The Gini score; divide by ``gini(actual, actual)`` to normalize.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    assert len(actual) == len(pred)
    epsilon = 1e-7  # added to numerator and denominator so a zero total does not divide by zero
    # Columns: 0 = actual, 1 = prediction, 2 = original position (tie-breaker).
    # The builtin `float` replaces the `np.float` alias, which newer NumPy
    # releases no longer provide.
    values = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=float)
    # np.lexsort treats the LAST key as primary: descending prediction first,
    # then original position.
    values = values[np.lexsort((values[:, 2], -1 * values[:, 1]))]
    total = values[:, 0].sum()
    gini_sum = (values[:, 0].cumsum().sum() + epsilon) / (total + epsilon)

    gini_sum -= (len(actual) + 1) / 2
    return gini_sum / len(actual)


def gini_normalized(a, p):
    """Return the Gini score of predictions `p` for targets `a`, divided by
    the score obtained when the targets themselves are used as predictions."""
    return gini(a, p) / gini(a, a)

# RandomForest

In [6]:
# Hold out 30% of the labelled rows for validation, stratified on the target,
# with a fixed seed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [8]:
def run_train(params, n_iter=100, cv=3, n_jobs=30):
    """Run a randomized hyper-parameter search for a RandomForestClassifier.

    The search is fitted on the notebook-level `X_train` / `y_train`, scored
    with the normalized Gini coefficient computed on predicted probabilities,
    and then evaluated once on `X_valid` / `y_valid`. The best parameters,
    the best cross-validation score and the validation score are printed.

    Parameters
    ----------
    params : dict
        Parameter distributions passed to RandomizedSearchCV.
    n_iter : int, default 100
        Number of sampled parameter settings.
    cv : int, default 3
        Number of cross-validation folds.
    n_jobs : int, default 30
        Number of parallel workers used by the search.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    # needs_proba=True makes the scorer pass predicted probabilities,
    # not hard class labels, to gini_normalized.
    my_scorer = make_scorer(gini_normalized, greater_is_better=True, needs_proba=True)
    model = RandomForestClassifier(random_state=42)

    rs = RandomizedSearchCV(
        model, params, n_iter=n_iter, scoring=my_scorer, n_jobs=n_jobs, cv=cv,
        random_state=42, verbose=10)
    rs.fit(X_train, y_train)

    print(rs.best_params_)
    print('CV:', rs.best_score_)

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred = rs.predict_proba(X_valid)[:, 1]
    print('Valid:', gini_normalized(y_valid, y_pred))

    return rs

In [31]:
# First random-forest sweep: tree count 50..1000, depth 2..30,
# entropy split criterion only.
params = {
    'n_estimators': range(50, 1001),
    'max_depth': range(2, 31),
    'criterion': ['entropy'],
}

clf = run_train(params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 22.6min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed: 117.0min
[Parallel(n_jobs=10)]: Done 300 out of 300 | elapsed: 199.1min finished


{'n_estimators': 983, 'max_depth': 13, 'criterion': 'entropy'}
CV: 0.26389639597842335
Valid: 0.27281702517101114


In [9]:
# Second random-forest sweep: tree count and depth are pinned to single
# values while the remaining forest options are varied.
params = dict(
    n_estimators=[983],
    max_depth=[13],
    criterion=['entropy', 'gini'],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
    class_weight=['balanced', 'balanced_subsample', None],
)

clf = run_train(params)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   3 out of   3 | elapsed: 13.4min remaining:    0.0s
[Parallel(n_jobs=30)]: Done   3 out of   3 | elapsed: 13.4min finished


{'n_estimators': 983, 'max_features': 'sqrt', 'max_depth': 13, 'criterion': 'entropy', 'class_weight': None, 'bootstrap': False}
CV: 0.2645883227911428
Valid: 0.27479004847652916


In [10]:
# Submission from the fitted search object: one positive-class probability
# per test id.
# to_csv does not create missing directories, so make sure the output
# folder exists before the first write.
os.makedirs('porto_sub', exist_ok=True)
sub = test[[]].copy()  # empty frame that keeps only the `id` index
sub['target'] = clf.predict_proba(test)[:, 1]
sub.to_csv('porto_sub/porto_sub31.csv')

In [11]:
# Refit the winning configuration on the full labelled data (X, y).
# Cloning first leaves `clf.best_estimator_` as the search fitted it, so the
# search object keeps producing the same predictions if earlier cells are
# re-run; calling .fit() on it directly would overwrite that model in place.
clf_best = clone(clf.best_estimator_).fit(X, y)

In [12]:
# Submission from the model refitted on all labelled data: one
# positive-class probability per test id.
sub = pd.DataFrame(
    {'target': clf_best.predict_proba(test)[:, 1]},
    index=test.index,
)
sub.to_csv('porto_sub/porto_sub32.csv')

# SGD

In [20]:
def run_train(params, n_iter=100, cv=3, n_jobs=30):
    """Run a randomized hyper-parameter search for an SGD logistic model.

    Columns of the notebook-level `X` whose names end in 'cat' or 'bin' are
    one-hot encoded; all other columns are standardized. The resulting
    pipeline is searched on `X_train` / `y_train`, scored with the normalized
    Gini coefficient on predicted probabilities, and evaluated once on
    `X_valid` / `y_valid`. The best parameters, the best cross-validation
    score and the validation score are printed.

    Parameters
    ----------
    params : dict
        Parameter distributions for RandomizedSearchCV; keys address pipeline
        steps (the classifier step is named 'clf').
    n_iter : int, default 100
        Number of sampled parameter settings.
    cv : int, default 3
        Number of cross-validation folds.
    n_jobs : int, default 30
        Number of parallel workers used by the search.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    # A single suffix tuple drives both column lists, so they cannot drift apart.
    categorical_suffixes = ('cat', 'bin')
    num_cols = [column for column in X if not column.endswith(categorical_suffixes)]
    cat_cols = [column for column in X if column.endswith(categorical_suffixes)]

    num_trans = Pipeline(steps=[('scaler', StandardScaler())])
    # handle_unknown='ignore': categories unseen during fit encode as all zeros
    # instead of raising at predict time.
    cat_trans = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(transformers=[
        ('num', num_trans, num_cols),
        ('cat', cat_trans, cat_cols)
    ])

    # NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
    # confirm against the installed version before changing it.
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('clf', SGDClassifier(loss='log', n_jobs=1, random_state=42))
    ])

    my_scorer = make_scorer(gini_normalized, greater_is_better=True, needs_proba=True)
    rs_sgd = RandomizedSearchCV(
        pipe, params, n_iter=n_iter, scoring=my_scorer, n_jobs=n_jobs, cv=cv,
        random_state=42, verbose=10)
    rs_sgd.fit(X_train, y_train)

    print(rs_sgd.best_params_)
    print('CV:', rs_sgd.best_score_)

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred = rs_sgd.predict_proba(X_valid)[:, 1]
    print('Valid:', gini_normalized(y_valid, y_pred))

    return rs_sgd

In [21]:
# Search space for the SGD pipeline; the `clf__` prefix routes each entry to
# the classifier step of the pipeline.
params = dict(
    clf__alpha=[0.01, 0.05, 0.001, 0.0005, 0.0001, 0.00005, 0.00001, 0.000005, 0.000001],
    clf__penalty=['l1', 'l2'],
    clf__max_iter=range(50, 500),
)

sgd = run_train(params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:   50.8s
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:  1.8min
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:  2.2min
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:  3.1min
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed:  3.6min
[Parallel(n_jobs=30)]: Done  68 tasks      | elapsed:  3.9min
[Parallel(n_jobs=30)]: Done  85 tasks      | elapsed:  5.1min
[Parallel(n_jobs=30)]: Done 102 tasks      | elapsed:  5.7min
[Parallel(n_jobs=30)]: Done 121 tasks      | elapsed:  6.6min
[Parallel(n_jobs=30)]: Done 140 tasks      | elapsed:  7.8min
[Parallel(n_jobs=30)]: Done 161 tasks      | elapsed:  8.9min
[Parallel(n_jobs=30)]: Done 182 tasks      | elapsed: 10.2min
[Parallel(n_jobs=30)]: Done 205 tasks      | elapsed: 11.6min
[Parallel(n_jobs=30)]: Done 228 tasks      | elapsed: 12.8min
[Parallel(n_jobs=30)]: Done 272 out of 300 | elapsed: 1

{'clf__penalty': 'l2', 'clf__max_iter': 270, 'clf__alpha': 0.0001}
CV: 0.2584550858789172
Valid: 0.26618798893910905


In [22]:
# Submission from the fitted SGD search object: one positive-class
# probability per test id.
sub = test[[]].assign(target=sgd.predict_proba(test)[:, 1])
sub.to_csv('porto_sub/porto_sgd_sub1.csv')

In [23]:
# Refit the winning pipeline on the full labelled data (X, y).
# Cloning first leaves `sgd.best_estimator_` as the search fitted it, so the
# search object keeps producing the same predictions if earlier cells are
# re-run; calling .fit() on it directly would overwrite that model in place.
sgd_best = clone(sgd.best_estimator_).fit(X, y)

In [24]:
# Submission from the SGD pipeline refitted on all labelled data: one
# positive-class probability per test id.
sub = pd.DataFrame(
    {'target': sgd_best.predict_proba(test)[:, 1]},
    index=test.index,
)
sub.to_csv('porto_sub/porto_sgd_sub2.csv')

# Blend

In [42]:
# Load the two saved submissions to be blended, indexed by `id`.
b1, b2 = (
    pd.read_csv(path, index_col='id')
    for path in ('porto_sub/porto_sgd_sub2.csv', 'porto_sub/porto_sub32.csv')
)

In [50]:
# Weighted average of the two loaded submissions, aligned on the test ids.
sub = test[[]].copy()  # empty frame that keeps only the `id` index
sub['b1'] = b1['target']
sub['b2'] = b2['target']
# Blend the two Series explicitly. Doing arithmetic on the whole b1/b2 frames
# and assigning the result to one column only works while each frame has
# exactly one column.
sub['target'] = 0.6 * sub['b1'] + 0.4 * sub['b2']
sub[['target']].to_csv('porto_sub/bland2.csv')
sub.head()

Unnamed: 0_level_0,b1,b2,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.029978,0.025477,0.028178
1,0.031418,0.027013,0.029656
2,0.026382,0.033105,0.029071
3,0.017978,0.017504,0.017788
4,0.036843,0.03575,0.036406


# Скор

In [51]:
# Convert a leaderboard score into points: the excess over 0.253 (floored at
# zero) is scaled by 200, rounded, and halved.
# NOTE(review): the 0.253 baseline and the 200 multiplier are not explained in
# this notebook; presumably a course grading rule. Confirm.
score = 0.27430
excess = max(score - 0.253, 0)
score_t = 200 * excess
print(score_t)
rounded = round(score_t)
print(rounded)
rounded / 2

4.259999999999997
4


2.0