# Импорт библиотек

In [8]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import (ElasticNet, Lasso, LinearRegression,
                                  LogisticRegression, Ridge, SGDClassifier)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     cross_val_score, cross_validate,
                                     train_test_split)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (MinMaxScaler, OneHotEncoder, OrdinalEncoder,
                                   PolynomialFeatures, StandardScaler)

from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold, SelectFromModel, RFECV
from sklearn.metrics import make_scorer
from sklearn.model_selection import ParameterSampler
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

# Загрузка данных

In [9]:
train = pd.read_csv('porto/train.csv').set_index('id')
test = pd.read_csv('porto/test.csv').set_index('id')

y = train['target']
X = train.drop(columns=['target'])

In [10]:
print(train.shape)
train.head()

(595212, 58)


Unnamed: 0_level_0,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0,2,2,5,1,0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
9,0,1,1,7,0,0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
13,0,5,4,9,1,0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
16,0,0,1,2,0,0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
17,0,0,2,0,1,0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


In [11]:
print(test.shape)
test.head()

(892816, 57)


Unnamed: 0_level_0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,8,1,0,0,1,0,0,0,...,1,1,1,12,0,1,1,0,0,1
1,4,2,5,1,0,0,0,0,1,0,...,2,0,3,10,0,0,1,1,0,1
2,5,1,3,0,0,0,0,0,1,0,...,4,0,2,4,0,0,0,0,0,0
3,0,1,6,0,0,1,0,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,5,1,7,0,0,0,0,0,1,0,...,4,0,0,4,0,1,1,0,0,1


# Функции

In [12]:
# Code for calculating Normalized gini coefficient
# https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703
def gini(actual, pred, cmpcol = 0, sortcol = 1):  
    assert(len(actual) == len(pred))  
    epsilon = 1e-7
    values = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=np.float)  
    values = values[np.lexsort((values[:, 2], -1 * values[:, 1]))]  
    total = values[:, 0].sum() 
    gini_sum = (values[:, 0].cumsum().sum() + epsilon) / (total + epsilon)  
  
    gini_sum -= (len(actual) + 1) / 2  
    return gini_sum / len(actual)  

def gini_normalized(a, p):  
    '''Function to calculate the normalized gini coefficient'''
    return gini(a, p) / gini(a, a)

# RandomizedSearchCV

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y)

In [14]:
def run_train(params):
    #cat_cols = [column for column in X if (column.endswith('cat') or column.endswith('bin'))]
    #cat_trans = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])

    my_scorer = make_scorer(gini_normalized, greater_is_better=True, needs_proba=True)
    model = RandomForestClassifier(random_state=42)
    
    rs = RandomizedSearchCV(
        model, params, n_iter=100, scoring=my_scorer, n_jobs=30, cv=3, random_state=42, verbose=10)
    rs.fit(X_train, y_train)


    print(rs.best_params_)
    print('CV:',rs.best_score_)
    
    y_pred = rs.predict_proba(X_valid)[:, 1]
    print('Valid:',gini_normalized(y_valid, y_pred))
    
    return rs

In [31]:
params = {
    'n_estimators': range(50,1000+1,1),
    'max_depth': range(2,30+1,1),
    'criterion': ['entropy']
}

clf = run_train(params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 22.6min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed: 117.0min
[Parallel(n_jobs=10)]: Done 300 out of 300 | elapsed: 199.1min finished


{'n_estimators': 983, 'max_depth': 13, 'criterion': 'entropy'}
CV: 0.26389639597842335
Valid: 0.27281702517101114


In [15]:
params = {
    'n_estimators': [983],
    'max_depth': [13],
    'criterion': ['entropy','gini'],
    'max_features': ['sqrt','log2'],
    'bootstrap': ['True','False'],
    'class_weight': ['balanced','balanced_subsample',None]
}

clf = run_train(params)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed: 12.3min
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed: 14.1min
[Parallel(n_jobs=30)]: Done  21 out of  72 | elapsed: 16.3min remaining: 39.5min
[Parallel(n_jobs=30)]: Done  29 out of  72 | elapsed: 17.8min remaining: 26.4min
[Parallel(n_jobs=30)]: Done  37 out of  72 | elapsed: 27.0min remaining: 25.5min
[Parallel(n_jobs=30)]: Done  45 out of  72 | elapsed: 29.3min remaining: 17.6min
[Parallel(n_jobs=30)]: Done  53 out of  72 | elapsed: 30.9min remaining: 11.1min
[Parallel(n_jobs=30)]: Done  61 out of  72 | elapsed: 36.3min remaining:  6.5min
[Parallel(n_jobs=30)]: Done  69 out of  72 | elapsed: 37.7min remaining:  1.6min
[Parallel(n_jobs=30)]: Done  72 out of  72 | elapsed: 38.5min finished


{'n_estimators': 983, 'max_features': 'sqrt', 'max_depth': 13, 'criterion': 'entropy', 'class_weight': None, 'bootstrap': 'True'}
CV: 0.26389639597842335
Valid: 0.27281702517101114


In [16]:
sub = test[[]].copy()
sub['target'] = clf.predict_proba(test)[:, 1]
sub.to_csv('porto_sub/porto_sub29.csv')

In [17]:
clf_best = clf.best_estimator_.fit(X, y)

In [18]:
sub = test[[]].copy()
sub['target'] = clf_best.predict_proba(test)[:, 1]
sub.to_csv('porto_sub/porto_sub30.csv')

# Скор

In [19]:
score = 0.26954
score_t = 200*max(score-0.253,0)
print(score_t)
print(round(score_t))
round(score_t)/2

3.308
3


1.5