In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, cohen_kappa_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier
from catboost import Pool, CatBoostClassifier

In [2]:
data_train = pd.read_csv('train.csv', header=None)
data_train_target = pd.read_csv('train-target.csv', header=None)
data_test = pd.read_csv('test.csv', header=None)

bad_columns = [9, 15, 16, 17, 22, 26]
data_train_ = data_train.copy()
data_test_ = data_test.copy()

data_train_.drop(labels=bad_columns, axis=1, inplace=True)
data_test_.drop(labels=bad_columns, axis=1, inplace=True)

In [3]:
Kfold = StratifiedKFold(n_splits=3, shuffle=True)

In [4]:
def remove_collinear_features(x, threshold):
    '''
    remove features with correlation coef. greater than threshold
    '''
    
    # Calculate the correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []

    # Iterate through the correlation matrix and compare correlations
    for i in iters:
        for j in range(i):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = abs(item.values)
            
            # If correlation exceeds the threshold
            if val >= threshold:
                drop_cols.append(col.values[0])

    # Drop one of each pair of correlated columns
    drops = set(drop_cols)
    x = x.drop(columns = drops)
               
    return x

features = remove_collinear_features(data_train_, 0.7);
new_cols = features.columns.values
data_train_nc = data_train[new_cols].copy()
data_test_nc = data_test[new_cols].copy()

In [5]:
scaler_full = StandardScaler()
X_tr_new = scaler_full.fit_transform(data_train_nc)
X_test_full = scaler_full.transform(data_test_nc)
y_train_full = data_train_target.values.ravel()

X_train_new, X_test_new, y_train, y_test = train_test_split(data_train_nc, data_train_target, 
                                                    test_size=0.25, 
                                                    stratify=data_train_target)
X_train_new_scaled = scaler_full.fit_transform(X_train_new)
X_test_new_scaled = scaler_full.transform(X_test_new)
y_train = y_train.values.ravel()

In [None]:
count_estim = np.arange(380, 430, 10)
depth = np.arange(27, 31)
min_samples_split = np.arange(5, 8)
params = {
    'max_depth': depth,
    'n_estimators': count_estim,
    'min_samples_split': min_samples_split
    
}
rfc = RandomForestClassifier(random_state=42)
grid_rfc = GridSearchCV(RandomForestClassifier(criterion='entropy',
                                                      bootstrap=True, random_state=42),
                               params, cv=Kfold, scoring=make_scorer(cohen_kappa_score), verbose=True, n_jobs=-1)
grid_rfc.fit(X_tr_new, y_train_full)

In [6]:
# миноритарное "мягкое" голосование - soft voting - return probability
estimators = [
            ('lgb_', lgb.LGBMClassifier(max_depth=-1, 
                        num_leaves=35,
                        class_weight='balanced',
                        n_jobs=-1,
                        objective='binary',
                        boosting_type='goss')),
            ('catbooster', CatBoostClassifier(learning_rate=0.1,
                                l2_leaf_reg=1, verbose=0,
                                random_seed=17,
                                bootstrap_type="MVS", custom_metric="Logloss")),
            ('xgb_', XGBClassifier(n_estimators=800, learning_rate=0.01, random_state=0)),
            ('rfc', RandomForestClassifier(n_estimators=400,
                         min_samples_split=6,
                         max_depth=29,
                         criterion='entropy',
                         bootstrap=True)),
            ('extratreecl', ExtraTreesClassifier(random_state=42, bootstrap=True, oob_score=True, criterion='entropy',
                     max_depth=5, max_features='auto', min_samples_leaf=20, min_samples_split=2))
            ]
voter = VotingClassifier(estimators, voting='soft')
voter.fit(X_tr_new, y_train_full)

def vote_proba(x):
    if x['target'] == 1:
        return x[1]
    else:
        return 1 - x[0]

base_proba = pd.DataFrame(voter.predict_proba(X_test_full))
base_proba['target'] = voter.predict(X_test_full)
base_proba['target_proba'] = base_proba.apply(vote_proba, axis=1)

target_resp = pd.DataFrame(base_proba['target_proba'])
target_resp.to_csv('submission.csv', header=False, index=False)

In [7]:
base_proba

Unnamed: 0,0,1,target,target_proba
0,0.805266,0.194733,0,0.194734
1,0.462914,0.537086,1,0.537086
2,0.774596,0.225404,0,0.225404
3,0.569044,0.430956,0,0.430956
4,0.800861,0.199139,0,0.199139
...,...,...,...,...
1995,0.690849,0.309151,0,0.309151
1996,0.608506,0.391494,0,0.391494
1997,0.695296,0.304704,0,0.304704
1998,0.700723,0.299277,0,0.299277
