In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Raise the pandas display limits (rows, columns, line width) for wide frames.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
def plot_feature_importances(model, columns, top_n=10):
    """Draw a horizontal bar chart of the largest feature importances.

    Parameters
    ----------
    model
        Fitted search object exposing ``best_estimator_``; that estimator
        must provide ``feature_importances_``.
    columns
        Feature labels, in the same order as the importances.
    top_n : int, default 10
        Number of highest-ranked features to show.

    Returns
    -------
    None
        The chart is drawn on a new 7x5 inch figure.
    """
    imp = pd.Series(
        data=model.best_estimator_.feature_importances_,
        index=columns,
    ).sort_values(ascending=False)
    top = imp.head(top_n)

    # Explicit figure/axes so the plot does not depend on pyplot's current state.
    _fig, ax = plt.subplots(figsize=(7, 5))
    sns.barplot(y=top.index, x=top.values, orient='h', ax=ax)
    ax.set_title("Feature importance")
    ax.set_xlabel("Importance")
    ax.set_ylabel("Feature")

In [None]:
# Locations of the competition files inside the read-only Kaggle input directory.
TRAIN_CSV = '/kaggle/input/santander-customer-satisfaction/train.csv'
TEST_CSV = '/kaggle/input/santander-customer-satisfaction/test.csv'

df = pd.read_csv(TRAIN_CSV)
df_test_final = pd.read_csv(TEST_CSV)

In [None]:
# Look at the data: summary statistics of every column of the training frame.
df.describe()

In [None]:
# Share of the target: the mean of the TARGET column.
df["TARGET"].mean()

In [None]:
# Just in case: replace any missing values with 0.
df = df.fillna(0)

In [None]:
# Keep the target separately, then drop the target and the ID column from the
# training frame. The axis is passed by keyword (`columns=`): pandas 2.0
# removed the positional `axis` argument of DataFrame.drop.
y = df['TARGET']
df = df.drop(columns=['TARGET', 'ID'])

In [None]:
# Standardise the features.
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on every row before the train/test split
# below, so hold-out rows contribute to the scaling statistics.
scaler = StandardScaler().fit(df)
scaled_features = scaler.transform(df)

# The scaler returns a plain array; wrap it back into a frame (default
# integer column labels).
df_sc = pd.DataFrame(scaled_features)

In [None]:
# Train / hold-out split.
from sklearn.model_selection import train_test_split

# 15% of the rows are held out; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_sc,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Straightforward baseline: logistic regression tuned with a grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Parameters to sweep. Each penalty is paired with a solver that supports it:
# the default 'lbfgs' solver rejects 'l1', so without an explicit solver the
# l1 candidate fails to fit and is scored as NaN instead of being evaluated.
tuned_parameters = [
    {'penalty': ['l1'], 'solver': ['liblinear']},
    {'penalty': ['l2'], 'solver': ['lbfgs']},
]

# Search settings: ROC AUC scoring, default cross-validation, 3 parallel jobs.
clf = GridSearchCV(
    LogisticRegression(max_iter=1000),
    tuned_parameters,
    scoring='roc_auc',
    verbose=4,
    n_jobs=3,
)

In [None]:
# Run the grid search on the training split.
clf.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter combination.
clf.best_score_

In [None]:
# Predict class probabilities on the hold-out split (all class columns are kept here).
y_pred_score = clf.predict_proba(X_test)

In [None]:
from sklearn.metrics import roc_auc_score

# Keep only the second probability column (the class at clf.classes_[1]) and
# score it against the hold-out labels.
y_pred_score = clf.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_score)

In [None]:
# Parameter combination that achieved the best cross-validated score.
clf.best_params_

In [None]:
# Parameters to sweep: every solver, all with the l2 penalty.
tuned_parameters = [
    {
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    }
]

# Search settings: 2-fold cross-validation, ROC AUC scoring, and a cap of
# 100 solver iterations.
clf = GridSearchCV(
    estimator=LogisticRegression(max_iter=100),
    param_grid=tuned_parameters,
    scoring='roc_auc',
    cv=2,
    n_jobs=3,
    verbose=4,
)

clf.fit(X_train, y_train)

In [None]:
# Compare the approaches with each other: per-candidate cross-validation results.
clf.cv_results_

In [None]:
# Best mean cross-validated score found by the search.
print('train: ', clf.best_score_)

# ROC AUC of the second probability column on the hold-out split.
y_pred_score = clf.predict_proba(X_test)[:, 1]
print('test rocauc:', roc_auc_score(y_true=y_test, y_score=y_pred_score))

In [None]:
# One more round: fix penalty and solver, sweep the regularisation strength C.
tuned_parameters = [
    {
        'penalty': ['l2'],
        'solver': ['lbfgs'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    }
]

# Search settings: 3-fold cross-validation scored with ROC AUC.
clf = GridSearchCV(
    estimator=LogisticRegression(max_iter=100),
    param_grid=tuned_parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=3,
    verbose=4,
)

clf.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score found by the search.
print('train: ', clf.best_score_)

# ROC AUC of the second probability column on the hold-out split.
y_pred_score = clf.predict_proba(X_test)[:, 1]
print('test rocauc:', roc_auc_score(y_true=y_test, y_score=y_pred_score))

In [None]:
# Per-candidate cross-validation results of the latest search.
clf.cv_results_

In [None]:
# Parameter combination that achieved the best cross-validated score.
clf.best_params_

In [None]:
from sklearn.metrics import RocCurveDisplay, roc_curve

# ROC curve for the current hold-out scores; the positive label is the second
# class known to the fitted search.
positive_label = clf.classes_[1]
fpr, tpr, _ = roc_curve(y_test, y_pred_score, pos_label=positive_label)
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

In [None]:
from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve

# Precision-recall curve for the current hold-out scores; the positive label
# is the second class known to the fitted search.
positive_label = clf.classes_[1]
prec, recall, _ = precision_recall_curve(
    y_test,
    y_pred_score,
    pos_label=positive_label,
)
pr_display = PrecisionRecallDisplay(precision=prec, recall=recall).plot()

In [None]:
from sklearn.svm import SVC

# Only the RBF kernel is searched for now; other kernels, gamma, C, degree and
# class_weight stay at the estimator defaults.
tuned_parameters = [{'kernel': ['rbf']}]

# probability=True is needed so the fitted search can serve predict_proba.
cv_svc = GridSearchCV(
    estimator=SVC(probability=True),
    param_grid=tuned_parameters,
    scoring='roc_auc',
    cv=2,
    n_jobs=5,
    verbose=4,
)
cv_svc.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score found by the SVC search.
print('train: ', cv_svc.best_score_)

# ROC AUC of the second probability column on the hold-out split.
y_pred_score = cv_svc.predict_proba(X_test)[:, 1]
print('test rocauc:', roc_auc_score(y_true=y_test, y_score=y_pred_score))

In [None]:
# Try a random forest.
from sklearn.ensemble import RandomForestClassifier

# Single candidate for now; ideas for a wider grid: n_estimators=700 and
# different max_features settings.
tuned_parameters = {
    'n_estimators': [200],
}

# scoring='roc_auc' keeps best_score_ on the same metric as the hold-out
# evaluation (without it the search falls back to accuracy), and the fixed
# random_state makes the forest reproducible between runs.
CV_rfc = GridSearchCV(
    RandomForestClassifier(random_state=42),
    tuned_parameters,
    scoring='roc_auc',
    verbose=4,
    n_jobs=5,
    cv=2,
)
CV_rfc.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score of the random-forest search (reported under
# whatever scoring the search was configured with).
print('train: ', CV_rfc.best_score_)

# ROC AUC of the second probability column on the hold-out split.
y_pred_score = CV_rfc.predict_proba(X_test)[:, 1]
print('test rocauc:', roc_auc_score(y_true=y_test, y_score=y_pred_score))

In [None]:
# Plot the random forest's feature importances, labelled with the original column names.
plot_feature_importances(CV_rfc, df.columns)

In [None]:
# Try gradient boosting.
from sklearn.ensemble import GradientBoostingClassifier

# The loss is left at the estimator default. The explicit "deviance" value
# was deprecated in scikit-learn 1.1 and removed in 1.3, where passing it
# makes every fit fail. A single default-valued candidate keeps the grid
# valid on both old and new versions.
# Ideas for a wider grid: learning_rate, min_samples_split, min_samples_leaf,
# max_depth, max_features, criterion, subsample, n_estimators.
tuned_parameters = {
    "n_estimators": [100],  # same as the estimator default
}

# scoring='roc_auc' keeps best_score_ on the same metric as the hold-out
# evaluation (without it the search falls back to accuracy), and the fixed
# random_state makes the run reproducible.
CV_gb = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    tuned_parameters,
    scoring='roc_auc',
    verbose=4,
    n_jobs=5,
    cv=2,
)
CV_gb.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score of the gradient-boosting search (reported
# under whatever scoring the search was configured with).
print('train: ', CV_gb.best_score_)

# ROC AUC of the second probability column on the hold-out split.
y_pred_score = CV_gb.predict_proba(X_test)[:, 1]
print('test rocauc:', roc_auc_score(y_true=y_test, y_score=y_pred_score))

In [None]:
# Plot the gradient-boosting feature importances, labelled with the original column names.
plot_feature_importances(CV_gb, df.columns)

Попробуем сделать некоторые переменные бинарными (те, что по сути либо равны нулю, либо сразу принимают большие значения).

In [None]:
# Pick the features whose 0.9 quantile is exactly zero, i.e. columns where
# most of the values are zeros.
quants = df.quantile(q=0.9)
quants_zero = quants.loc[quants.eq(0)]

In [None]:
# Number of features selected for binarisation.
quants_zero.shape

In [None]:
# Work on an independent copy so the original feature frame is not modified.
df_bin = df.copy(deep=True)

In [None]:
# For every feature whose 0.9 quantile is zero, replace the values with a
# binary flag: 1 where the value is positive, 0 otherwise. The comparison is
# vectorised over the whole column instead of calling a lambda per element.
for col in quants_zero.index:
    df_bin[col] = (df[col] > 0).astype(int)

In [None]:
# Scale again: the binarised frame gets its own scaler.
scaler_bin = StandardScaler().fit(df_bin)
scaled_features_bin = scaler_bin.transform(df_bin)
df_bin_sc = pd.DataFrame(scaled_features_bin)

# Split into train / hold-out: 15% held out, seed 42.
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    df_bin_sc,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Try logistic regression on the binarised features with one fixed setting.
tuned_parameters = [
    {
        'penalty': ['l2'],
        'solver': ['lbfgs'],
        'C': [0.01],
    }
]

# Search settings: 3-fold cross-validation scored with ROC AUC.
clf_bin = GridSearchCV(
    estimator=LogisticRegression(max_iter=100),
    param_grid=tuned_parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=3,
    verbose=4,
)

clf_bin.fit(X_train_bin, y_train_bin)

In [None]:
# Best mean cross-validated score on the binarised features.
print('train: ', clf_bin.best_score_)

# ROC AUC of the second probability column on the binarised hold-out split.
y_pred_score_bin = clf_bin.predict_proba(X_test_bin)[:, 1]
print('test rocauc:', roc_auc_score(y_true=y_test_bin, y_score=y_pred_score_bin))

In [None]:
# Prepare the final test data: fill missing values with zeros, set the IDs
# aside, drop the ID column and apply the scaler fitted on the training data.
# The column is dropped by keyword (`columns=`): pandas 2.0 removed the
# positional `axis` argument of DataFrame.drop.

df_test_final = df_test_final.fillna(0)
df_test_ids = df_test_final["ID"]
df_test_final = df_test_final.drop(columns=["ID"])
scaled_features_fin = scaler.transform(df_test_final)
df_fin_sc = pd.DataFrame(scaled_features_fin)

In [None]:
# Predict: second probability column for the final test set.
# Alternative kept for reference: score with the logistic-regression search instead.
#y_pred_fin = clf.predict_proba(df_fin_sc)[:,1]
y_pred_fin = CV_gb.predict_proba(df_fin_sc)[:,1]

In [None]:
# Put the IDs back next to the predictions (joined column-wise on the index).
ids_frame = df_test_ids.to_frame()
target_frame = pd.DataFrame(data=y_pred_fin, columns=['TARGET'])
res = pd.concat([ids_frame, target_frame], axis=1)

In [None]:
# Display the result frame.
res

In [None]:
# Save the result frame as CSV in the Kaggle working directory, without the row index.
res.to_csv("/kaggle/working/res_gb.csv", index=False)

In [None]:
# Plans
# Tune the random forest and gradient boosting hyper-parameters