# 1 Introduction
At Santander, the mission is to help people and businesses prosper. They are always looking for ways to help their customers understand their financial health and identify which products and services might help them achieve their monetary goals.

# 2.0 Load packages

# 2.1 Import

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier, Pool
from IPython.display import display
import matplotlib.patches as patch
import matplotlib.pyplot as plt
from sklearn.svm import NuSVR
from scipy.stats import norm
from sklearn import svm
import lightgbm as lgb
import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import time
import glob
import sys
import os
import gc

# 2.2 Setup

In [8]:
# Number of cross-validation folds; already set to 5, the value suggested for better results.
fold_n = 5
# Shuffled stratified K-fold splitter; the fixed random_state keeps fold assignment repeatable.
folds = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=10)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show floats with 4 decimal places in IPython's plain-text output.
%precision 4
# NOTE(review): this silences every warning for the rest of the session, including deprecations.
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
# Print small NumPy floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)
# pandas display settings: 15 digits of precision and up to 200 columns shown.
pd.set_option("display.precision", 15)
pd.set_option('display.max_columns', 200)

# 2.3 Version

In [9]:
# Report the library and interpreter versions this notebook was run with,
# one line per component, so results can be reproduced later.
for template, version in (
    ('pandas: {}', pd.__version__),
    ('numpy: {}', np.__version__),
    ('Python: {}', sys.version),
):
    print(template.format(version))

pandas: 0.22.0
numpy: 1.16.0
Python: 3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 11:27:44) [MSC v.1900 64 bit (AMD64)]


# 3.0 Problem Definition
In this challenge, we should help this bank identify which customers will make a specific transaction in the future, irrespective of the amount of money transacted. The data provided for this competition has the same structure as the real data we have available to solve this problem.

# 3.1 Problem Feature
* train.csv - the training set.
* test.csv - the test set. The test set contains some rows which are not included in scoring.
* sample_submission.csv - a sample submission file in the correct format.

# 3.2 Aim
In this competition, the task is to predict the value of the target column in the test set.

# 3.3 Variables
We are provided with an anonymized dataset containing numeric feature variables, the binary target column, and a string ID_code column.

The task is to predict the value of the target column in the test set.

# 3.4 Evaluation
Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.

In [11]:
from sklearn.metrics import roc_auc_score, roc_curve

# 4.0 Exploratory Data Analysis(EDA)
In this section, we'll use graphical and numerical techniques to begin uncovering the structure of the data.

* Data Collection
* Visualization
* Data Preprocessing
* Data Cleaning

In [12]:
# Load the competition's training and test sets from the working directory
# in a single unpacking assignment (train first, then test).
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [13]:
# Load the sample submission file and preview its first rows;
# the bare last expression is what the notebook renders as the cell output.
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission.head()

Unnamed: 0,ID_code,target
0,test_0,0
1,test_1,0
2,test_2,0
3,test_3,0
4,test_4,0


In [14]:
# (rows, columns) of the train, test and sample-submission frames, shown as one tuple.
train.shape, test.shape, sample_submission.shape

((200000, 202), (200000, 201), (200000, 2))

In [15]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49,var_50,var_51,var_52,var_53,var_54,var_55,var_56,var_57,var_58,var_59,var_60,var_61,var_62,var_63,var_64,var_65,var_66,var_67,var_68,var_69,var_70,var_71,var_72,var_73,var_74,var_75,var_76,var_77,var_78,var_79,var_80,var_81,var_82,var_83,var_84,var_85,var_86,var_87,var_88,var_89,var_90,var_91,var_92,var_93,var_94,var_95,var_96,var_97,...,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109,var_110,var_111,var_112,var_113,var_114,var_115,var_116,var_117,var_118,var_119,var_120,var_121,var_122,var_123,var_124,var_125,var_126,var_127,var_128,var_129,var_130,var_131,var_132,var_133,var_134,var_135,var_136,var_137,var_138,var_139,var_140,var_141,var_142,var_143,var_144,var_145,var_146,var_147,var_148,var_149,var_150,var_151,var_152,var_153,var_154,var_155,var_156,var_157,var_158,var_159,var_160,var_161,var_162,var_163,var_164,var_165,var_166,var_167,var_168,var_169,var_170,var_171,var_172,var_173,var_174,var_175,var_176,var_177,var_178,var_179,var_180,var_181,var_182,var_183,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747000000000001,2.9252,3.1821,14.0137,0.5745,8.7989,14.5691,5.7487,-7.2393,4.284,30.7133,10.535,16.2191,2.5791,2.4716,14.3831,13.4325,-5.1488,-0.4073,4.9306,5.9965,-0.3085,12.9041,-3.8766,16.8911,11.192,10.5785,0.6764,7.8871,4.6667,3.8743,-5.2387,7.3746,11.5767,12.0446,11.6418,-7.017,5.9226,-14.2136,16.0283,5.3253,12.9194,29.046,-0.694,5.1736,-0.7474,14.8322,11.2668,5.3822,2.0183,10.1166,16.1828,4.959,2.0771,-0.2154,8.6748,9.5319,5.8056,22.4321,5.0109,-4.701000000000001,21.6374,0.5663,5.1999,8.86,43.1127,18.3816,-2.344,23.4104,6.5199,12.1983,13.6468,13.8372,1.3675,2.9423,-4.5213,21.4669,9.3225,16.4597,7.9984,-1.7069,-21.4494,6.7806,11.0924,9.9913,14.8421,0.1812,8.9642,16.2572,...,9.4763,13.3102,26.5376,1.4403,14.71,6.0454,9.5426,17.1554,14.1104,24.3627,2.0323,6.7602,3.9141,-0.4851,2.524,1.5093,2.5516,15.5752,-13.4221,7.2739,16.0094,9.7268,0.8897,0.7754,4.2218,12.0039,13.8571,-0.7338,-1.9245,15.4462,12.8287,0.3587,9.6508,6.5674,5.1726,3.1345,29.4547,31.4045,2.8279,15.6599,8.3307,-5.6011,19.0614,11.2663,8.6989,8.3694,11.5659,-16.4727,4.0288,17.9244,18.5177,10.78,9.0056,16.6964,10.4838,1.6573,12.1749,-13.1324,17.6054,11.5423,15.4576,5.3133,3.6159,5.0384,6.676,12.6644,2.7004,-0.6975,9.5981,5.4879,-4.7645,-8.4254,20.8773,3.1531,18.5618,7.7423,-10.1245,13.7241,-3.5189,1.7202,-8.4051,9.0164,3.0657,14.3691,25.8398,5.8764,11.8411,-19.7159,17.5743,0.5857,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.388999999999999,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,-0.4032,8.0585,14.0239,8.4135,5.4345,13.7003,13.8275,-15.5849,7.8,28.5708,3.4287,2.7407,8.5524,3.3716,6.9779,13.890999999999998,-11.7684,-2.5586,5.0464,0.5481,-9.2987,7.8755,1.2859,19.371,11.3702,0.7399,2.7995,5.8434,10.816,3.6783,-11.1147,1.873,9.8775,11.7842,1.2444,-47.3797,7.3718,0.1948,34.4014,25.7037,11.8343,13.2256,-4.1083,6.6885,-8.0946,18.5995,19.3219,7.0118,1.921,8.8682,8.0109,-7.2417,1.7944,-1.3147,8.1042,1.5365,5.4007,7.9344,5.022,2.2302,40.5632,0.5134,3.1701,20.1068,7.7841,7.0529,3.2709,23.4822,5.5075,13.7814,2.5462,18.1782,0.3683,-4.821000000000001,-5.485,13.7867,-13.5901,11.0993,7.9022,12.2301,0.4768,6.8852,8.0905,10.9631,11.7569,-1.2722,24.7876,26.6881,...,-13.695,8.4068,35.4734,1.7093,15.1866,2.6227,7.3412,32.0888,13.955,13.0858,6.6203,7.1051,5.3523,8.5426,3.6159,4.1569,3.0454,7.8522,-11.51,7.5109,31.5899,9.5018,8.2736,10.1633,0.1225,12.5942,14.5697,2.4354,0.8194,16.5346,12.4205,-0.178,5.7582,7.0513,1.9568,-8.9921,9.7797,18.1577,-1.9721,16.1622,3.6937,6.6803,-0.3243,12.2806,8.6086,11.0738,8.9231,11.77,4.2578,-4.4223,20.6294,14.8743,9.4317,16.7242,-0.5687,0.1898,12.2419,-9.6953,22.3949,10.6261,29.4846,5.8683,3.8208,15.8348,-5.0121,15.1345,3.2003,9.3192,3.8821,5.7999,5.5378,5.0988,22.033,5.5134,30.2645,10.4968,-7.2352,16.5721,-7.3477,11.0752,-5.5937,9.4878,-14.91,9.4245,22.5441,-4.8622,7.6543,-15.9319,13.3175,-0.3566,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,-0.3249,-11.2648,14.1929,7.3124,7.5244,14.6472,7.6782,-1.7395,4.7011,20.4775,17.7559,18.1377,1.2145,3.5137,5.6777,13.2177,-7.994,-2.9029,5.8463,6.1439,-11.1025,12.4858,-2.2871,19.0422,11.0449,4.1087,4.6974,6.9346,10.8917,0.9003,-13.5174,2.2439,11.5283,12.0406,4.1006,-7.9078,11.1405,-5.7864,20.7477,6.8874,12.9143,19.5856,0.7268,6.4059,9.3124,6.2846,15.6372,5.82,1.1,9.1854,12.5963,-10.3734,0.8748,5.8042,3.7163,-1.1016,7.3667,9.8565,5.0228,-5.7828,2.3612,0.852,6.3577,12.1719,19.7312,19.4465,4.5048,23.2378,6.3191,12.8046,7.4729,15.7811,13.3529,10.1852,5.4604,19.0773,-4.4577,9.5413,11.9052,2.1447,-22.4038,7.0883,14.1613,10.508,14.2621,0.2647,20.4031,17.035999999999998,...,-0.3939,12.6317,14.8863,1.3854,15.0284,3.9995,5.3683,8.6273,14.1963,20.3882,3.2304,5.7033,4.5255,2.1929,3.129,2.9044,1.1696,28.7632,-17.2738,2.1056,21.1613,8.9573,2.7768,-2.1746,3.6932,12.4653,14.1978,-2.5511,-0.9479,17.1092,11.5419,0.0975,8.8186,6.6231,3.9358,-11.7218,24.5437,15.5827,3.8212,8.6674,7.3834,-2.4438,10.2158,7.4844,9.1104,4.3649,11.4934,1.7624,4.0714,-1.2681,14.333,8.0088,4.4015,14.1479,-5.1747,0.5778,14.5362,-1.7624,33.882,11.6041,13.207,5.8442,4.7086,5.7141,-1.041,20.5092,3.279,-5.5952,7.3176,5.769,-7.0927,-3.9116,7.2569,-5.8234,25.682,10.9202,-0.3104,8.8438,-9.7009,2.4013,-4.2935,9.3908,-13.2648,3.1545,23.0866,-5.3,5.3745,-6.266,10.1934,-0.8417,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,2.3061,2.8102,13.8463,11.9704,6.4569,14.8372,10.743,-0.4299,15.9426,13.7257,20.301,12.5579,6.8202,2.7229,12.1354,13.7367,0.8135,-0.9059,5.907,2.8407,-15.2398,10.4407,-2.5731,6.1796,10.6093,-5.9158,8.1723,2.8521,9.1738,0.6665,-3.8294,-1.037,11.777,11.2834,8.0485,-24.684,12.7404,-35.1659,0.7613,8.3838,12.6832,9.5503,1.7895,5.2091,8.0913,12.3972,14.4698,6.585,3.3164,9.4638,15.782,-25.0222,3.4418,-4.3923,8.6464,6.3072,5.6221,23.6143,5.022,-3.9989,4.0462,0.25,1.2516,24.4187,4.529,15.4235,11.6875,23.6273,4.0806,15.2733,0.7839,10.5404,1.6212,-5.2896,1.6027,17.9762,-2.3174,15.6298,4.5474,7.5509,-7.5866,7.0364,14.4027,10.7795,7.2887,-1.093,11.3596,18.1486,...,-19.8592,22.5316,18.6129,1.3512,9.3291,4.2835,10.3907,7.0874,14.3256,14.4135,4.2827,6.975,1.648,11.6896,2.5762,-2.5459,5.3446,38.1015,3.5732,5.0988,30.5644,11.3025,3.9618,-8.2464,2.7038,12.3441,12.5431,-1.3683,3.5974,13.9761,14.3003,1.0486,8.95,7.1954,-1.1984,1.9586,27.5609,24.6065,-2.8233,8.9821,3.8873,15.9638,10.0142,7.8388,9.9718,2.9253,10.4994,4.1622,3.7613,2.3701,18.0984,17.1765,7.6508,18.2452,17.0336,-10.937,12.05,-1.2155,19.975,12.3892,31.8833,5.9684,7.2084,3.8899,-11.0882,17.2502,2.5881,-2.7018,0.5641,5.343,-7.1541,-6.192,18.2366,11.7134,14.7483,8.1013,11.8771,13.9552,-10.4701,5.6961,-3.7546,8.4117,1.8986,7.2601,-0.4639,-0.0498,7.9336,-12.8279,12.4124,1.8489,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,-9.4458,-12.1419,13.8481,7.8895,7.7894,15.0553,8.4871,-3.068,6.5263,11.3152,21.4246,18.9608,10.1102,2.7142,14.208,13.5433,3.1736,-3.3423,5.9015,7.9352,-3.1582,9.4668,-0.0083,19.3239,12.4057,0.6329,2.7922,5.8184,19.3038,1.445,-5.5963,14.0685,11.9171,11.5111,6.9087,-65.4863,13.8657,0.0444,-0.1346,14.4268,13.3273,10.4857,-1.4367,5.7555,-8.5414,14.1482,16.984,6.1812,1.9548,9.2048,8.6591,-27.7439,-0.4952,-1.7839,5.267,-4.3205,6.986000000000001,1.6184,5.0301,-3.2431,40.1236,0.7737,-0.7264,4.5886,-4.5346,23.3521,1.0273,19.16,7.1734,14.3937,2.9598,13.3317,-9.2587,-6.7075,7.8984,14.5265,7.0799,20.167,8.0053,3.7954,-39.7997,7.0065,9.3627,10.4316,14.0553,0.0213,14.7246,35.2988,...,-22.9264,12.3562,17.340999999999998,1.694,7.1179,5.1934,8.823,10.6617,14.0837,28.2749,-0.1937,5.9654,1.0719,7.9923,2.9138,-3.6135,1.4684,25.6795,13.8224,4.7478,41.1037,12.714,5.2964,9.7289,3.937,12.1316,12.5815,7.0642,5.6518,10.9346,11.4266,0.9442,7.7532,6.6173,-6.8304,6.473,17.1728,25.8128,2.6791,13.9547,6.6289,-4.3965,11.7159,16.108,7.6874,9.157,11.567,-12.7047,3.7574,9.911,20.1461,1.2995,5.8493,19.8234,4.7022,10.6101,13.0021,-12.6068,27.0846,8.0913,33.5107,5.6953,5.4663,18.2201,6.5769,21.2607,3.2304,-1.7759,3.1283,5.5518,1.4493,-2.6627,19.8056,2.3705,18.4685,16.3309,-3.3456,13.5261,1.7189,5.1743,-7.6938,9.7685,4.891,12.2198,11.8503,-7.8931,6.4209,5.927,16.0201,-0.2829,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


# 4.1 Reducing memory size by ~50%
Because we make a lot of calculations in this kernel, we'd better reduce the size of the data.

* 300 MB before Reducing
* 150 MB after Reducing

In [16]:
def _smallest_int_dtype(mn, mx):
    """Return the narrowest NumPy integer dtype whose range contains [mn, mx].

    Unsigned types are tried when ``mn`` is non-negative, signed types
    otherwise. Returns ``None`` when no candidate fits, which includes the
    case where the bounds are NaN (every comparison is then False).
    """
    if mn >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a maximum of exactly 255 still fits in uint8.
        if limits.min <= mn and mx <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to smaller dtypes.

    Columns whose values are all whole numbers are cast to the narrowest
    integer dtype that holds their range; every other numeric column is cast
    to ``float32``. Non-numeric columns are left untouched.

    Missing values are replaced by ``column.min() - 1`` before the cast
    (integer dtypes cannot hold NaN). The names of the columns where this
    happened are returned so the caller can account for the sentinel value.
    A column that is entirely NaN has no minimum, so nothing is filled; it is
    still reported and ends up as ``float32``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        When True, print the dtype of every processed column before and
        after the conversion. The memory summary is always printed.

    Returns
    -------
    (pandas.DataFrame, list)
        The same ``df`` object and the list of column names that contained
        missing values.
    """
    start_mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in df.columns:
        series = df[col]
        # Only numeric (and bool) columns can be downcast; strings, datetimes,
        # categoricals etc. are skipped instead of failing inside NumPy.
        if not pd.api.types.is_numeric_dtype(series):
            continue

        if verbose:
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",series.dtype)

        mx = series.max()
        mn = series.min()

        # Integer does not support NA, therefore, NA needs to be filled.
        if series.isna().any():
            NAlist.append(col)
            if pd.notna(mn):
                fill_value = mn - 1
                # Reassign rather than fill in place on a column selection,
                # which does not reliably write back into the frame.
                series = series.fillna(fill_value)
                # The sentinel is the new minimum; the dtype choice below
                # must see it, otherwise e.g. -1 would be cast to uint8.
                mn = fill_value

        if pd.api.types.is_integer_dtype(series) or pd.api.types.is_bool_dtype(series):
            is_int = True
        else:
            # Check every element: summing the fractional parts would let
            # values such as 0.5 and -0.5 cancel and wrongly pass as integers.
            # Columns holding inf (or only NaN) can never be integers.
            is_int = bool(
                np.isfinite(series).all()
                and (series == np.trunc(series)).all()
            )

        target = _smallest_int_dtype(mn, mx) if is_int else np.float32
        if target is not None:
            series = series.astype(target)
        df[col] = series

        if verbose:
            print("dtype after: ",df[col].dtype)
            print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return df, NAlist

Reduce memory usage for both the train and test sets.

In [17]:
# Shrink the training frame and report which columns had missing values filled.
# A single print call with a newline separator emits the same four lines.
train, NAlist = reduce_mem_usage(train)
print(
    "-"*50,
    "Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ",
    "-"*50,
    NAlist,
    sep="\n",
)

Memory usage of properties dataframe is : 308.2276153564453  MB
******************************
Column:  target
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  var_0
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_1
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_2
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_3
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_4
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_5
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_6
dtype before:  float64
dtype after:  fl

dtype after:  float32
******************************
******************************
Column:  var_66
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_67
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_68
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_69
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_70
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_71
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_72
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_73
dtype before:  float64
dtype after:  fl

dtype after:  float32
******************************
******************************
Column:  var_134
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_135
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_136
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_137
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_138
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_139
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_140
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_141
dtype before:  float64
dtype af

In [18]:
# Shrink the test frame and report which columns had missing values filled.
# NAlist is rebound here, so it now refers to the test frame's columns.
test, NAlist = reduce_mem_usage(test)
print(
    "-"*50,
    "Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ",
    "-"*50,
    NAlist,
    sep="\n",
)

Memory usage of properties dataframe is : 306.7017364501953  MB
******************************
Column:  var_0
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_1
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_2
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_3
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_4
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_5
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_6
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_7
dtype before:  float64
dtype after: 

dtype after:  float32
******************************
******************************
Column:  var_67
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_68
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_69
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_70
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_71
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_72
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_73
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_74
dtype before:  float64
dtype after:  fl

dtype after:  float32
******************************
******************************
Column:  var_135
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_136
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_137
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_138
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_139
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_140
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_141
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  var_142
dtype before:  float64
dtype af

# 4.2 Data set fields

In [21]:
# Print the training frame's index range, column dtypes and memory footprint.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float32(200), object(1), uint8(1)
memory usage: 154.3+ MB


# 4.3 Numerical values Describe

In [22]:
# Summary statistics (count, mean, std, min, ...) for the numeric columns of the training frame.
train.describe()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49,var_50,var_51,var_52,var_53,var_54,var_55,var_56,var_57,var_58,var_59,var_60,var_61,var_62,var_63,var_64,var_65,var_66,var_67,var_68,var_69,var_70,var_71,var_72,var_73,var_74,var_75,var_76,var_77,var_78,var_79,var_80,var_81,var_82,var_83,var_84,var_85,var_86,var_87,var_88,var_89,var_90,var_91,var_92,var_93,var_94,var_95,var_96,var_97,var_98,...,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109,var_110,var_111,var_112,var_113,var_114,var_115,var_116,var_117,var_118,var_119,var_120,var_121,var_122,var_123,var_124,var_125,var_126,var_127,var_128,var_129,var_130,var_131,var_132,var_133,var_134,var_135,var_136,var_137,var_138,var_139,var_140,var_141,var_142,var_143,var_144,var_145,var_146,var_147,var_148,var_149,var_150,var_151,var_152,var_153,var_154,var_155,var_156,var_157,var_158,var_159,var_160,var_161,var_162,var_163,var_164,var_165,var_166,var_167,var_168,var_169,var_170,var_171,var_172,var_173,var_174,var_175,var_176,var_177,var_178,var_179,var_180,var_181,var_182,var_183,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,...,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.10049,10.679972648620604,-1.627627611160278,10.715126037597656,6.796522617340088,11.078254699707031,-5.065340042114258,5.408960819244385,16.545988082885742,0.284159243106842,7.567198753356934,0.394344419240952,-3.245583534240723,14.023975372314451,8.53026008605957,7.537585735321045,14.573036193847656,9.333322525024414,-5.696696281433105,15.244025230407717,12.43842601776123,13.290937423706056,17.25794792175293,4.305424690246582,3.019564151763916,10.584388732910156,13.667485237121582,-4.055144309997559,-1.137891292572021,5.532967090606689,5.053869247436523,-7.687808036804199,10.393088340759276,-0.512887895107269,14.774120330810549,11.434164047241213,3.842468976974487,2.187225580215454,5.868919372558594,10.642142295837402,0.662957668304443,-6.725560665130615,9.299928665161133,11.22245979309082,11.569952964782717,8.948315620422363,-12.699785232543944,11.326496124267578,-12.47187614440918,14.704588890075684,16.682533264160156,12.741005897521973,13.42894458770752,-2.528791666030884,6.008542060852051,1.137119650840759,12.745891571044922,16.629117965698242,6.27198314666748,3.177616357803345,8.931174278259277,12.155556678771973,-11.946684837341309,0.874156653881073,0.661172807216644,6.369136810302734,0.982893347740173,5.7940993309021,11.943212509155272,5.011508464813232,-3.331482172012329,24.446775436401367,0.669750154018402,0.640558302402496,19.61104965209961,19.51864051818848,16.85371208190918,6.050802707672119,19.0672664642334,5.349456310272217,14.4022216796875,5.795083045959473,14.718932151794434,-3.471280574798584,1.02582848072052,-2.590219974517822,18.362924575805664,5.621063709259033,11.351329803466797,8.70287036895752,3.725199699401855,-16.548107147216797,6.98759126663208,12.739579200744627,10.556807518005373,10.999174118041992,-0.084343619644642,14.400341033935549,18.539531707763672,1.752007842063904,...,-6.600573062896729,13.413535118103027,22.29488182067871,1.568377614021301,11.509880065917969,4.244776725769043,8.617803573608398,17.796024322509766,14.225020
408630373,18.45799827575684,5.513267517089844,6.312582492828369,3.317868709564209,8.136541366577148,3.081205368041992,2.213719844818115,2.402563571929932,16.10228729248047,-5.305082321166992,3.03284740447998,24.521263122558597,11.310552597045898,1.192979454994202,7.076220035552979,4.272717475891113,12.489068984985352,13.201952934265137,0.851496040821075,-1.127952337265015,15.460216522216797,12.257137298583984,0.544669091701508,7.799696445465088,6.813328742980957,-4.826114177703857,-4.259453773498535,22.968511581420895,17.613506317138672,1.210797786712646,7.760214805603027,3.423616886138916,2.897593975067139,11.983561515808104,12.333633422851562,8.647666931152344,4.841372966766357,10.34124755859375,-3.300761938095093,3.990678548812866,5.296236038208008,16.81772804260254,10.141605377197266,7.633148670196533,16.728004455566406,6.975027561187744,-2.074105739593506,13.209324836730955,-4.813564777374268,17.914653778076172,10.223241806030272,24.259389877319336,5.633270740509033,5.362925052642822,11.002311706542969,-2.871894598007202,19.315820693969727,2.963322877883911,-4.151159763336182,4.937124252319336,5.636033535003662,-0.004962081089616,-0.831779837608337,19.8170108795166,-0.677969634532928,20.2105770111084,11.640645027160645,-2.799602746963501,11.882902145385742,-1.014055371284485,2.591427803039551,-2.741673231124878,10.085389137268066,0.719106554985046,8.769108772277832,12.756799697875977,-3.983258008956909,8.970232009887695,-10.334982872009276,15.377336502075195,0.746070563793182,3.234436273574829,7.438448905944824,1.92783260345459,3.331773042678833,17.993810653686523,-0.142086714506149,2.303354501724243,8.908224105834961,15.870863914489746,-3.326541900634766
std,0.300652975806093,3.039989709854126,4.049966812133789,2.640832424163818,2.043276309967041,1.623113632202148,7.863115310668945,0.866584956645966,3.41800856590271,3.332575798034668,1.235053658485413,5.500724792480469,5.970123767852783,0.190055906772614,4.639412879943848,2.247882127761841,0.411704421043396,2.557370185852051,6.712454319000244,7.851138114929199,7.996533870697021,5.876118183135986,8.196417808532715,2.847906351089478,0.526885569095612,3.777189970016479,0.285532176494598,5.92206859588623,1.523681402206421,0.783351898193359,2.615882873535156,7.965038299560547,2.159844875335693,2.587777137756348,4.322238922119141,0.541601359844208,5.179450988769531,3.119930982589722,2.249698162078857,4.278829574584961,4.068767070770264,8.279062271118164,5.937953472137451,0.695975065231323,0.309592455625534,5.902958869934082,21.404443740844727,2.860468149185181,10.579604148864746,11.384071350097656,7.85557222366333,0.691693484783173,8.187142372131348,4.985429286956787,0.764742493629456,8.414061546325684,5.689966201782227,3.540095806121826,0.795010805130005,4.296596527099609,0.854786276817322,4.222318649291992,11.622723579406738,2.026190042495728,3.113039016723633,1.485832571983337,3.786406517028809,1.121347427368164,7.3649582862854,0.010303688235581,3.955642461776733,11.951518058776855,0.266689985990524,3.944607973098755,7.466159820556641,14.112260818481444,6.055236339569092,7.938201904296875,3.817216873168945,1.993747711181641,1.309025526046753,7.436588287353516,2.299530267715454,8.479105949401855,8.297067642211914,6.225222110748291,3.908465147018433,7.751003742218018,5.661761283874512,2.491418600082397,3.56048059463501,13.152610778808594,0.152637347579002,4.186184406280518,0.543327867984772,2.768021821975708,0.621113061904907,8.525278091430664,12.642186164855955,0.715823292732239,...,9.181514739990234,4.950465679168701,8.627997398376465,0.185015738010406,1.970481038093567,0.8556809425354,1.894861340522766,7.604542255401611,0.171087518334389,4.354949951171875,3.8231692314
14795,1.082382798194885,1.591144442558289,4.458998203277588,0.985375225543976,2.621798038482666,1.650875210762024,13.297452926635742,8.799118041992188,4.182724475860596,12.120746612548828,1.714380383491516,5.168372631072998,6.147232055664062,2.736760139465332,0.318095624446869,0.776041269302368,3.137636423110962,3.238004684448242,4.136397361755371,0.832186996936798,0.456267684698105,1.456459999084473,0.375595420598984,6.16598653793335,7.617561817169189,10.382041931152344,8.890372276306152,4.551647663116455,7.686263084411621,4.896228313446045,6.715538024902344,5.691828727722168,2.934664011001587,0.922449052333832,3.899200677871704,2.518821001052856,7.41307544708252,0.199188709259033,10.384926795959473,2.464115381240845,3.962347030639648,3.00531268119812,2.014151334762573,4.961565017700195,5.771162986755371,0.955116808414459,5.570167064666748,7.88541841506958,4.122843265533447,10.880002021789553,0.217932149767876,1.419586420059204,5.261942863464355,5.45767068862915,5.024100303649902,0.369678735733032,7.797882556915283,3.105921983718872,0.369428694248199,4.424531936645508,5.377870559692383,8.674015998840332,5.966554164886475,7.13629150390625,2.892102718353271,7.513805866241455,2.628840684890747,8.579662322998047,2.798909902572632,5.2611083984375,1.371829271316528,8.963241577148438,4.474839210510254,9.318074226379396,4.725077152252197,3.189717054367065,11.574515342712402,3.944553375244141,0.976326882839203,4.559854030609131,3.023208379745483,1.478387713432312,3.991934061050415,3.135113477706909,1.429340600967407,5.454273700714111,0.92160302400589,3.010892391204834,10.437831878662108
min,0.0,0.408399999141693,-15.043399810791016,2.11710000038147,-0.040199998766184,5.07480001449585,-32.562599182128906,2.347300052642822,5.349699974060059,-10.505499839782717,3.970499992370605,-20.731300354003903,-26.09499931335449,13.434599876403809,-6.011099815368652,1.013299942016602,13.076899528503418,0.63510000705719,-33.38019943237305,-10.664199829101562,-12.402500152587892,-5.432199954986572,-10.08899974822998,-5.322500228881836,1.209800004959106,-0.678399980068207,12.720000267028809,-24.243099212646484,-6.166800022125244,2.089600086212158,-4.787199974060059,-34.79840087890625,2.140599966049194,-8.986100196838379,1.508499979972839,9.816900253295898,-16.513599395751953,-8.095100402832031,-1.18340003490448,-6.337100028991699,-14.545700073242188,-35.211700439453125,-8.535900115966797,8.859000205993652,10.652799606323242,-9.939599990844728,-90.25250244140624,1.206200003623962,-47.686199188232415,-23.902200698852536,-8.070699691772461,10.385499954223633,-15.046199798583984,-24.721399307250977,3.344899892807007,-26.778600692749023,-3.782599925994873,2.761800050735474,3.442300081253052,-12.600899696350098,6.184000015258789,-2.100600004196167,-48.80270004272461,-6.32889986038208,-10.554400444030762,1.611700057983398,-14.088800430297852,1.3367999792099,-19.544300079345703,4.993800163269043,-16.30940055847168,-17.02750015258789,-0.224000006914139,-12.383399963378906,-1.665799975395203,-34.10150146484375,-1.293599963188171,-21.63330078125,7.425700187683105,-1.818300008773804,10.445400238037108,-18.042200088500977,7.58650016784668,-30.026599884033203,-24.22010040283203,-24.43980026245117,7.02299976348877,-19.272199630737305,-8.481599807739258,1.350200057029724,-9.601400375366213,-61.71799850463867,6.52180004119873,-1.018499970436096,8.491600036621094,2.819000005722046,-2.432399988174438,-12.15839958190918,-21.739999771118164,-0.603500008583069,...,-39.17910003662109,0.075699999928474,-7.382900238037109,0.979300022125244,4.08459997177124,0.715300023555756,0.94239997863769
5,-5.89799976348877,13.729000091552734,5.769700050354004,-9.239800453186035,2.194200038909912,-2.030200004577637,-5.513899803161621,-0.050500001758337,-6.85860013961792,-3.163000106811523,-31.836900711059567,-37.527698516845696,-9.774200439453123,-18.696199417114254,6.305200099945068,-15.194000244140623,-12.40590000152588,-7.053800106048584,11.48610019683838,11.265399932861328,-8.876899719238281,-11.755900382995604,2.186300039291382,9.528300285339355,-0.954800009727478,2.890000104904175,5.359300136566162,-24.254600524902344,-31.380800247192383,-9.949299812316896,-9.85099983215332,-16.468399047851562,-21.27429962158203,-15.459500312805176,-16.693700790405273,-7.107999801635742,2.80679988861084,5.444300174713135,-8.27340030670166,0.42739999294281,-29.983999252319336,3.320499897003174,-41.1683006286621,9.241999626159668,-2.191499948501587,-2.880000114440918,11.030799865722656,-8.196599960327148,-21.840900421142575,9.996500015258787,-22.99040031433105,-4.554399967193604,-4.641600131988525,-7.452199935913086,4.85260009765625,0.623099982738495,-6.531700134277344,-19.997699737548828,3.816699981689453,1.851199984550476,-35.96950149536133,-5.250199794769287,4.258800029754639,-14.505999565124512,-22.47929954528809,-11.45330047607422,-22.748699188232425,-2.995300054550171,3.241499900817871,-29.116500854492188,4.952099800109863,-29.273399353027344,-7.856100082397461,-22.037399291992188,5.416500091552734,-26.001100540161133,-4.808199882507324,-18.48970031738281,-22.58329963684082,-3.022300004959106,-47.75360107421875,4.412300109863281,-2.55430006980896,-14.093299865722656,-2.691699981689453,-3.814500093460083,-11.78339958190918,8.6943998336792,-5.261000156402588,-14.209600448608398,5.960599899291992,6.299300193786621,-38.85279846191406
25%,0.0,8.453850269317627,-4.74002480506897,8.722474813461304,5.254074931144714,9.883174657821655,-11.200350046157835,4.7677001953125,13.94379997253418,-2.317800045013428,6.618800163269043,-3.594949901103973,-7.510600090026855,13.894000053405762,5.072800159454346,5.781874895095825,14.262800216674805,7.452275037765503,-10.476225137710571,9.177950382232666,6.276475191116333,8.627799987792969,11.550999641418455,2.182399988174438,2.634099960327148,7.61299991607666,13.45639991760254,-8.321725130081177,-2.307899951934814,4.992099761962891,3.171700000762939,-13.766175031661987,8.869999885559082,-2.500874936580658,11.456299781799316,11.032299995422363,0.116974998265505,-0.007125000003725,4.125475168228149,7.591050148010254,-2.19950008392334,-12.831825256347656,4.519574880599976,10.713199615478516,11.343799591064451,5.313650131225586,-28.730699539184567,9.24875020980835,-20.654524326324463,6.351974964141846,10.653474807739258,12.269000053405762,7.267625093460083,-6.065025091171265,5.435599803924561,-5.147625088691711,8.163900375366211,14.097875356674194,5.6875,0.18350000679493,8.312399864196777,8.912749767303467,-20.90172433853149,-0.572399973869324,-1.58870005607605,5.293499946594238,-1.702800035476685,4.973800182342529,6.753200054168701,5.013999938964844,-6.33662486076355,15.256624698638916,0.472299993038177,-2.197099924087524,14.097274780273438,9.595974683761597,12.480974912643433,0.596300005912781,16.014699935913086,3.817275047302246,13.375399589538574,0.694475024938583,13.21477484703064,-10.004950046539308,-5.106400012969971,-7.216125130653381,15.3385751247406,0.407549992203712,7.247174978256226,6.918774962425232,1.140499949455261,-26.665599822998047,6.869900226593018,9.670299530029297,10.195599555969238,8.82800006866455,-0.527400016784668,7.796949982643127,8.919524669647217,1.26767498254776,...,-13.198699951171877,9.639800071716309,16.047975063323975,1.428900003433228,10.097900390625,3.639600038528442,7.282299995422363,12.16807460784912,14.098899841308594,15.1071746349
33472,2.817475080490112,5.5100998878479,2.092675030231476,4.803249955177307,2.388774931430817,0.399699985980988,1.171875029802322,6.373499989509583,-11.587849855422974,-0.161975000053644,15.696274757385254,9.996399879455566,-2.565200090408325,2.817050039768219,2.353600025177002,12.245400428771973,12.608400344848633,-1.502325028181076,-3.580724954605103,12.514474630355837,11.61929988861084,0.207800000905991,6.724375009536743,6.543499946594238,-9.625699996948242,-9.95709991455078,14.933899879455566,10.65654993057251,-2.011825025081635,2.387574970722198,-0.121699996292591,-2.15372508764267,7.900000095367432,10.311200141906738,7.968075037002563,1.885875016450882,8.646900177001953,-8.751450061798096,3.853600025177002,-1.903200030326843,14.952199935913086,7.064599990844727,5.56790018081665,15.232999801635742,3.339900016784668,-6.266024827957153,12.475099563598633,-8.939950227737427,12.10919952392578,7.24352490901947,15.696125268936155,5.470499992370605,4.326099872589111,7.029600143432617,-7.094025135040283,15.744550228118896,2.698999881744385,-9.643099784851074,2.703200101852417,5.374599933624268,-3.258500099182129,-4.72035014629364,13.731775045394896,-5.009525060653687,15.064599990844728,9.371600151062012,-8.386500358581543,9.808674812316896,-7.395699977874756,0.625574991106987,-6.673900127410889,9.084699630737305,-6.064425110816956,5.423099994659424,5.663300037384033,-7.360000133514404,6.715199947357178,-19.205124378204346,12.501550197601318,0.014899999834597,-0.058825000189245,5.157400131225586,0.889775007963181,0.58459997177124,15.629799842834473,-1.170699954032898,-1.946925014257431,8.252799987792969,13.829700469970703,-11.20847463607788
50%,0.0,10.524750232696531,-1.608050048351288,10.579999923706056,6.824999809265137,11.1082501411438,-4.833149909973145,5.3850998878479,16.45680046081543,0.393700003623962,7.629600048065186,0.487300008535385,-3.286949992179871,14.025500297546388,8.604249954223633,7.520299911499023,14.574099540710447,9.232049942016602,-5.6663498878479,15.196249961853027,12.453900337219238,13.196800231933594,17.23425006866455,4.275150060653687,3.008649945259094,10.38034963607788,13.662500381469728,-4.196899890899658,-1.132099986076355,5.534850120544434,4.950200080871582,-7.411749839782715,10.365649700164797,-0.497649997472763,14.576000213623049,11.435199737548828,3.917750000953674,2.197999954223633,5.900650024414062,10.562700271606444,0.672299981117249,-6.617449998855591,9.162649631500244,11.243399620056152,11.5649995803833,9.437199592590332,-12.547200202941896,11.310750007629396,-12.482399940490724,14.559200286865234,16.67240047454834,12.745599746704102,13.4443998336792,-2.502449989318848,6.0278000831604,1.274049997329712,12.59409999847412,16.648149490356445,6.262499809265137,3.170100092887878,8.901000022888184,12.06434965133667,-11.892000198364258,0.794700026512146,0.681699991226196,6.377699851989746,1.021349966526031,5.782000064849854,11.921999931335447,5.019100189208984,-3.325500011444092,24.444999694824215,0.66839998960495,0.646449983119965,19.309749603271484,19.53664970397949,16.844200134277344,6.297800064086914,18.967849731445312,5.44005012512207,14.38884973526001,6.061749935150146,14.844499588012695,-3.284450054168701,1.069700002670288,-2.517949938774109,18.2964506149292,6.006700038909912,11.288000106811523,8.61620044708252,3.642549991607666,-16.482600212097168,6.986499786376953,12.673500061035156,10.582200050354004,10.983850002288818,-0.098600000143051,14.36989974975586,18.50214958190918,1.76830005645752,...,-6.401500225067139,13.380850315093994,22.30685043334961,1.565999984741211,11.497950077056885,4.224500179290771,8.605149745941162,17.573200225830078,14.22659969329834,18.28
1350135803223,5.394299983978271,6.340099811553955,3.408400058746338,8.148550033569336,3.083800077438354,2.249850034713745,2.456300020217896,15.944849967956545,-5.189500093460083,3.023949980735779,24.354700088500977,11.239700317382812,1.200700044631958,7.234300136566162,4.30210018157959,12.486300468444824,13.166799545288086,0.925000011920929,-1.101749956607819,15.426799774169922,12.264649868011476,0.556599974632263,7.809100151062012,6.806700229644775,-4.704249858856201,-4.111899852752686,22.9483003616333,17.25724983215332,1.211749970912933,8.066250324249268,3.56469988822937,2.975499987602234,11.855899810791016,12.35634994506836,8.65185022354126,4.904700040817261,10.395600318908691,-3.178699970245361,3.996000051498413,5.283249855041504,16.736949920654297,10.127900123596191,7.673699855804443,16.649749755859375,6.994050025939941,-2.066099882125854,13.184300422668455,-4.868400096893311,17.63045024871826,10.217549800872805,23.864500045776367,5.633500099182129,5.359700202941895,10.788700103759766,-2.637799978256226,19.27079963684082,2.960200071334839,-4.011600017547607,4.761600017547607,5.634300231933594,0.002799999900162,-0.807349979877472,19.748000144958496,-0.569750010967255,20.206100463867188,11.679800033569336,-2.538450002670288,11.737249851226808,-0.942049980163574,2.51230001449585,-2.688800096511841,10.036050319671633,0.720200002193451,8.600000381469727,12.520999908447266,-3.94694995880127,8.902149677276611,-10.209749698638916,15.239449977874756,0.74260002374649,3.20359992980957,7.347749948501587,1.901299953460693,3.396350026130676,17.9579496383667,-0.172700002789497,2.408900022506714,8.888199806213379,15.9340500831604,-2.819550037384033
75%,0.0,12.75819969177246,1.358625024557114,12.51669979095459,8.32409954071045,12.261124849319458,0.924799978733063,6.002999782562256,19.1028995513916,2.937900066375732,8.584425210952759,4.382925152778625,0.85282501578331,14.164199829101562,12.27477502822876,9.270425081253052,14.874500274658203,11.055899620056152,-0.810775011777878,21.013324737548828,18.433300018310547,17.8794002532959,23.089050292968754,6.293200016021729,3.403800010681152,13.479599952697754,13.863699913024902,-0.090199999511242,0.015624999534339,6.093699932098389,6.798925042152405,-1.443450033664703,11.885000228881836,1.469099998474121,18.097124576568604,11.844400405883787,7.487725019454956,4.460400104522705,7.542399883270264,13.5989248752594,3.637825012207031,-0.880875021219254,13.754799842834473,11.7568998336792,11.804599761962892,13.087300300598145,3.150525033473969,13.318300247192385,-4.244524836540222,23.02865028381348,22.549049854278564,13.234499931335447,19.385650157928467,0.944350004196167,6.542900085449219,7.401825070381165,17.08662462234497,19.28969955444336,6.84499979019165,6.209700107574463,9.566524744033812,15.116499900817873,-3.225450098514557,2.22819995880127,3.020299911499023,7.490600109100342,3.739200115203857,6.58620023727417,17.037650108337402,5.024099826812744,-0.498874999582767,33.63314914703369,0.864400029182434,3.510699987411499,25.207124710083008,29.62070083618164,21.4322247505188,11.81879997253418,22.041099548339844,6.867199897766113,15.383099555969238,11.449124574661257,16.34079933166504,3.101725041866302,7.449900150299072,1.986700057983398,21.358850479125977,11.158375263214111,15.43322491645813,10.567025184631348,6.146200180053711,-6.409374952316284,7.101399898529053,15.840225219726562,10.944899559020996,13.089099884033203,0.329100012779236,20.819375514984127,28.158974647521973,2.260900020599365,...,0.132100000977516,17.250225067138672,28.6822247505188,1.705399990081787,12.902099609375,4.822199821472168,9.928899765014648,23.34859943389893,14.36180019378662,21.852899551391
6,8.104324579238892,7.080299854278564,4.577400207519531,11.596199989318848,3.811899900436401,4.121500015258789,3.66510009765625,25.780824661254883,0.971800029277802,6.098400115966797,33.105276107788086,12.619425058364868,5.091700077056885,11.734750270843506,6.192200183868408,12.718099594116213,13.811699867248535,3.292999982833862,1.351699948310852,18.480400085449215,12.876700401306152,0.901000022888184,8.9114248752594,7.070799827575684,-0.178800001740456,1.125950008630753,31.04242515563965,24.426025390625,4.391225099563599,13.232525110244753,7.078524827957153,8.192425012588501,16.073925018310547,14.461050271987917,9.3149995803833,7.67692494392395,12.113225221633911,2.028274953365326,4.131599903106689,12.688225030899048,18.6825008392334,13.057600021362305,9.817299842834473,18.263900756835938,10.766350269317629,1.891750007867813,13.92930030822754,-0.988575011491776,23.87532520294189,13.0945246219635,32.622849464416504,5.791999816894531,6.371200084686279,14.623900413513184,1.323600053787231,23.024024963378903,3.241499900817871,1.31872496008873,7.020025014877319,5.905399799346924,3.096400022506714,2.956799983978271,25.90772485733032,3.619899988174438,25.64122438430786,13.74549961090088,2.704400062561035,13.931300163269045,5.338749885559082,4.391124963760376,0.996200025081635,11.011300086975098,7.499175190925598,12.12742519378662,19.4561505317688,-0.590649977326393,11.19379997253418,-1.465999960899353,18.34522485733032,1.482900023460388,6.406199932098389,9.512524843215942,2.94950008392334,6.20580005645752,20.396524906158447,0.829599976539612,6.556725144386292,9.593299865722656,18.064724445343018,4.836800098419189
max,1.0,20.315000534057617,10.37679958343506,19.35300064086914,13.188300132751465,16.67140007019043,17.25160026550293,8.447699546813965,27.69179916381836,10.151300430297852,11.150600433349608,18.67020034790039,17.18869972229004,14.654500007629396,22.331499099731445,14.937700271606444,15.863300323486328,17.950599670410156,19.02589988708496,41.748001098632805,35.18299865722656,31.2859001159668,49.044300079345696,14.594499588012695,4.875199794769287,25.44599914550781,14.654600143432615,15.675100326538086,3.243099927902222,8.787400245666504,13.143099784851074,15.65149974822998,20.171899795532227,6.787099838256836,29.546600341796875,13.287799835205078,21.52890014648437,14.245599746704102,11.863800048828123,29.823499679565433,15.322299957275392,18.105600357055664,26.16580009460449,13.469599723815918,12.577899932861328,34.19609832763672,62.08440017700195,21.29389953613281,20.685400009155277,54.27379989624024,41.15299987792969,15.31719970703125,40.68899917602539,17.096799850463867,8.231499671936035,28.572399139404297,29.092100143432617,29.07410049438477,9.160900115966797,20.483299255371094,11.986700057983398,25.195499420166016,27.1028995513916,7.753600120544434,11.23169994354248,11.15369987487793,15.731300354003906,9.713199615478516,39.39680099487305,5.046899795532227,8.547300338745117,64.46440124511719,1.571900010108948,14.149999618530272,44.53609848022461,70.27200317382812,36.156700134277344,34.43519973754883,30.956899642944336,11.350700378417969,18.225599288940433,30.476900100708008,23.132400512695312,21.893400192260746,27.71430015563965,17.742399215698242,32.901100158691406,34.56370162963867,33.35409927368164,17.459400177001953,15.481599807739258,27.271299362182617,7.489500045776367,26.99760055541992,12.534299850463867,18.975000381469727,1.804000020027161,40.88059997558594,58.287899017333984,4.502799987792969,...,25.140899658203125,28.459400177001957,51.32649993896485,2.188699960708618,19.02059936523437,7.16919994354248,15.30739974975586,46.37950134277344,14.74300003051
7578,32.05910110473633,19.519300460815433,9.800200462341309,8.431699752807617,21.54210090637207,6.585000038146973,11.950400352478027,8.120699882507324,64.81089782714844,25.263500213623047,15.68850040435791,74.03209686279298,17.30739974975586,18.471399307250977,26.874900817871094,14.991499900817873,13.664199829101562,15.515600204467772,10.597599983215332,9.809599876403809,31.203599929809567,14.989500045776367,2.192300081253052,12.465000152587892,8.309100151062012,12.723600387573242,21.41279983520508,54.57939910888672,44.437599182128906,18.818700790405277,36.09709930419922,21.12190055847168,23.96579933166504,32.8911018371582,22.691600799560547,11.810099601745604,16.00830078125,20.437299728393555,22.149400711059567,4.752799987792969,48.42399978637695,25.435699462890625,21.124500274658203,18.384599685668945,24.007499694824215,23.242799758911133,16.831600189208984,16.496999740600586,11.972100257873535,44.77949905395508,25.1200008392334,58.39419937133789,6.309899806976318,10.134400367736816,27.56480026245117,12.11929988861084,38.33219909667969,4.220399856567383,21.276599884033203,14.886099815368652,7.089000225067139,16.73189926147461,17.917299270629883,53.591899871826165,18.855400085449215,43.54679870605469,20.854799270629883,20.245199203491207,20.59650039672852,29.84129905700684,13.448699951171877,12.750499725341797,14.39389991760254,29.248699188232425,23.70490074157715,44.363399505615234,12.9975004196167,21.739200592041016,22.78610038757324,29.330299377441406,4.03410005569458,18.440900802612305,16.71649932861328,8.402400016784668,18.281799316406246,27.928800582885746,4.272900104522705,18.32150077819824,12.000399589538574,26.079099655151367,28.50069999694824


# 4.4 Visualizations

Visualizations done in the baseline script.

# 4.5 Data Preprocessing

In [27]:
def check_missing_data(df):
    """Summarise missing values per column of ``df``.

    Returns ``False`` when no column contains a missing value. Otherwise
    returns a transposed frame with one column per input column and three
    rows: 'Total' (count of missing values), 'Percent' (missing count divided
    by the number of rows, i.e. a fraction) and 'Types' (dtype name).
    """
    null_counts = df.isnull().sum()

    # Guard clause: nothing to report when every count is zero.
    if not null_counts.any():
        return False

    summary = pd.concat(
        [null_counts, null_counts / df.shape[0]],
        axis=1,
        keys=['Total', 'Percent'],
    )
    summary['Types'] = [str(df[name].dtype) for name in df.columns]
    return np.transpose(summary)

In [28]:
# Missing-value report for the training set; False means no column has missing values.
check_missing_data(train)

False

In [29]:
# Missing-value report for the test set; False means no column has missing values.
check_missing_data(test)

False

# 4.6 Binary Classification

In [30]:
# Distinct label values present in the target column.
train['target'].unique()

array([0, 1], dtype=uint64)

# 4.7 Is the data set imbalanced?

The target classes are heavily imbalanced, but how can we solve it?

In [32]:
# Number of training rows in each target class.
train['target'].value_counts()

0    179902
1     20098
Name: target, dtype: int64

In [35]:
def check_balance(df, target):
    """Print the row count of ``df`` and the percentage of rows in class 0 and 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label column.
    target : str
        Name of the binary label column.

    Returns
    -------
    None
        The report is printed, not returned.

    A class with no rows is reported as ``0.0 %`` rather than raising
    ``KeyError``, and an empty frame reports ``0.0 %`` for both classes
    rather than dividing by zero.
    """
    n_rows = df.shape[0]
    print('size of data is : ', n_rows)

    # Count once; the counts do not change between loop iterations.
    class_counts = df[target].value_counts()

    for i in [0, 1]:
        print('for target {} ='.format(i))
        # .get() treats an absent class as zero rows.
        share = (class_counts.get(i, 0) / n_rows * 100) if n_rows else 0.0
        print(share, '%')

In [36]:
# Print the share of training rows that falls in each target class.
check_balance(train, 'target')

size of data is :  200000
for target 0 =
89.95100000000001 %
for target 1 =
10.049 %


# 4.8 Skewness and kurtosis

In [38]:
# Skewness and kurtosis of the target column, as computed by pandas.
print('Skewness : ', train['target'].skew())
print('Kurtosis : ', train['target'].kurt())

Skewness :  2.6576420477382454
Kurtosis :  5.063111884925181


# 5.0 Machine learning Explainability for Santander

In this section, I want to try to extract insights from models with the help of this excellent course on Kaggle. The goals of ML Explainability for Santander are:

* All features have meaningless names (var_1, var_2, ...), but the importance of each one is certainly different!
* Extract insights from models.
* Find the most important features in the models.
* Understand the effect of each feature on the model's predictions.

# 5.1 Permutation Importance

In this section we will answer the following questions:
* Which features have the biggest impact on predictions?
* How can we extract insights from models?

Prepare our data for modeling

In [39]:
# Split the training frame into label and predictors; ID_code is dropped
# from both the training and the test predictors.
cols = ['target', 'ID_code']
y = train['target']
X = train.drop(columns=cols)
X_test = test.drop(columns='ID_code')

Create a sample model to calculate which features are more important

In [40]:
# Hold out a validation split (train_test_split's default size); random_state makes the shuffle repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
# Random forest with default hyperparameters, fitted on the training part only.
rfc_model = RandomForestClassifier(random_state=0).fit(train_X, train_y)

# 5.2 How to calculate and show importances?

Here is how to calculate and show importances with the eli5 library:

In [43]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the already-fitted random forest, measured on the held-out split.
perm = PermutationImportance(rfc_model, random_state=1).fit(val_X, val_y)

In [44]:
# Display the permutation weights labelled with the validation column names; top=150 caps how many features are listed.
eli5.show_weights(perm, feature_names = val_X.columns.tolist(), top=150)

Weight,Feature
0.0002  ± 0.0002,var_110
0.0001  ± 0.0001,var_157
0.0001  ± 0.0001,var_162
0.0001  ± 0.0001,var_42
0.0001  ± 0.0002,var_170
0.0001  ± 0.0002,var_174
0.0001  ± 0.0001,var_188
0.0001  ± 0.0001,var_147
0.0001  ± 0.0001,var_197
0.0001  ± 0.0001,var_47


# 5.3 What can be inferred from the above?
* As you move down from the top of the table, the importance of the features decreases.
* Features shown in green have a positive impact on our prediction.
* Features shown in white have no effect on our prediction.
* Features shown in red have a negative impact on our prediction.
* The most important feature was var_110.

# 5.4 Partial Dependence Plots

While feature importance shows which variables most affect predictions, partial dependence plots show how a feature affects predictions. Partial dependence plots are calculated after a model has been fit.

In [45]:
# Same split arguments (X, y, random_state=1) as the earlier random-forest cell, so the same partition is reproduced here.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
# Depth-limited tree (max_depth=5) fitted on the training part only.
tree_model = DecisionTreeClassifier(random_state=0, max_depth=5, min_samples_split=5).fit(train_X, train_y)

For the sake of explanation, I use a decision tree, which you can see below.

In [48]:
# Predictor column names: every training column except the ID and the label.
features = [name for name in train.columns if name not in ('ID_code', 'target')]

In [49]:
from sklearn import tree
import graphviz
# With out_file=None the exported tree is returned instead of being written to a file.
# NOTE(review): tree_graph is not rendered in the visible cells; graphviz.Source(tree_graph) would presumably display it - confirm.
tree_graph = tree.export_graphviz(tree_model, out_file=None, feature_names=features)

# 5.5 Partial Dependence Plot
In this section, we see the impact of the main variables discovered in the previous sections by using the pdpbox

In [52]:
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots

ModuleNotFoundError: No module named 'pdpbox'

# 6.0 Model Development
So far we have used two models; at this point we add more, and the list will be expanded soon. In this section you will see the following models:

* lightgbm
* RandomForestClassifier
* DecisionTreeClassifier
* CatBoostClassifier

# 6.1 Lightgbm

In [53]:
# LightGBM settings, based on the following kernel:
# https://www.kaggle.com/brandenkmurray/nothing-works
params = {
    # Task and evaluation
    'objective': "binary",
    'boost': "gbdt",
    'metric': "auc",
    'boost_from_average': "false",
    # Execution
    'num_threads': 8,
    # Tree shape and learning speed
    'learning_rate': 0.01,
    'num_leaves': 13,
    'max_depth': -1,
    'tree_learner': "serial",
    # Column and row subsampling
    'feature_fraction': 0.05,
    'bagging_freq': 5,
    'bagging_fraction': 0.4,
    # Leaf-level regularisation
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    # Logging
    'verbosity': 1,
}

In [54]:
%%time
y_pred_lgb = np.zeros(len(X_test))
num_round = 1000000
for fold_n, (train_index, valid_index) in enumerate(folds.split(X,y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)
    
    lgb_model = lgb.train(params, train_data, num_round, 
                          valid_sets = [train_data, valid_data], 
                          verbose_eval = 100, early_stopping_rounds = 3500)
    
    y_pred_lgb += lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration) / 5

Fold 0 started at Mon Mar 11 17:23:34 2019
Training until validation scores don't improve for 3500 rounds.
[100]	training's auc: 0.866351	valid_1's auc: 0.848317
[200]	training's auc: 0.880308	valid_1's auc: 0.862116
[300]	training's auc: 0.882352	valid_1's auc: 0.863343
[400]	training's auc: 0.887447	valid_1's auc: 0.867385
[500]	training's auc: 0.89051	valid_1's auc: 0.870442
[600]	training's auc: 0.892927	valid_1's auc: 0.872071
[700]	training's auc: 0.894784	valid_1's auc: 0.873462
[800]	training's auc: 0.897151	valid_1's auc: 0.875473
[900]	training's auc: 0.899576	valid_1's auc: 0.877511
[1000]	training's auc: 0.901253	valid_1's auc: 0.878517
[1100]	training's auc: 0.902566	valid_1's auc: 0.879466
[1200]	training's auc: 0.904032	valid_1's auc: 0.88051
[1300]	training's auc: 0.905393	valid_1's auc: 0.88143
[1400]	training's auc: 0.906798	valid_1's auc: 0.882518
[1500]	training's auc: 0.90827	valid_1's auc: 0.883536
[1600]	training's auc: 0.90941	valid_1's auc: 0.884121
[1700]	trai

[14600]	training's auc: 0.967	valid_1's auc: 0.896169
[14700]	training's auc: 0.967276	valid_1's auc: 0.896175
[14800]	training's auc: 0.967563	valid_1's auc: 0.896132
Early stopping, best iteration is:
[11318]	training's auc: 0.957332	valid_1's auc: 0.896572
Fold 1 started at Mon Mar 11 17:32:44 2019
Training until validation scores don't improve for 3500 rounds.
[100]	training's auc: 0.866077	valid_1's auc: 0.853287
[200]	training's auc: 0.879202	valid_1's auc: 0.868413
[300]	training's auc: 0.880905	valid_1's auc: 0.868771
[400]	training's auc: 0.886127	valid_1's auc: 0.874196
[500]	training's auc: 0.888773	valid_1's auc: 0.876785
[600]	training's auc: 0.891173	valid_1's auc: 0.879111
[700]	training's auc: 0.8933	valid_1's auc: 0.880299
[800]	training's auc: 0.895925	valid_1's auc: 0.882355
[900]	training's auc: 0.898492	valid_1's auc: 0.884487
[1000]	training's auc: 0.900252	valid_1's auc: 0.885857
[1100]	training's auc: 0.901356	valid_1's auc: 0.886766
[1200]	training's auc: 0.902

KeyboardInterrupt: 

# 6.2 RandomForestClassifier

In [None]:
# The models here are tuned on AUC, which ranks scores, so keep the
# positive-class probability (column 1 of predict_proba) instead of hard
# 0/1 labels. This also puts the values on the same scale as y_pred_lgb.
y_pred_rfc = rfc_model.predict_proba(X_test)[:, 1]

# 6.3 DecisionTreeClassifier

In [None]:
# The models here are tuned on AUC, which ranks scores, so keep the
# positive-class probability (column 1 of predict_proba) instead of hard
# 0/1 labels.
y_pred_tree = tree_model.predict_proba(X_test)[:, 1]

# 6.4 CatBoostClassifier

In [None]:
# Wrap the training split in a CatBoost Pool and fit on that Pool, so the
# object built here is actually used.
train_pool = Pool(train_X, train_y)
cat_model = CatBoostClassifier(iterations=3000, learning_rate=0.03,
                              objective='Logloss',
                              eval_metric='AUC')
cat_model.fit(train_pool, silent=True)
# Keep the positive-class probability (column 1) rather than hard 0/1 labels:
# AUC ranks scores, and the blends below average this with other models.
y_pred_cat = cat_model.predict_proba(X_test)[:, 1]

Now you can change your model and submit the results of other models.

In [None]:
# Pair every test ID with the random-forest prediction and write the submission file.
submission_rfc = pd.DataFrame({"ID_code": test["ID_code"], "target": y_pred_rfc})
submission_rfc.to_csv('submission_rfc.csv', index=False)

In [None]:
# Pair every test ID with the decision-tree prediction and write the submission file.
submission_tree = pd.DataFrame({"ID_code": test["ID_code"], "target": y_pred_tree})
submission_tree.to_csv('submission_tree.csv', index=False)

In [None]:
# Pair every test ID with the CatBoost prediction and write the submission file.
submission_cat = pd.DataFrame({"ID_code": test["ID_code"], "target": y_pred_cat})
submission_cat.to_csv('submission_cat.csv', index=False)

In [None]:
# good for submit
submission_lgb = pd.DataFrame({
        "ID_code": test["ID_code"],
        "target": y_pred_lgb
    })
submission_lgb.to_csv('submission_lgb.csv', index=False)

# 6.5 Funny Combine

In [None]:
# Blend: unweighted mean of the random-forest and CatBoost predictions.
submission_rfc_cat = pd.DataFrame({"ID_code": test["ID_code"], "target": (y_pred_rfc + y_pred_cat) / 2})
submission_rfc_cat.to_csv('submission_rfc_cat.csv', index=False)

In [None]:
# Blend: unweighted mean of the LightGBM and CatBoost predictions.
submission_lgb_cat = pd.DataFrame({"ID_code": test["ID_code"], "target": (y_pred_lgb + y_pred_cat) / 2})
submission_lgb_cat.to_csv('submission_lgb_cat.csv', index=False)

In [None]:
# Blend: unweighted mean of the random-forest and LightGBM predictions.
submission_rfc_lgb = pd.DataFrame({"ID_code": test["ID_code"], "target": (y_pred_rfc + y_pred_lgb) / 2})
submission_rfc_lgb.to_csv('submission_rfc_lgb.csv', index=False)