
<h2>Problem Statement</h2>

In this challenge, we invite Kagglers to help us identify which customers will make a specific transaction in the future, irrespective of the amount of money transacted.

<br><br>
Submissions are scored on the <b>area under the ROC curve</b> (AUC):

![area under the ROC curve](https://developers.google.com/machine-learning/crash-course/images/AUC.svg)

In [26]:
#IMPORTING REQUIRED LIBRARIES
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
import math

from lightgbm.sklearn import LGBMRegressor
from lightgbm.sklearn import LGBMClassifier
import lightgbm as lgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA,KernelPCA,NMF

from sklearn.metrics import roc_auc_score,accuracy_score

from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

import gc
gc.enable()


import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
#All functions

#FUNCTION FOR PROVIDING FEATURE SUMMARY
def feature_summary(df_fa):
    """Print the shape of ``df_fa`` and return per-column summary statistics.

    Parameters
    ----------
    df_fa : pandas.DataFrame
        Frame whose columns are profiled.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df_fa`` with the columns ``Unique_Count``,
        ``Max``, ``Min``, ``Mean``, ``Std``, ``Skewness`` and ``Median``.
        ``Max`` and ``Min`` are stored as strings of the value rounded to two
        decimals.  Everything except ``Unique_Count`` is only computed for
        columns whose dtype name contains ``float`` or ``int``; cells that stay
        empty are filled with ``'-'``.
    """
    n_rows, n_cols = df_fa.shape
    print('DataFrame shape')
    print('rows:', n_rows)
    print('cols:', n_cols)

    stat_names = ['Unique_Count', 'Max', 'Min', 'Mean', 'Std', 'Skewness', 'Median']
    summary = pd.DataFrame(index=df_fa.columns, columns=stat_names)
    summary['Unique_Count'] = [len(df_fa[name].unique()) for name in df_fa.columns]

    for name in df_fa.columns:
        series = df_fa[name]
        dtype_name = str(series.dtype)
        # Skip anything that is not a float/int column (decided by dtype name).
        if not ('float' in dtype_name or 'int' in dtype_name):
            continue
        # Extremes are kept as rounded strings, the remaining stats as numbers.
        summary.at[name, 'Max'] = str(round(series.max(), 2))
        summary.at[name, 'Min'] = str(round(series.min(), 2))
        summary.at[name, 'Mean'] = series.mean()
        summary.at[name, 'Std'] = series.std()
        summary.at[name, 'Skewness'] = series.skew()
        summary.at[name, 'Median'] = series.median()

    return summary.fillna('-')



In [3]:
#DATASET VIEW
#LIST EVERY FILE IN THE INPUT DIRECTORY TOGETHER WITH ITS SIZE IN MB
path1 = "../input/"
data_files = os.listdir(path1)
df_files = pd.DataFrame({'File_Name': data_files})
df_files['Size_in_MB'] = [
    round(os.path.getsize(path1 + file_name) / (1024 * 1024), 2)
    for file_name in df_files.File_Name
]
df_files

Unnamed: 0,File_Name,Size_in_MB
0,sample_submission.csv,2.56
1,test.csv,287.56
2,train.csv,288.14


In [4]:
%%time
#READING THE COMPETITION FILES: TRAIN SET, TEST SET AND SAMPLE SUBMISSION
#EACH FILE IS LOADED FULLY INTO MEMORY WITH A SINGLE read_csv CALL
print('reading train dataset...')
df_train = pd.read_csv(os.path.join(path1, 'train.csv'))

print('reading test dataset...')
df_test = pd.read_csv(os.path.join(path1, 'test.csv'))

print('submission file')
df_submission = pd.read_csv(os.path.join(path1, 'sample_submission.csv'))

reading train dataset...
reading test dataset...
submission file
CPU times: user 15.9 s, sys: 4.6 s, total: 20.5 s
Wall time: 20.5 s


In [5]:
#CREATING FINAL X, y and test SETS
#y IS THE 'target' COLUMN; X AND test KEEP EVERYTHING EXCEPT THE ID (AND, FOR TRAIN, THE TARGET)
y = df_train['target']
X = df_train.drop(columns=['ID_code', 'target'])
test = df_test.drop(columns=['ID_code'])

In [6]:
#STACK TRAIN AND TEST FEATURES ROW-WISE AND PROFILE EVERY FEATURE OVER THE COMBINED ROWS
df_combi = pd.concat((X, test), axis='index')
df_fs = feature_summary(df_combi)

DataFrame shape
rows: 400000
cols: 200


In [7]:
#CLUSTERING THE FEATURES BY THEIR SUMMARY STATISTICS (ONE ROW OF df_fs PER FEATURE)
# NOTE(review): the outputs recorded in this notebook (cluster counts and the
# X_1_1 ... X_10_10 prediction columns) come from a run with 10 clusters, while
# this cell sets 3 -- confirm which value is intended before re-running.
n_clus=3

# Fit on the statistic columns only and cast them to float explicitly:
#  * feature_summary stores 'Max'/'Min' as strings, so the cast makes the
#    numeric conversion visible instead of relying on an implicit one;
#  * leaving out 'labels' keeps the cell idempotent -- on a re-run the
#    previously assigned labels would otherwise become a clustering input.
stat_cols = [col for col in df_fs.columns if col != 'labels']

# n_jobs was removed from KMeans in scikit-learn 1.0 and is therefore not passed.
# n_init=10 pins the long-standing default so newer releases give the same fit.
cluster = KMeans(n_clusters=n_clus, n_init=10, random_state=0)
model=cluster.fit(df_fs[stat_cols].astype(float))
df_fs['labels']=model.labels_

In [8]:
#NUMBER OF FEATURES PER CLUSTER LABEL (NON-NULL COUNT OF EACH STATISTIC COLUMN)
df_fs.groupby(by='labels').count()

Unnamed: 0_level_0,Unique_Count,Max,Min,Mean,Std,Skewness,Median
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,28,28,28,28,28,28,28
1,20,20,20,20,20,20,20
2,24,24,24,24,24,24,24
3,13,13,13,13,13,13,13
4,30,30,30,30,30,30,30
5,16,16,16,16,16,16,16
6,17,17,17,17,17,17,17
7,16,16,16,16,16,16,16
8,21,21,21,21,21,21,21
9,15,15,15,15,15,15,15


In [9]:
#SHAPES OF TRAIN FEATURES, TRAIN LABELS AND TEST FEATURES
print(*(frame.shape for frame in (X, y, test)))

(200000, 200) (200000,) (200000, 200)


In [10]:
#REPORT HOW MANY FEATURES FALL INTO EACH CLUSTER (CLUSTER IDS ARE PRINTED 1-BASED)
for cluster_idx in range(n_clus):
    members = df_fs.loc[df_fs.labels == cluster_idx].index.tolist()
    print('cluster id:', cluster_idx + 1, '\tfeature cluster item count:', len(members))

cluster id: 1 	feature cluster item count: 28
cluster id: 2 	feature cluster item count: 20
cluster id: 3 	feature cluster item count: 24
cluster id: 4 	feature cluster item count: 13
cluster id: 5 	feature cluster item count: 30
cluster id: 6 	feature cluster item count: 16
cluster id: 7 	feature cluster item count: 17
cluster id: 8 	feature cluster item count: 16
cluster id: 9 	feature cluster item count: 21
cluster id: 10 	feature cluster item count: 15


In [11]:
%%time
#TRAINING ONE LIGHTGBM BOOSTER PER FEATURE CLUSTER AND PER STRATIFIED FOLD
#FOLD COUNT 10
#EACH BOOSTER IS SCORED BY ROC AUC ON ITS HELD-OUT FOLD
# Original author note: XGBClassifier, LGBMClassifier and CatBoostClassifier were
# tried and CatBoostClassifier was reported as best; this cell itself trains
# LightGBM through lgb.train.

param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.38,
    'boost_from_average': 'false',
    'boost': 'gbdt',
    'feature_fraction': 0.04,
    'learning_rate': 0.0085,
    'max_depth': -1,
    'metric': 'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'reg_alpha': 0.1302650970728192,
    'reg_lambda': 0.3603427518866501,
    'verbosity': -1
}

splits=10

#DATAFRAMES FOR STORING PREDICTIONS ON TRAIN DATA AS WELL AS TEST DATA
#CAN BE USED FOR ENSEMBLE
#ONE COLUMN PER (CLUSTER, FOLD) BOOSTER, NAMED 'X_<cluster>_<fold>' IN BOTH FRAMES
df_preds=pd.DataFrame()
df_preds_x=pd.DataFrame()

#CREATING STRATIFIED FOLDS
# The splitter is seeded with an int, so every call to split() yields the same
# folds; building it once outside the cluster loop is equivalent to rebuilding it.
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=200)

# lgb.train dropped the verbose_eval / early_stopping_rounds keywords in
# LightGBM 4.0; the callback form below behaves the same and also works on old
# releases (log_evaluation was named print_evaluation before LightGBM 3.3).
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

for i in range(n_clus):
    # Features assigned to cluster i by the KMeans labelling stored in df_fs.
    f_list=list(df_fs[df_fs.labels==i].index)
    print('Starting predicting cluster:',i+1)
    print('\nStarting KFold iterations...')
    X1=X[f_list]
    test1=test[f_list]
    fold_scores=[]

    for k,(train_index,test_index) in enumerate(skf.split(X1,y),start=1):
        df_X=X1.iloc[train_index,:]
        df_y=y.iloc[train_index]
        val_X=X1.iloc[test_index,:]
        val_y=y.iloc[test_index]

        #FITTING MODEL
        trn_data = lgb.Dataset(df_X, label=df_y)
        val_data = lgb.Dataset(val_X, label=val_y)
        model= lgb.train(
            param, trn_data, 1000000,
            valid_sets = [trn_data, val_data],
            callbacks = [lgb.early_stopping(2000), log_every(1000)],
        )
        col_name='X_'+str(i+1)+'_'+str(k)

        #SCORING ON THE HELD-OUT FOLD (ROC AUC)
        preds_x=model.predict(val_X,num_iteration=model.best_iteration)
        auc=roc_auc_score(val_y,preds_x)
        print('Iteration:',k,'  roc_auc_score:',auc)
        fold_scores.append(auc)

        #PREDICTIONS OF THIS BOOSTER FOR ALL TRAIN ROWS AND ALL TEST ROWS
        # NOTE(review): the train-side column covers every row of X1, including
        # the rows this booster was fitted on; if these columns feed a
        # second-level model, out-of-fold predictions would avoid that leakage.
        df_preds_x[col_name]=model.predict(X1,num_iteration=model.best_iteration)
        df_preds[col_name]=model.predict(test1,num_iteration=model.best_iteration)

    print('\n Best score:',max(fold_scores),' Avg Score:',sum(fold_scores)/splits)


Starting predicting cluster: 1

Starting KFold iterations...
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.708143	valid_1's auc: 0.683282
[2000]	training's auc: 0.712722	valid_1's auc: 0.683696
[3000]	training's auc: 0.715874	valid_1's auc: 0.683356
Early stopping, best iteration is:
[1721]	training's auc: 0.712056	valid_1's auc: 0.684093
Iteration: 1   roc_auc_score: 0.6840932904263076
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.708553	valid_1's auc: 0.679262
[2000]	training's auc: 0.712976	valid_1's auc: 0.679391
[3000]	training's auc: 0.715792	valid_1's auc: 0.679333
Early stopping, best iteration is:
[1895]	training's auc: 0.712655	valid_1's auc: 0.679776
Iteration: 2   roc_auc_score: 0.6797312144187074
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.708178	valid_1's auc: 0.684897
[2000]	training's auc: 0.712584	valid_1's auc: 0.686962
[3000]	training's a

Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.698519	valid_1's auc: 0.66201
[2000]	training's auc: 0.702505	valid_1's auc: 0.661894
Early stopping, best iteration is:
[776]	training's auc: 0.697394	valid_1's auc: 0.663165
Iteration: 2   roc_auc_score: 0.6631653305923276
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.696844	valid_1's auc: 0.675256
[2000]	training's auc: 0.700773	valid_1's auc: 0.675325
[3000]	training's auc: 0.703702	valid_1's auc: 0.674554
Early stopping, best iteration is:
[1312]	training's auc: 0.69865	valid_1's auc: 0.675694
Iteration: 3   roc_auc_score: 0.6756835887267387
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.696984	valid_1's auc: 0.677501
[2000]	training's auc: 0.700548	valid_1's auc: 0.677249
Early stopping, best iteration is:
[632]	training's auc: 0.694953	valid_1's auc: 0.678479
Iteration: 4   roc_auc_score: 0.6783718981523732


[1000]	training's auc: 0.712167	valid_1's auc: 0.685952
[2000]	training's auc: 0.715364	valid_1's auc: 0.686688
[3000]	training's auc: 0.718548	valid_1's auc: 0.686753
Early stopping, best iteration is:
[1764]	training's auc: 0.714633	valid_1's auc: 0.687094
Iteration: 5   roc_auc_score: 0.6870937696177257
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.712717	valid_1's auc: 0.679866
[2000]	training's auc: 0.715844	valid_1's auc: 0.681873
[3000]	training's auc: 0.718872	valid_1's auc: 0.681405
[4000]	training's auc: 0.721701	valid_1's auc: 0.680462
Early stopping, best iteration is:
[2022]	training's auc: 0.715885	valid_1's auc: 0.681981
Iteration: 6   roc_auc_score: 0.6819812278241919
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.712051	valid_1's auc: 0.686789
[2000]	training's auc: 0.715223	valid_1's auc: 0.687415
[3000]	training's auc: 0.718318	valid_1's auc: 0.686294
Early stopping, best iteration i

Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.699697	valid_1's auc: 0.693316
[2000]	training's auc: 0.703502	valid_1's auc: 0.693843
[3000]	training's auc: 0.706167	valid_1's auc: 0.693309
Early stopping, best iteration is:
[1378]	training's auc: 0.701594	valid_1's auc: 0.694223
Iteration: 8   roc_auc_score: 0.6942225227392775
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.701145	valid_1's auc: 0.68174
[2000]	training's auc: 0.704891	valid_1's auc: 0.681848
[3000]	training's auc: 0.707673	valid_1's auc: 0.681114
Early stopping, best iteration is:
[1422]	training's auc: 0.70315	valid_1's auc: 0.68287
Iteration: 9   roc_auc_score: 0.6828696933836645
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.700838	valid_1's auc: 0.681731
[2000]	training's auc: 0.704326	valid_1's auc: 0.682752
[3000]	training's auc: 0.707147	valid_1's auc: 0.682469
Early stopping, best iterat

Iteration: 1   roc_auc_score: 0.604610846053209
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.639263	valid_1's auc: 0.599264
[2000]	training's auc: 0.643659	valid_1's auc: 0.599246
Early stopping, best iteration is:
[192]	training's auc: 0.631468	valid_1's auc: 0.600107
Iteration: 2   roc_auc_score: 0.6001071846039105
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.638162	valid_1's auc: 0.607565
[2000]	training's auc: 0.642462	valid_1's auc: 0.606644
Early stopping, best iteration is:
[298]	training's auc: 0.633224	valid_1's auc: 0.609157
Iteration: 3   roc_auc_score: 0.6091570220050387
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.637169	valid_1's auc: 0.619057
[2000]	training's auc: 0.641442	valid_1's auc: 0.619629
Early stopping, best iteration is:
[873]	training's auc: 0.63643	valid_1's auc: 0.619704
Iteration: 4   roc_auc_score: 0.6197037049328123
Training

In [13]:
%%time
#SAVING THE PER-MODEL PREDICTION COLUMNS (TRAIN ROWS AND TEST ROWS) AS FEATURE FILES
for frame, file_name in ((df_preds_x, 'X_features.csv'), (df_preds, 'test_features.csv')):
    frame.to_csv(file_name, index=False)

In [None]:
# %%time

# X=df_preds_x
# test=df_preds

# model=LogisticRegression()
# k=1
# splits=15
# avg_score=0

# #CREATING STRATIFIED FOLDS
# skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=200)
# print('\nStarting KFold iterations...')
# for train_index,test_index in skf.split(X,y):
#     df_X=X.iloc[train_index,:]
#     df_y=y.iloc[train_index]
#     val_X=X.iloc[test_index,:]
#     val_y=y.iloc[test_index]

# #FITTING MODEL
    
#     model.fit(df_X,df_y)
    
    
# #PREDICTING ON VALIDATION DATA
    
#     preds_x=pd.Series(model.predict_proba(val_X)[:,1])
# #CALCULATING ACCURACY
#     acc=roc_auc_score(val_y,preds_x)
#     print('Iteration:',k,'  roc_auc_score:',acc)
#     if k==1:
#         score=acc
#         preds=pd.Series(model.predict_proba(test)[:,1])
        
#     else:
#         preds1=pd.Series(model.predict_proba(test)[:,1])
#         preds=preds+preds1
        
#         if score<acc:
#             score=acc
            
#     avg_score=avg_score+acc        
#     k=k+1
# print('\n Best score:',score,' Avg Score:',avg_score/splits)
# #TAKING AVERAGE OF PREDICTIONS
# preds=preds/splits

In [24]:
#FIRST ROWS OF THE TEST-SET PREDICTIONS, ONE COLUMN PER (CLUSTER, FOLD) MODEL
df_preds.head(5)

Unnamed: 0,X_1_1,X_1_2,X_1_3,X_1_4,X_1_5,X_1_6,X_1_7,X_1_8,X_1_9,X_1_10,X_2_1,X_2_2,X_2_3,X_2_4,X_2_5,X_2_6,X_2_7,X_2_8,X_2_9,X_2_10,X_3_1,X_3_2,X_3_3,X_3_4,X_3_5,X_3_6,X_3_7,X_3_8,X_3_9,X_3_10,X_4_1,X_4_2,X_4_3,X_4_4,X_4_5,X_4_6,X_4_7,X_4_8,X_4_9,X_4_10,...,X_7_1,X_7_2,X_7_3,X_7_4,X_7_5,X_7_6,X_7_7,X_7_8,X_7_9,X_7_10,X_8_1,X_8_2,X_8_3,X_8_4,X_8_5,X_8_6,X_8_7,X_8_8,X_8_9,X_8_10,X_9_1,X_9_2,X_9_3,X_9_4,X_9_5,X_9_6,X_9_7,X_9_8,X_9_9,X_9_10,X_10_1,X_10_2,X_10_3,X_10_4,X_10_5,X_10_6,X_10_7,X_10_8,X_10_9,X_10_10
0,0.070566,0.06487,0.068104,0.067225,0.081491,0.067331,0.059878,0.071672,0.054394,0.07912,0.124517,0.128814,0.115187,0.129152,0.128814,0.130017,0.132282,0.131762,0.116402,0.134512,0.084227,0.090862,0.082301,0.093486,0.082602,0.070275,0.085812,0.07731,0.074173,0.07792,0.109935,0.157218,0.107412,0.119623,0.399889,0.106892,0.107007,0.128543,0.111742,0.109739,...,0.087021,0.086582,0.083058,0.077776,0.092997,0.085629,0.088854,0.087607,0.082498,0.07898,0.182424,0.115071,0.298014,0.111361,0.124466,0.138342,0.249687,0.114829,0.141227,0.140216,0.123763,0.114497,0.14266,0.127198,0.123673,0.119868,0.11653,0.130626,0.144473,0.114946,0.175798,0.182056,0.136277,0.107782,0.18774,0.14752,0.115161,0.142279,0.11221,0.113277
1,0.098904,0.093015,0.089776,0.097998,0.097381,0.092715,0.090661,0.09705,0.087037,0.103094,0.092518,0.08707,0.110452,0.094117,0.089219,0.093135,0.085449,0.087852,0.109096,0.128893,0.10479,0.104934,0.108734,0.106469,0.101862,0.106142,0.10643,0.101421,0.106686,0.100962,0.09747,0.153972,0.098066,0.110915,0.39936,0.092302,0.097216,0.119957,0.098238,0.097143,...,0.181912,0.185036,0.195963,0.213871,0.161845,0.17528,0.194353,0.187978,0.188687,0.1838,0.185187,0.115042,0.298191,0.113669,0.115583,0.14008,0.249566,0.114307,0.140729,0.140207,0.126341,0.123732,0.146925,0.138644,0.134717,0.133785,0.123935,0.147015,0.149295,0.125096,0.172323,0.176885,0.130348,0.095007,0.182618,0.142013,0.091884,0.139663,0.100514,0.094433
2,0.12073,0.123034,0.121144,0.113547,0.113859,0.121755,0.129983,0.117492,0.122565,0.113182,0.133972,0.139413,0.121862,0.135802,0.14035,0.138283,0.147307,0.146542,0.123443,0.138297,0.094341,0.094791,0.099364,0.098091,0.095903,0.089977,0.09351,0.087683,0.089202,0.093072,0.094114,0.152333,0.093236,0.109392,0.39863,0.085554,0.094567,0.119126,0.096976,0.095796,...,0.218416,0.225553,0.272791,0.318352,0.225003,0.244469,0.259474,0.26336,0.258268,0.283482,0.180508,0.107258,0.295179,0.107132,0.09853,0.132627,0.2466,0.102513,0.135015,0.135946,0.088385,0.097736,0.134336,0.080379,0.085739,0.087348,0.096579,0.078672,0.134711,0.089597,0.169354,0.17428,0.128304,0.091046,0.180708,0.138937,0.088432,0.135746,0.094226,0.091033
3,0.138212,0.138517,0.140876,0.146041,0.132746,0.139809,0.144608,0.141547,0.146545,0.130408,0.086905,0.079167,0.105468,0.084721,0.082633,0.080779,0.085321,0.084737,0.109831,0.128016,0.091587,0.09596,0.090021,0.099807,0.088319,0.089318,0.094374,0.088331,0.083061,0.089747,0.10839,0.15771,0.110091,0.11812,0.399793,0.116357,0.110462,0.131063,0.109789,0.108224,...,0.072277,0.070723,0.06513,0.060738,0.076285,0.070264,0.069011,0.068268,0.067713,0.064054,0.183905,0.10844,0.296622,0.114589,0.11131,0.137201,0.247884,0.107719,0.139012,0.139318,0.117851,0.124905,0.148774,0.1273,0.130172,0.131023,0.120191,0.125785,0.150427,0.125494,0.180796,0.185964,0.140205,0.131235,0.193046,0.150921,0.130904,0.147566,0.115629,0.123277
4,0.130977,0.131687,0.136001,0.137065,0.119341,0.144837,0.153203,0.139273,0.152464,0.122179,0.107935,0.109445,0.118477,0.10661,0.111446,0.10977,0.111133,0.120148,0.114903,0.13279,0.099885,0.097258,0.097308,0.10405,0.097512,0.103051,0.101691,0.101901,0.099453,0.097682,0.121916,0.161172,0.121506,0.126726,0.40047,0.119174,0.118926,0.134179,0.119572,0.117538,...,0.079893,0.079187,0.073687,0.071661,0.082282,0.079263,0.076306,0.077041,0.076805,0.073102,0.176193,0.095478,0.293634,0.095473,0.078146,0.127847,0.243699,0.091074,0.129417,0.128677,0.086337,0.098277,0.133243,0.082856,0.085767,0.084685,0.097609,0.08026,0.133633,0.091046,0.172437,0.179817,0.134714,0.104779,0.185455,0.145262,0.102388,0.140927,0.107447,0.104352


In [29]:
%%time
#PREPARING SUBMISSION
#THE FINAL SCORE OF EACH TEST ROW IS THE PLAIN AVERAGE OVER ALL PREDICTION COLUMNS
df_submission['target'] = df_preds.mean(axis='columns')
df_submission

CPU times: user 80 ms, sys: 0 ns, total: 80 ms
Wall time: 81.6 ms


In [15]:
#CREATING SUBMISSION FILE
#WRITTEN WITHOUT THE ROW INDEX
df_submission.to_csv(path_or_buf='submission.csv', index=False)

In [30]:
#DISPLAY THE SUBMISSION TABLE (ID_code PLUS THE AVERAGED target COLUMN)
df_submission

Unnamed: 0,ID_code,target
0,test_0,0.117467
1,test_1,0.124543
2,test_2,0.133745
3,test_3,0.120209
4,test_4,0.117072
5,test_5,0.107997
6,test_6,0.108284
7,test_7,0.125165
8,test_8,0.105301
9,test_9,0.108563
