In [9]:
import pandas
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

## 1. pre-process

### 1-1 load data

In [31]:
# load data set
train = pandas.read_csv("./data/train.csv")
test  = pandas.read_csv("./data/test.csv")

In [11]:
train.tail()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
76015,151829,2,48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60926.49,0
76016,151830,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118634.52,0
76017,151835,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74028.15,0
76018,151836,2,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84278.16,0
76019,151838,2,46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [12]:
test.tail() # no 'TARGET'

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
75813,151831,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40243.2
75814,151832,2,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,146961.3
75815,151833,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,167299.77
75816,151834,2,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016
75817,151837,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


### 1-2 data summary

In [32]:
# component of data
data = len(train) # total number of train datas

features = train.shape[1]-2 # number of features (exclude first column(ID) and last column(TARGET))

sat_c = len(train[train['TARGET'] == 0]) # number of satisfied customers

unsat_c = len(train[train['TARGET'] == 1]) # number of unsatisfied customers

sat_rate = 100 * unsat_c/sat_c # percent of unsatisfied customers

print '[Train data set]'
print ('total number of datas: {}'.format(data))
print ('number of features: {}'.format(features))
print ('number of satisfied customers: {}'.format(sat_c))
print ('number of unsatisfied customers: {}'.format(unsat_c))
print ('percent of unsatisfied customers: {:.2f}%'.format(sat_rate))
# ---------------------------------------------------------------------

data = len(test) # total number of test datas

features = test.shape[1]-1 # number of features (exclude first column(ID))
print
print '[test data set]'
print ('total number of datas: {}'.format(data))
print ('number of features: {}'.format(features))

[Train data set]
total number of datas: 76020
number of features: 369
number of satisfied customers: 73012
number of unsatisfied customers: 3008
percent of unsatisfied customers: 4.00%

[test data set]
total number of datas: 75818
number of features: 369


### 1-3 delete constant columns

In [33]:
# remove constant columns
con_removed = [] # removed constant columns
df_train = train # copy the train data set
df_test = test # copy the test data set

# removed constant columns
for col in df_train.columns:
    if df_train[col].std() == 0:
        con_removed.append(col)

df_train.drop(con_removed, axis=1, inplace=True) # remove constant columns in the train data set

df_test.drop(con_removed, axis=1, inplace=True) # remove constant columns in the test data set

print ('removed constand columns: {}'.format(len(con_removed)))
print
print con_removed

removed constand columns: 34

['ind_var2_0', 'ind_var2', 'ind_var27_0', 'ind_var28_0', 'ind_var28', 'ind_var27', 'ind_var41', 'ind_var46_0', 'ind_var46', 'num_var27_0', 'num_var28_0', 'num_var28', 'num_var27', 'num_var41', 'num_var46_0', 'num_var46', 'saldo_var28', 'saldo_var27', 'saldo_var41', 'saldo_var46', 'imp_amort_var18_hace3', 'imp_amort_var34_hace3', 'imp_reemb_var13_hace3', 'imp_reemb_var33_hace3', 'imp_trasp_var17_out_hace3', 'imp_trasp_var33_out_hace3', 'num_var2_0_ult1', 'num_var2_ult1', 'num_reemb_var13_hace3', 'num_reemb_var33_hace3', 'num_trasp_var17_out_hace3', 'num_trasp_var33_out_hace3', 'saldo_var2_ult1', 'saldo_medio_var13_medio_hace3']


### 1-4 remove duplicate columns

In [41]:
# remove duplicated columns
dup_removed = [] # removed duplicated columns
col_tmp = [] # temp list for columns scan
dupList = {} # a dictionary of duplicated columns

columns = df_train.columns
for i in range(len(columns)-1):
    v = df_train[columns[i]].values
    dupCols = [] # duplicated columns
    for n in range(i+1, len(columns)): # compare with i and i+1(equal?) 
        if np.array_equal(v, df_train[columns[n]].values):
            dup_removed.append(columns[n])
            if columns[n] not in col_tmp:
                dupCols.append(columns[n])
                dupList[columns[i]] = dupCols
                
print dupList
print
print ('duplicated columns with distinct values: {}'.format(len(dupList)))

df_train.drop(dup_removed, axis=1, inplace=True)
df_test.drop(dup_removed, axis=1, inplace=True)
print
print ('removed duplicated coulumns: {}'.format(len(dup_removed)))

{'delta_imp_reemb_var33_1y3': ['delta_num_reemb_var33_1y3'], 'ind_var18_0': ['ind_var18'], 'delta_imp_reemb_var13_1y3': ['delta_num_reemb_var13_1y3'], 'ind_var26_0': ['ind_var26'], 'ind_var25_0': ['ind_var25'], 'num_var6_0': ['num_var29_0'], 'num_var26_0': ['num_var26'], 'ind_var40': ['ind_var39'], 'ind_var37_0': ['ind_var37'], 'num_var18_0': ['num_var18'], 'delta_imp_trasp_var33_in_1y3': ['delta_num_trasp_var33_in_1y3'], 'saldo_var13_medio': ['saldo_medio_var13_medio_ult1'], 'num_var40': ['num_var39'], 'num_var34_0': ['num_var34'], 'num_var32_0': ['num_var32'], 'ind_var13_medio_0': ['ind_var13_medio'], 'num_var6': ['num_var29'], 'num_var13_medio_0': ['num_var13_medio'], 'ind_var32_0': ['ind_var32'], 'delta_imp_reemb_var17_1y3': ['delta_num_reemb_var17_1y3'], 'delta_imp_trasp_var17_in_1y3': ['delta_num_trasp_var17_in_1y3'], 'saldo_var6': ['saldo_var29'], 'ind_var34_0': ['ind_var34'], 'num_var37_0': ['num_var37'], 'num_var25_0': ['num_var25'], 'ind_var6_0': ['ind_var29_0'], 'delta_imp_t

In [42]:
print dup_removed

['ind_var29_0', 'ind_var29', 'ind_var13_medio', 'ind_var18', 'ind_var26', 'ind_var25', 'ind_var32', 'ind_var34', 'ind_var37', 'ind_var39', 'num_var29_0', 'num_var29', 'num_var13_medio', 'num_var18', 'num_var26', 'num_var25', 'num_var32', 'num_var34', 'num_var37', 'num_var39', 'saldo_var29', 'saldo_medio_var13_medio_ult1', 'delta_num_reemb_var13_1y3', 'delta_num_reemb_var17_1y3', 'delta_num_reemb_var33_1y3', 'delta_num_trasp_var17_in_1y3', 'delta_num_trasp_var17_out_1y3', 'delta_num_trasp_var33_in_1y3', 'delta_num_trasp_var33_out_1y3']


In [8]:
# remove duplicate columns
colsToRemove = [] # columns to remove
colsScaned = [] # columns scaned
dupList = {} # a dictionary of dupicate columns with distinct values
# the keys are the first column of the dupicated columns
# the entries are columns which has the same value as the key 
columns = df_train.columns
for i in range(len(columns)-1):
    # search through every column
    v = df_train[columns[i]].values
    dupCols = [] # dumpicated columns
    for j in range(i+1,len(columns)):
    # compare if the two column are equal
        if np.array_equal(v,df_train[columns[j]].values):  
        # if yes add to the columns to be removed
            colsToRemove.append(columns[j])
            if columns[j] not in colsScaned:
            # only add new entry if the column haven't be scaned before
                dupCols.append(columns[j]) 
                colsScaned.append(columns[j]) # add the column as scaned
                dupList[columns[i]] = dupCols # update the key in the dictionary

# print the dupicated columns as a dictionary
# the keys are distinct dupicated columns
# the entries are columns which has the same value as the key 
print(dupList)
print("# duplicated columns with distinct values: {}".format(len(dupList)))  

df_train.drop(colsToRemove, axis=1, inplace=True) 
# remove duplicate columns in the training set
df_test.drop(colsToRemove, axis=1, inplace=True) 
# remove duplicate columns in the test set
print("# removed duplicated columns: {}".format(len(colsToRemove)))  
#print("Train set size: {}".format(df_train.shape))
#print("Test set size: {}".format(df_test.shape))

{'delta_imp_reemb_var33_1y3': ['delta_num_reemb_var33_1y3'], 'ind_var18_0': ['ind_var18'], 'delta_imp_reemb_var13_1y3': ['delta_num_reemb_var13_1y3'], 'ind_var26_0': ['ind_var26'], 'ind_var25_0': ['ind_var25'], 'num_var6_0': ['num_var29_0'], 'num_var26_0': ['num_var26'], 'ind_var40': ['ind_var39'], 'ind_var37_0': ['ind_var37'], 'num_var18_0': ['num_var18'], 'delta_imp_trasp_var33_in_1y3': ['delta_num_trasp_var33_in_1y3'], 'saldo_var13_medio': ['saldo_medio_var13_medio_ult1'], 'num_var40': ['num_var39'], 'num_var34_0': ['num_var34'], 'num_var32_0': ['num_var32'], 'ind_var13_medio_0': ['ind_var13_medio'], 'num_var6': ['num_var29'], 'num_var13_medio_0': ['num_var13_medio'], 'ind_var32_0': ['ind_var32'], 'delta_imp_reemb_var17_1y3': ['delta_num_reemb_var17_1y3'], 'delta_imp_trasp_var17_in_1y3': ['delta_num_trasp_var17_in_1y3'], 'saldo_var6': ['saldo_var29'], 'ind_var34_0': ['ind_var34'], 'num_var37_0': ['num_var37'], 'num_var25_0': ['num_var25'], 'ind_var6_0': ['ind_var29_0'], 'delta_imp_t

In [9]:
print(colsToRemove) # print Duplicate columns

['ind_var29_0', 'ind_var29', 'ind_var13_medio', 'ind_var18', 'ind_var26', 'ind_var25', 'ind_var32', 'ind_var34', 'ind_var37', 'ind_var39', 'num_var29_0', 'num_var29', 'num_var13_medio', 'num_var18', 'num_var26', 'num_var25', 'num_var32', 'num_var34', 'num_var37', 'num_var39', 'saldo_var29', 'saldo_medio_var13_medio_ult1', 'delta_num_reemb_var13_1y3', 'delta_num_reemb_var17_1y3', 'delta_num_reemb_var33_1y3', 'delta_num_trasp_var17_in_1y3', 'delta_num_trasp_var17_out_1y3', 'delta_num_trasp_var33_in_1y3', 'delta_num_trasp_var33_out_1y3']


## 1.5 Make Train and Test Set

In [43]:
# make train data set
x_train = df_train.drop(['ID', 'TARGET'], axis=1)
y_train = df_train['TARGET']

# make test data set
id_test = df_test['ID']
x_test = df_test.drop(['ID'], axis=1)

print ('Train data set size: {}'.format(x_train.shape))
print ('Test data set size: {}'.format(x_test.shape))

Train data set size: (76020, 306)
Test data set size: (75818, 306)


## 2. Data Analysis

### 2.1 log

## 2.1 PCA