In [4]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

In [5]:
df_train = pd.read_csv("/resources/data/brainwaves/predict_fraudulant/train.csv")
df_test =  pd.read_csv("/resources/data/brainwaves/predict_fraudulant/test.csv")

df_train

Unnamed: 0,transaction_id,num_var_1,num_var_2,num_var_3,num_var_4,num_var_5,num_var_6,num_var_7,cat_var_1,cat_var_2,...,cat_var_34,cat_var_35,cat_var_36,cat_var_37,cat_var_38,cat_var_39,cat_var_40,cat_var_41,cat_var_42,target
0,id_11,2.302632e-08,0.040182,0.0,1.800000e-07,2.302632e-08,2.368421e-08,1.115205e-08,,ce,...,0,0,0,0,0,0,0,0,0,0
1,id_33,7.965789e-06,0.157872,0.0,2.105000e-06,2.769737e-07,7.965789e-06,2.433058e-06,da,tn,...,0,0,0,0,0,0,0,0,0,0
2,id_51,7.828947e-08,0.089140,0.0,3.550000e-07,4.671053e-08,1.052632e-07,4.276014e-07,gf,ce,...,0,0,0,0,0,0,0,0,0,0
3,id_54,7.894737e-08,0.227239,0.0,1.050000e-06,1.381579e-07,2.190789e-07,1.848054e-08,,ce,...,0,0,0,0,0,0,0,0,0,0
4,id_62,3.321053e-06,0.160410,0.0,2.105000e-06,2.769737e-07,3.340789e-06,2.152983e-06,da,tn,...,0,0,0,0,0,0,0,0,0,0
5,id_67,1.953947e-07,0.078566,0.0,3.550000e-07,4.671053e-08,4.407895e-08,9.463310e-08,gf,ce,...,0,0,0,0,0,0,0,0,0,0
6,id_71,1.736842e-07,0.600508,0.0,4.575000e-06,5.986842e-07,1.736842e-07,8.411831e-08,hn,ce,...,0,0,0,0,0,0,0,0,0,0
7,id_88,5.789474e-08,0.152374,0.0,2.105000e-06,2.769737e-07,5.789474e-08,2.453451e-08,da,tn,...,0,0,0,0,0,0,0,0,0,0
8,id_95,3.289474e-08,0.093053,0.0,7.500000e-07,8.223684e-08,9.868421e-08,1.274520e-08,pu,ce,...,0,0,0,0,0,0,0,0,0,0
9,id_97,5.500000e-07,0.158401,0.0,2.105000e-06,2.769737e-07,5.500000e-07,3.138505e-07,da,tn,...,0,0,0,0,0,0,0,0,0,0


In [6]:
#joining test and train datasets
df= pd.concat([df_train, df_test], axis=0, join='outer')

In [7]:
#cat_var_1 and cat_var_6 have small no of nans.....imputing them
df['cat_var_1'].fillna(df['cat_var_1'].value_counts().idxmax(), inplace=True)
df['cat_var_6'].fillna(df['cat_var_6'].value_counts().idxmax(), inplace=True)
df.isnull().sum()

cat_var_1              0
cat_var_10             0
cat_var_11             0
cat_var_12             0
cat_var_13             0
cat_var_14             0
cat_var_15             0
cat_var_16             0
cat_var_17             0
cat_var_18             0
cat_var_19             0
cat_var_2              0
cat_var_20             0
cat_var_21             0
cat_var_22             0
cat_var_23             0
cat_var_24             0
cat_var_25             0
cat_var_26             0
cat_var_27             0
cat_var_28             0
cat_var_29             0
cat_var_3          97215
cat_var_30             0
cat_var_31             0
cat_var_32             0
cat_var_33             0
cat_var_34             0
cat_var_35             0
cat_var_36             0
cat_var_37             0
cat_var_38             0
cat_var_39             0
cat_var_4              0
cat_var_40             0
cat_var_41             0
cat_var_42             0
cat_var_5              0
cat_var_6              0
cat_var_7              0


In [8]:
df['cat_var_3'].fillna(df['cat_var_3'].value_counts().idxmax(), inplace=True)

In [9]:
#dropping all redundant features
df=df.drop(['cat_var_31', 'cat_var_33', 'cat_var_34', 'cat_var_35', 'cat_var_36'
            ,'cat_var_37', 'cat_var_38', 'cat_var_40', 'cat_var_41', 'cat_var_42'
            ,'cat_var_8'], axis=1,)
df.isnull().sum()

cat_var_1              0
cat_var_10             0
cat_var_11             0
cat_var_12             0
cat_var_13             0
cat_var_14             0
cat_var_15             0
cat_var_16             0
cat_var_17             0
cat_var_18             0
cat_var_19             0
cat_var_2              0
cat_var_20             0
cat_var_21             0
cat_var_22             0
cat_var_23             0
cat_var_24             0
cat_var_25             0
cat_var_26             0
cat_var_27             0
cat_var_28             0
cat_var_29             0
cat_var_3              0
cat_var_30             0
cat_var_32             0
cat_var_39             0
cat_var_4              0
cat_var_5              0
cat_var_6              0
cat_var_7              0
cat_var_9              0
num_var_1              0
num_var_2              0
num_var_3              0
num_var_4              0
num_var_5              0
num_var_6              0
num_var_7              0
target            523466
transaction_id         0


In [10]:
label_enc = LabelEncoder()
df['cat_var_1']=label_enc.fit_transform(df['cat_var_1'])
df['cat_var_2']=label_enc.fit_transform(df['cat_var_2'])
df['cat_var_3']=label_enc.fit_transform(df['cat_var_3'])
df['cat_var_4']=label_enc.fit_transform(df['cat_var_4'])
df['cat_var_5']=label_enc.fit_transform(df['cat_var_5'])
df['cat_var_6']=label_enc.fit_transform(df['cat_var_6'])
df['cat_var_7']=label_enc.fit_transform(df['cat_var_7'])
df['cat_var_9']=label_enc.fit_transform(df['cat_var_9'])
df['cat_var_10']=label_enc.fit_transform(df['cat_var_10'])
df['cat_var_11']=label_enc.fit_transform(df['cat_var_11'])
df['cat_var_12']=label_enc.fit_transform(df['cat_var_12'])
df['cat_var_13']=label_enc.fit_transform(df['cat_var_13'])
df['cat_var_14']=label_enc.fit_transform(df['cat_var_14'])
df['cat_var_15']=label_enc.fit_transform(df['cat_var_15'])
df['cat_var_16']=label_enc.fit_transform(df['cat_var_16'])
df['cat_var_17']=label_enc.fit_transform(df['cat_var_17'])
df['cat_var_18']=label_enc.fit_transform(df['cat_var_18'])
df

Unnamed: 0,cat_var_1,cat_var_10,cat_var_11,cat_var_12,cat_var_13,cat_var_14,cat_var_15,cat_var_16,cat_var_17,cat_var_18,...,cat_var_9,num_var_1,num_var_2,num_var_3,num_var_4,num_var_5,num_var_6,num_var_7,target,transaction_id
0,127,3,3,1,19,1,0,1,1,0,...,4,2.302632e-08,0.040182,0.0,1.800000e-07,2.302632e-08,2.368421e-08,1.115205e-08,0.0,id_11
1,65,15,3,3,14,1,1,1,1,1,...,2,7.965789e-06,0.157872,0.0,2.105000e-06,2.769737e-07,7.965789e-06,2.433058e-06,0.0,id_33
2,127,12,0,4,16,1,0,1,1,0,...,2,7.828947e-08,0.089140,0.0,3.550000e-07,4.671053e-08,1.052632e-07,4.276014e-07,0.0,id_51
3,127,5,3,1,3,11,0,1,1,0,...,1,7.894737e-08,0.227239,0.0,1.050000e-06,1.381579e-07,2.190789e-07,1.848054e-08,0.0,id_54
4,65,20,2,3,14,1,1,1,1,1,...,1,3.321053e-06,0.160410,0.0,2.105000e-06,2.769737e-07,3.340789e-06,2.152983e-06,0.0,id_62
5,127,17,4,4,16,1,0,1,1,0,...,4,1.953947e-07,0.078566,0.0,3.550000e-07,4.671053e-08,4.407895e-08,9.463310e-08,0.0,id_67
6,155,17,4,4,7,10,0,1,0,1,...,0,1.736842e-07,0.600508,0.0,4.575000e-06,5.986842e-07,1.736842e-07,8.411831e-08,0.0,id_71
7,65,9,4,3,14,1,1,1,0,1,...,3,5.789474e-08,0.152374,0.0,2.105000e-06,2.769737e-07,5.789474e-08,2.453451e-08,0.0,id_88
8,324,16,3,0,10,2,0,0,1,1,...,4,3.289474e-08,0.093053,0.0,7.500000e-07,8.223684e-08,9.868421e-08,1.274520e-08,0.0,id_95
9,65,15,2,3,14,1,1,1,0,1,...,4,5.500000e-07,0.158401,0.0,2.105000e-06,2.769737e-07,5.500000e-07,3.138505e-07,0.0,id_97


In [11]:
df_train =df.iloc[:348978,:]
df_test =df.iloc[348978:,:]
df_test.head()

Unnamed: 0,cat_var_1,cat_var_10,cat_var_11,cat_var_12,cat_var_13,cat_var_14,cat_var_15,cat_var_16,cat_var_17,cat_var_18,...,cat_var_9,num_var_1,num_var_2,num_var_3,num_var_4,num_var_5,num_var_6,num_var_7,target,transaction_id
0,65,13,0,3,14,1,1,1,1,1,...,3,4.736842e-07,0.162737,0.0,2.105e-06,2.769737e-07,4.828947e-07,1.30001e-07,,id_1
1,324,4,1,0,10,2,0,0,1,1,...,4,6.578947e-08,0.086391,0.0,7.5e-07,1.315789e-07,9.868421e-08,3.1863e-08,,id_6
2,65,21,2,3,14,1,1,1,0,1,...,0,1.585526e-07,0.159987,0.0,2.105e-06,2.769737e-07,1.585526e-07,4.715724e-08,,id_9
3,324,9,4,0,10,2,0,0,1,1,...,2,8.552632e-08,0.088083,0.0,7.5e-07,8.552632e-08,9.868421e-08,4.14219e-08,,id_14
4,127,12,0,4,16,1,0,1,1,0,...,3,1.144737e-07,0.089563,0.0,3.55e-07,4.671053e-08,4.407895e-08,1.452953e-07,,id_15


In [12]:
X = df_train.loc[:,['cat_var_1',
 'cat_var_10', 'cat_var_11', 'cat_var_12', 'cat_var_13', 'cat_var_14',
 'cat_var_15', 'cat_var_16', 'cat_var_17', 'cat_var_18', 'cat_var_19',
 'cat_var_2', 'cat_var_20', 'cat_var_21', 'cat_var_22', 'cat_var_23',
 'cat_var_24', 'cat_var_25', 'cat_var_26', 'cat_var_27', 'cat_var_28',
 'cat_var_29', 'cat_var_3', 'cat_var_30', 'cat_var_32', 'cat_var_39', 'cat_var_4',
 'cat_var_5', 'cat_var_6', 'cat_var_7', 'cat_var_9', 'num_var_1', 'num_var_2',
 'num_var_3', 'num_var_4', 'num_var_5', 'num_var_6', 'num_var_7']]

Y = df_train.loc[:, 'target']

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split( X, Y,
                                        test_size=0.33, random_state=7)


## Building a baseline model


In [21]:
def create_baseline():
    # create model
    model = Sequential()
    model.add(Dense(40, input_dim=38, kernel_initializer='normal', activation='sigmoid'))
    model.add(Dense(40, kernel_initializer='normal',activation='sigmoid'))
    model.add(Dense(20,kernel_initializer='normal',activation = 'sigmoid'))
    model.add(Dense(15,kernel_initializer='normal',activation = 'sigmoid'))
    model.add(Dense(10,kernel_initializer='normal',activation='sigmoid'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model



In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, epochs=20, batch_size=10, verbose=1, )))
pipeline = Pipeline(estimators)

pipeline.fit(X_train,Y_train)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Pipeline(steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlp', <keras.wrappers.scikit_learn.KerasClassifier object at 0x7f4249fd9750>)])

In [23]:
s_pred = pipeline.predict_proba(df_test.loc[:,['cat_var_1',
 'cat_var_10', 'cat_var_11', 'cat_var_12', 'cat_var_13', 'cat_var_14',
 'cat_var_15', 'cat_var_16', 'cat_var_17', 'cat_var_18', 'cat_var_19',
 'cat_var_2', 'cat_var_20', 'cat_var_21', 'cat_var_22', 'cat_var_23',
 'cat_var_24', 'cat_var_25', 'cat_var_26', 'cat_var_27', 'cat_var_28',
 'cat_var_29', 'cat_var_3', 'cat_var_30', 'cat_var_32', 'cat_var_39', 'cat_var_4',
 'cat_var_5', 'cat_var_6', 'cat_var_7', 'cat_var_9', 'num_var_1', 'num_var_2',
 'num_var_3', 'num_var_4', 'num_var_5', 'num_var_6', 'num_var_7']])

s_pred



array([[ 0.93226963,  0.06773036],
       [ 0.91613132,  0.0838687 ],
       [ 0.93062997,  0.06937002],
       ..., 
       [ 0.93261087,  0.06738912],
       [ 0.93119156,  0.06880843],
       [ 0.93271649,  0.06728348]], dtype=float32)

In [24]:
c=0
s_test=[]
for arr in s_pred:
    s_test.append(arr[1])
    c=c+1
s_test

[0.06773036,
 0.083868697,
 0.069370024,
 0.08212287,
 0.067292154,
 0.067278057,
 0.067787655,
 0.071897738,
 0.067676313,
 0.067289278,
 0.067294672,
 0.068438113,
 0.067285635,
 0.067292094,
 0.23559923,
 0.083033055,
 0.068822883,
 0.076612219,
 0.067290843,
 0.067289107,
 0.067971453,
 0.070918463,
 0.071905218,
 0.068228491,
 0.067474321,
 0.067289077,
 0.067292549,
 0.068083279,
 0.075130068,
 0.071400739,
 0.074302904,
 0.082449213,
 0.06729006,
 0.0672912,
 0.067441203,
 0.39725372,
 0.067361474,
 0.067756444,
 0.06728898,
 0.06856212,
 0.067290954,
 0.069190152,
 0.2352197,
 0.93346071,
 0.06729012,
 0.067538612,
 0.067497119,
 0.072033659,
 0.067289673,
 0.074568473,
 0.067290567,
 0.2432791,
 0.098894015,
 0.075771518,
 0.075005412,
 0.072821289,
 0.06759984,
 0.067289345,
 0.06729009,
 0.072626397,
 0.067293532,
 0.070056416,
 0.64499283,
 0.067289278,
 0.069508955,
 0.067290835,
 0.070624977,
 0.077571891,
 0.067290902,
 0.06729006,
 0.070579581,
 0.076172873,
 0.07015141

In [25]:
df_test['target'] = s_test
df_pred = df_test.loc[:,['target', 'transaction_id']]
df_pred.to_csv('/resources/data/brainwaves/predict_fraudulant/pred_fraud_nn3.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
