In [151]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats  # explicit: the notebook calls scipy.stats.ks_2samp
import seaborn as sns
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras.optimizers import Adam

In [152]:
# Load the raw data set; read_csv already defaults to a comma separator.
df = pd.read_csv("GRIR_GCP_Data.csv")

In [153]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,WERKS,SCENARIO,KTOKK,VSTATU,VPATD,EKORG,EKGRP,TOTGRQTY,TOTIRQTY,NODLGR,NODLIR,DIFGRIRD,DIFGRIRV,STATUS
0,ML01,3,1,1,30,1,A,0,80,0,90,-80,-38100,1
1,ML01,3,1,1,30,1,A,0,107,0,177,-107,-41600,0
2,ML01,3,1,1,30,1,A,0,107,0,152,-107,-27600,1
3,ML01,3,1,1,30,1,A,0,96,0,79,-96,-13800,1
4,ML01,3,1,1,30,1,A,0,146,0,192,-146,-73500,0


In [154]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8279 entries, 0 to 8278
Data columns (total 14 columns):
WERKS       8279 non-null object
SCENARIO    8279 non-null int64
KTOKK       8279 non-null int64
VSTATU      8279 non-null int64
VPATD       8279 non-null int64
EKORG       8279 non-null int64
EKGRP       8279 non-null object
TOTGRQTY    8279 non-null int64
TOTIRQTY    8279 non-null int64
NODLGR      8279 non-null int64
NODLIR      8279 non-null int64
DIFGRIRD    8279 non-null int64
DIFGRIRV    8279 non-null int64
STATUS      8279 non-null int64
dtypes: int64(12), object(2)
memory usage: 905.6+ KB


In [155]:
# Integer-coded columns that are converted to the pandas 'category' dtype.
categorical_columns = ('SCENARIO', 'KTOKK', 'VSTATU', 'EKORG')
df = df.astype({name: 'category' for name in categorical_columns})

# Show the dtypes again to confirm the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8279 entries, 0 to 8278
Data columns (total 14 columns):
WERKS       8279 non-null object
SCENARIO    8279 non-null category
KTOKK       8279 non-null category
VSTATU      8279 non-null category
VPATD       8279 non-null int64
EKORG       8279 non-null category
EKGRP       8279 non-null object
TOTGRQTY    8279 non-null int64
TOTIRQTY    8279 non-null int64
NODLGR      8279 non-null int64
NODLIR      8279 non-null int64
DIFGRIRD    8279 non-null int64
DIFGRIRV    8279 non-null int64
STATUS      8279 non-null int64
dtypes: category(4), int64(8), object(2)
memory usage: 679.7+ KB


In [156]:
# Separate the target column (STATUS) from the model inputs.
target_column = 'STATUS'
df_y = df[target_column]
df_x = df.drop(columns=[target_column])

In [157]:
# Try ten stratified 90/10 splits (random_state 0..9). For each split, compare
# the train and test distribution of every non-object column with a two-sample
# Kolmogorov-Smirnov test. p_df gets one row per candidate seed ('Random') and
# one p-value column per tested feature; the split to use is then picked by
# manual inspection of that table.
# NOTE(review): SCENARIO / KTOKK / VSTATU / EKORG are 'category' dtype at this
# point, so their KS p-values are only a rough indication - confirm intended.
t_res = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        df_x, df_y, test_size=0.1, random_state=i, stratify=df_y
    )

    # One result row per seed; the seed is recorded once, up front.
    p_res = {'Random': i}
    for c_ in X_train.columns:
        if X_train[c_].dtype == 'object':
            continue  # free-text / string columns cannot be KS-tested
        try:
            ks_stat, p_value = scipy.stats.ks_2samp(X_train[c_].values, X_test[c_].values)
        except (TypeError, ValueError) as err:
            # Best effort: report the failure and keep the column numeric with
            # NaN instead of silently storing a string. Any other exception
            # (e.g. a missing import) is allowed to surface.
            print('KS test failed for column {} (seed {}): {}'.format(c_, i, err))
            p_value = np.nan
        p_res[c_] = p_value
    t_res.append(p_res)

p_df = pd.DataFrame(t_res)
p_df

Unnamed: 0,DIFGRIRD,DIFGRIRV,EKORG,KTOKK,NODLGR,NODLIR,Random,SCENARIO,TOTGRQTY,TOTIRQTY,VPATD,VSTATU
0,0.865186,0.85032,0.99653,1.0,0.997738,0.485958,0,0.879783,0.194014,0.33862,0.9905,1.0
1,0.987979,0.835343,0.99653,0.787057,0.830635,0.727003,1,0.999995,0.596619,0.107376,0.989171,1.0
2,0.614764,0.205466,0.999289,1.0,0.093186,0.209803,2,0.711002,0.312491,0.79712,0.665292,1.0
3,0.287316,0.358338,0.774537,0.999998,0.038072,0.248484,3,0.709403,0.027086,0.894614,0.999331,1.0
4,0.06127,0.108989,0.922054,0.959982,0.549862,0.205517,4,0.205517,0.544833,0.205517,1.0,0.998587
5,0.640502,0.50172,0.414625,1.0,0.77671,0.514906,5,0.975722,0.791521,0.587118,0.997557,0.982422
6,0.286878,0.179726,1.0,0.999825,0.957352,0.895561,6,0.881185,0.739157,0.430504,0.881883,0.998032
7,0.922265,0.890104,0.275618,0.980325,0.810987,0.996811,7,0.999995,0.562305,0.28761,0.698262,0.999999
8,0.549601,0.666214,0.71464,0.999998,0.941271,0.227629,8,0.825911,0.769461,0.882082,0.70847,0.998032
9,0.49986,0.953603,1.0,0.109492,0.504285,0.44754,9,1.0,0.869874,0.665397,0.987354,1.0


In [158]:
# Final train / validation split.
# The seed is the split chosen after manually inspecting the p-value table
# produced by the candidate-split comparison.
BEST_SPLIT_SEED = 8
x_train, x_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.1, random_state=BEST_SPLIT_SEED, stratify=df_y
)

# train_test_split hands back slices of df_x. Later cells add and overwrite
# columns on these frames, so take independent copies here; otherwise pandas
# emits SettingWithCopyWarning on every assignment.
x_train = x_train.copy()
x_test = x_test.copy()

x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7451 entries, 692 to 315
Data columns (total 13 columns):
WERKS       7451 non-null object
SCENARIO    7451 non-null category
KTOKK       7451 non-null category
VSTATU      7451 non-null category
VPATD       7451 non-null int64
EKORG       7451 non-null category
EKGRP       7451 non-null object
TOTGRQTY    7451 non-null int64
TOTIRQTY    7451 non-null int64
NODLGR      7451 non-null int64
NODLIR      7451 non-null int64
DIFGRIRD    7451 non-null int64
DIFGRIRV    7451 non-null int64
dtypes: category(4), int64(7), object(2)
memory usage: 611.7+ KB


In [159]:
# Largest NODLGR value in the training split (ad-hoc inspection).
x_train['NODLGR'].max()

268

In [160]:
# Feature engineering.
# This cell replaces x_train / x_test with transformed versions, so it must run
# exactly once after the split cell. Running it again would compute the ratio
# feature from already-scaled columns (VPATD scaled to 0 -> division by zero).
if 'grminusirbyvpatd' in x_train.columns:
    raise RuntimeError(
        'Feature engineering was already applied to x_train; '
        're-run the train/test split cell before running this cell again.'
    )

# (TOTGRQTY - TOTIRQTY) / VPATD, computed from the raw (unscaled) columns.
# .assign returns new frames, so nothing is written into a slice of df_x.
x_train = x_train.assign(
    grminusirbyvpatd=(x_train['TOTGRQTY'] - x_train['TOTIRQTY']) / x_train['VPATD']
)
x_test = x_test.assign(
    grminusirbyvpatd=(x_test['TOTGRQTY'] - x_test['TOTIRQTY']) / x_test['VPATD']
)

# Min-max scale the numeric columns. The minimum and range come from the
# TRAINING split only and are applied unchanged to the test split, so both
# splits share one scale and no test-set statistics leak into preprocessing.
# Test values may therefore fall slightly outside [0, 1].
for c_ in ['VPATD', 'TOTGRQTY', 'TOTIRQTY', 'NODLGR', 'NODLIR', 'DIFGRIRD', 'DIFGRIRV']:
    col_min = x_train[c_].min()
    col_range = x_train[c_].max() - col_min
    if col_range == 0:
        col_range = 1  # constant column: map to 0.0 instead of dividing by zero
    x_train[c_] = (x_train[c_] - col_min) / col_range
    x_test[c_] = (x_test[c_] - col_min) / col_range

x_train.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

Unnamed: 0,VPATD,TOTGRQTY,TOTIRQTY,NODLGR,NODLIR,DIFGRIRD,DIFGRIRV,grminusirbyvpatd
count,7451.0,7451.0,7451.0,7451.0,7451.0,7451.0,7451.0,7451.0
mean,0.506039,0.329276,0.470508,0.384943,0.370619,0.626838,0.509348,-0.578462
std,0.405868,0.31487,0.31283,0.306502,0.311038,0.231224,0.169547,1.439549
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.666667
25%,0.0,0.0,0.27,0.0,0.043388,0.525547,0.488085,-0.833333
50%,0.5,0.275,0.495,0.41791,0.338843,0.722628,0.55456,-0.033333
75%,1.0,0.605,0.735,0.641791,0.636364,0.770073,0.599851,0.2
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.666667


In [161]:
# (rows, columns) of the training features at this point.
x_train.shape

(7451, 14)

In [162]:
# One-hot encode the categorical / object columns.
x_train = pd.get_dummies(x_train)

# Encode the test split, then force it onto exactly the training columns, in
# the same order. A level of an object column (WERKS, EKGRP) that is missing
# from the small test split would otherwise give the two frames different
# columns; such absent levels are filled with 0, and levels never seen in
# training are dropped.
x_test = pd.get_dummies(x_test).reindex(columns=x_train.columns, fill_value=0)

print(x_train.shape)
print(x_test.shape)

(7451, 24)
(828, 24)


In [163]:
# Number of columns in x_train after encoding (ad-hoc inspection).
x_train.shape[1]

24

In [222]:
# ---------------------------------------------------------------------------
# Keras model: three ReLU hidden layers (32 -> 32 -> 128 units, He-uniform
# initialisation) followed by a single sigmoid output unit.
# (A further Dense(32) hidden layer before the output is currently disabled.)
# ---------------------------------------------------------------------------
n_features = x_train.shape[1]
hidden_kwargs = dict(activation='relu', kernel_initializer='he_uniform')

model = Sequential([
    Dense(32, input_dim=n_features, **hidden_kwargs),
    Dense(32, **hidden_kwargs),
    Dense(128, **hidden_kwargs),
    Dense(1, activation='sigmoid'),
])

# NOTE(review): newer Keras releases appear to name this argument
# `learning_rate` instead of `lr` - confirm against the installed version.
model.compile(
    optimizer=Adam(lr=0.05),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [223]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_226 (Dense)            (None, 32)                800       
_________________________________________________________________
dense_227 (Dense)            (None, 32)                1056      
_________________________________________________________________
dense_228 (Dense)            (None, 128)               4224      
_________________________________________________________________
dense_229 (Dense)            (None, 1)                 129       
Total params: 6,209
Trainable params: 6,209
Non-trainable params: 0
_________________________________________________________________


In [224]:
# Train for 30 epochs in mini-batches of 512 rows; the held-out split
# (x_test, y_test) is passed as validation data and evaluated every epoch.
model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=30,
    validation_data=(x_test, y_test),
)

Train on 7451 samples, validate on 828 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x21ec301eb00>