In [None]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training CSV (the path is relative to the notebook's working directory).
df=pd.read_csv('../input/porto-seguro-safe-driver-prediction/train.csv')
# Preview the first rows of the raw frame.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Per-column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
def percent(x, y):
    """Return ``x`` expressed as a percentage of ``y``.

    Both arguments are coerced with ``float()`` before the division, so
    numeric strings and NumPy scalars are accepted. ``y == 0`` raises
    ``ZeroDivisionError``.
    """
    part = float(x)
    whole = float(y)
    return (100 * part) / whole

In [None]:
# Look the class counts up by label rather than unpacking value_counts()
# positionally. value_counts() orders its rows by frequency, so positional
# unpacking only names the classes correctly while class 0 outnumbers class 1,
# and it raises if the column does not hold exactly two distinct values.
# Assumes `target` is encoded 0 = no claim, 1 = claim — confirm against the data.
target_counts = df.target.value_counts()
no_claim = target_counts.get(0, 0)
claim = target_counts.get(1, 0)
print(f'No claim {no_claim}')
print(f'Claim {claim}')
print(f'Claim percentage {round(percent(claim,claim + no_claim),2)} %')

In [None]:
# Bar chart of the number of rows in each target class.
sns.countplot(x='target',data=df)

In [None]:
# Report every column that contains the value -1 (the marker the imputers
# further down treat as missing), with its count and its share of all rows.
n_rows = df.shape[0]
for col in df.columns:
    count = (df[col] == -1).sum()
    if count > 0:
        print(f'{col} -- {count} ({round(percent(count,n_rows),2)}%)')

In [None]:
# Columns removed outright instead of being imputed.
# Note: not idempotent — re-running this cell raises KeyError once they are gone.
cols_to_drop = ["ps_car_03_cat", "ps_car_05_cat", "ps_reg_03"]
df = df.drop(columns=cols_to_drop)

In [None]:
# Re-check dtypes and non-null counts after the column drop.
df.info()


In [None]:
# (rows, columns) after the column drop.
df.shape

In [None]:
from sklearn.impute import SimpleImputer
# Columns whose -1 entries get replaced, grouped by the strategy applied to them.
cat_cols = [
    'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_07_cat',
    'ps_car_09_cat',
]
num_cols = ['ps_car_11', 'ps_car_12', 'ps_car_14']

# Both imputers treat -1 as the missing-value marker: numeric columns get the
# column mean, categorical columns get the most frequent value.
num_imp = SimpleImputer(strategy='mean', missing_values=-1)
cat_imp = SimpleImputer(strategy='most_frequent', missing_values=-1)

In [None]:
# Fit and apply the matching imputer one column at a time (categorical columns
# first, then numeric), so each replacement value comes from that column alone.
# NOTE(review): the statistics are computed on the full frame, i.e. before the
# train/test split performed later in the notebook.
for imputer, cols in ((cat_imp, cat_cols), (num_imp, num_cols)):
    for col in cols:
        df[col] = imputer.fit_transform(df[[col]]).ravel()

In [None]:
# One-hot encode the categorical columns. The dtype is pinned so the indicator
# columns are numeric on every pandas version: pandas 2.0 changed the default
# from uint8 to bool, and bool columns mixed with int/float columns turn the
# feature matrix into an object array when it is converted for Keras.
df=pd.get_dummies(df,columns=cat_cols,dtype=np.uint8)

In [None]:
# Preview the frame after one-hot encoding.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Feature columns are everything from position 2 onward.
# NOTE(review): this presumes the first two columns are a row identifier and
# `target` — confirm against the CSV header; a reordered file would leak the label.
labels=df.columns[2:]
X=df[labels]
# Label column.
y=df['target']

In [None]:
# Hold out 5% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.05,
)

In [None]:
def build_model(train_data,metrics=['accuracy']):
    """Build and compile a one-hidden-layer binary classifier.

    Architecture: Dense(36, relu) -> BatchNormalization -> Dropout(0.25)
    -> Dense(1, sigmoid), compiled with Adam (learning rate 0.001) and
    binary cross-entropy loss.

    Args:
        train_data: Object whose ``shape[-1]`` is the number of input
            features; only its shape is read.
        metrics: Metrics forwarded to ``model.compile``. Defaults to accuracy.

    Returns:
        The compiled, untrained ``keras.Sequential`` model.
    """
    n_features = train_data.shape[-1]
    model=keras.Sequential([
        keras.layers.Dense(units=36,activation='relu',input_shape=(n_features,)),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.25),
        keras.layers.Dense(units=1,activation='sigmoid'),
    ])
    # Hand Keras a copy of a list argument so the shared default list object is
    # never passed out of this function; other values (None, dict) go through as-is.
    compile_metrics = list(metrics) if isinstance(metrics, list) else metrics
    model.compile(
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated in TF 2.x and rejected by newer Keras optimizers.
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=compile_metrics,
    )
    return model

In [None]:
# Build the classifier; the training frame is used only to size the input layer.
model=build_model(X_train)

In [None]:
# Train for 20 epochs with large mini-batches, holding out 5% of the training
# data for per-epoch validation. BATCH_SIZE is reused when evaluating later.
BATCH_SIZE = 2048
history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=BATCH_SIZE,
    validation_split=0.05,
    shuffle=True,
    verbose=2,
)

In [None]:
def plot_accuracy(history):
    """Plot training and validation accuracy per epoch.

    Args:
        history: Object returned by ``model.fit``. Its ``history`` dict must
            contain ``'accuracy'`` and ``'val_accuracy'`` series, and its
            ``epoch`` attribute the matching epoch indices.
    """
    # Keep the History object and the DataFrame under separate names. Rebinding
    # `history` to the DataFrame and then reading `history.epoch` raises
    # AttributeError, because the frame has no `epoch` column at that point.
    metrics_df = pd.DataFrame(history.history)
    metrics_df['epoch'] = history.epoch

    plt.figure()
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.plot(metrics_df['epoch'], metrics_df['accuracy'], label='Training Accuracy')
    plt.plot(metrics_df['epoch'], metrics_df['val_accuracy'], label='Validation Accuracy')
    # Accuracy is a fraction, so pin the y-axis to the full [0, 1] range.
    plt.ylim((0, 1))
    plt.legend()
    plt.show()

In [None]:
# Loss and compiled metrics on the held-out test split.
model.evaluate(X_test,y_test,batch_size=BATCH_SIZE)

Create a dummy array of all-zero predictions and evaluate it the same way. It still reaches about 96% accuracy, which means accuracy is a poor evaluation metric for this problem.

In [None]:
def awesome_model_predict(features):
    """Baseline "model": predict class 0 for every row of ``features``.

    Only ``features.shape[0]`` is read. Returns a 1-D integer array of zeros
    with one entry per row.
    """
    n_rows = features.shape[0]
    return np.zeros(n_rows, dtype=int)
# All-zero baseline predictions, one per row of the test split.
y_pred = awesome_model_predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# The score is unchanged because accuracy is symmetric in its two arguments.
accuracy_score(y_test,y_pred)