In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_csv_path = "/kaggle/input/tabular-playground-series-may-2022/train.csv"
dfTrain = pd.read_csv(train_csv_path)
dfTrain.head()

In [None]:
dfTrain.shape

In [None]:
dfTrain.target.value_counts()

In [None]:
dfTrain.f_27.value_counts()

In [None]:
dfTrain.info()

In [None]:
dfTrain.columns

In [None]:
# Remove the `id` column. Reassigning (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: re-running it no longer raises
# KeyError because the column is already gone.
dfTrain = dfTrain.drop(columns=["id"], errors="ignore")

In [None]:
# Drop f_27, the only string column, so the remaining frame is purely numeric.
# errors="ignore" makes the cell safe to re-run after the column has been removed.
dfTrain = dfTrain.drop(columns=["f_27"], errors="ignore")

In [None]:
# Total number of missing cells across the whole frame (per-column NaN counts, summed).
dfTrain.isna().sum().sum()

In [None]:
# Pairwise Pearson correlation between all remaining columns
# (Pearson is the default method of DataFrame.corr).
pearson_corr = dfTrain.corr()

pearson_corr

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
sns.set(rc={'figure.figsize':(16,8)})
%matplotlib inline
sns.set()

In [None]:
# Annotated heatmap of the Pearson correlation matrix.
# `linewidths` is seaborn's documented parameter for the cell separators; the
# original passed `linewidth`, which bypassed it and was forwarded to matplotlib
# alongside seaborn's own default `linewidths=0`.
sns.heatmap(
    pearson_corr,
    linewidths=1,
    annot=True,
    annot_kws={'size': 10},
)

plt.title('Pearson correlations', fontsize=25)

plt.show()

In [None]:

# Reuse the matrix computed earlier instead of recomputing it on the full frame:
# DataFrame.corr() defaults to Pearson, so `dfTrain.corr()` is the same matrix as
# `pearson_corr`.
corr = pearson_corr
sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)
plt.title('Pearson correlations (no annotations)')
plt.show()

In [None]:
import tensorflow as tf

In [None]:
# Separate the predictors from the target without modifying dfTrain itself.
dfTrain_labels = dfTrain['target'].copy()
dfTrain_features = dfTrain.drop(columns='target')

In [None]:
# Every column except 'target' is a model input.
features = dfTrain.columns.tolist()
features.remove('target')
print(features)

# Design matrix and label vector used for the train/test split below.
X = dfTrain.loc[:, features]
y = dfTrain.loc[:, 'target']

In [None]:
# Leave the frame as the cell's last expression so the notebook renders it as a
# rich table instead of the plain-text dump that print() produces.
X.head()

In [None]:
# Show the first labels via the cell's display hook rather than print().
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Shapes of the four split parts, in the order: X_train, y_train, X_test, y_test.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

In [None]:
import keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Size the input layer from the training matrix instead of hard-coding 30, so the
# network stays consistent if the set of feature columns changes.
n_features = X_train.shape[1]

# One hidden ReLU layer followed by a single sigmoid unit that outputs P(target == 1).
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=n_features))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 50 epochs in mini-batches of 100, evaluating on the held-out split after each epoch.
fit_options = dict(validation_data=(X_test, y_test), epochs=50, batch_size=100)
hist = model.fit(X_train, y_train, **fit_options)


In [None]:
# Per-epoch accuracy recorded by model.fit for the training and validation data.
acc = hist.history['accuracy']
val = hist.history['val_accuracy']
epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, '-', label='Training accuracy')
plt.plot(epochs, val, ':', label='Validation accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
# The original ended with plt.plot(), which draws nothing and only echoes an empty
# list; plt.show() is what renders the finished figure.
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

# model.predict returns an (n, 1) array of probabilities. Threshold at 0.5 and
# flatten to a 1-D integer label vector so it has the same shape and dtype family
# as y_test before the two are compared.
y_predicted = (model.predict(X_test) > 0.5).astype(int).ravel()
mat = confusion_matrix(y_test, y_predicted)
labels = ['0', '1']

sns.heatmap(mat, square=True, annot=True, fmt='d', cbar=False, cmap='Blues',
            xticklabels=labels, yticklabels=labels)

plt.xlabel('Predicted label')
plt.ylabel('Actual label')
# Render the figure explicitly so the cell does not end on a stray Text(...) repr.
plt.show()

#### With LIME, let's see which features drive the model's decisions toward class 0 or class 1

In [None]:
import lime

In [None]:
def prob(data, predictor=None):
    """Return per-class probabilities for LIME's ``predict_fn`` argument.

    LIME fits its local surrogate model on the *probabilities* returned by this
    function. The previous version thresholded the network output at 0.5 and
    returned hard 0/1 labels, which discards the signal LIME needs to weight
    features, and it printed every perturbed batch as leftover debugging.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Rows to score, as passed in by LIME.
    predictor : object with a ``predict(data)`` method, optional
        Model to query. Defaults to the notebook-level ``model``; its output is
        treated as the probability of class 1 for each row.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 2)
        Column 0 is P(class 0) = 1 - p, column 1 is P(class 1) = p.
    """
    if predictor is None:
        predictor = model
    # Force a single column so hstack yields exactly two columns per row.
    positive = np.asarray(predictor.predict(data), dtype=float).reshape(-1, 1)
    return np.hstack((1.0 - positive, positive))

In [None]:
import lime.lime_tabular

# Build the explainer from the data the network was actually fitted on:
#  * no .astype(int) -- truncating continuous features to integers distorts the
#    feature statistics LIME uses to generate perturbed samples;
#  * training_labels now belongs to the same rows as the training data (the
#    original paired the full X with y_train, which have different lengths);
#  * random_state makes the sampled perturbations repeatable between runs.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.to_numpy(dtype=float),
    mode='classification',
    training_labels=y_train.to_numpy(),
    feature_names=list(X_train.columns),
    class_names=['0', '1'],
    random_state=0,
)

In [None]:
idx = 25
exp = explainer.explain_instance(X.loc[idx,X.columns].astype(int).values, prob, num_features=30)

In [None]:
exp.show_in_notebook(show_table=True)

In [None]:
idx = 1250
exp = explainer.explain_instance(X.loc[idx,X.columns].astype(int).values, prob, num_features=30)

In [None]:
exp.show_in_notebook(show_table=True)

In [None]:
idx = 800000
exp = explainer.explain_instance(X.loc[idx,X.columns].astype(int).values, prob, num_features=30)

In [None]:
exp.show_in_notebook(show_table=True)

In [None]:
idx = 700000
exp = explainer.explain_instance(X.loc[idx,X.columns].astype(int).values, prob, num_features=30)

In [None]:
exp.show_in_notebook(show_table=True)