## Tabular Playground - November 2021

In [None]:
# Basic Data Preprocessing
import numpy as np
import pandas as pd 

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Data Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler,MinMaxScaler
from scipy.stats import uniform, randint

# Feature Selection
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from sklearn.decomposition import PCA

#Modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow import keras

# Metrics
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,accuracy_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score,GridSearchCV, KFold, RandomizedSearchCV,RepeatedStratifiedKFold
from skopt import BayesSearchCV


In [None]:
# Load the competition CSVs.
# The training frame is sliced from its second column onwards, which discards
# the leading ID column; the test frame is kept whole at this point.
test_df = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv')
train_df = pd.read_csv(
    '../input/tabular-playground-series-nov-2021/train.csv'
).iloc[:, 1:]

In [None]:
# Set the first column (the ID) aside as a one-column DataFrame; it is needed
# again when the submission file is assembled.
test_df_id = test_df.iloc[:, :1]
# Every remaining column forms the scoring feature frame.
test_df_X = test_df.iloc[:, 1:]

In [None]:
# Function to clip (winsorize) outliers in a column of data
def remove_outliers(x, n_std=2):
    """Clip values lying more than ``n_std`` standard deviations from the mean.

    Despite the name, no rows are removed: values above ``mean + n_std * std``
    are replaced by that upper limit and values below ``mean - n_std * std``
    by the lower limit. All other values pass through unchanged.

    Parameters
    ----------
    x : array-like exposing ``.mean()`` and ``.std()``
        The data to clip (e.g. a pandas Series or a NumPy array). The limits
        come from the object's own ``mean``/``std`` methods, so the degrees of
        freedom used for ``std`` follow that type's default.
    n_std : float, default 2
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    numpy.ndarray
        Array with the same length as ``x`` holding the clipped values.
    """
    # Compute the statistics once instead of once per limit.
    center = x.mean()
    spread = n_std * x.std()
    upper_limit = center + spread
    lower_limit = center - spread
    return np.where(
        x > upper_limit,
        upper_limit,
        np.where(x < lower_limit, lower_limit, x),
    )

In [None]:
# Separate the label from the features *before* clipping, so the outlier
# treatment is applied to the feature columns only and the `target` label
# keeps its original values and dtype.
train_df_Y = train_df['target']

# Select the features by name rather than with a hard-coded `iloc[:, :100]`,
# so the cell does not depend on the exact number of feature columns.
# NOTE(review): assumes `target` is the only non-feature column left in
# train_df at this point - confirm against the CSV schema.
train_df_X = train_df.drop(columns='target').apply(remove_outliers)

# Keep `train_df` in step with the clipped features (target stays the last
# column), as it was rebound to the clipped frame before.
train_df = train_df_X.assign(target=train_df_Y)

In [None]:
# Display the distinct dtypes present among the feature columns.
train_df_X.dtypes.unique()

In [None]:
# Partition the features by dtype and downcast each group:
# float64 columns -> float32 (the "continuous" frame),
# int64 columns   -> int8    (the "categorical" frame).
df_cont_variables = train_df_X.select_dtypes(include=['float64']).astype(np.float32)
df_cat_variables = train_df_X.select_dtypes(include=['int64']).astype(np.int8)

In [None]:
# Scale and transform dataset
def data_scaler_fit(option,df):
    """Fit and return the scaler selected by ``option``.

    Parameters
    ----------
    option : int
        1 -> ``StandardScaler``, 2 -> ``RobustScaler``, 3 -> ``MinMaxScaler``.
    df : array-like
        Data the scaler is fitted on.

    Returns
    -------
    The fitted scaler instance.

    Raises
    ------
    ValueError
        If ``option`` is not 1, 2 or 3. Previously an unsupported option fell
        through every branch and surfaced as an UnboundLocalError on return.
    """
    # Validate up front so a bad option fails with a clear message.
    if option not in (1, 2, 3):
        raise ValueError(
            f"Unsupported scaler option {option!r}; expected 1 (StandardScaler), "
            "2 (RobustScaler) or 3 (MinMaxScaler)"
        )
    if option == 1:
        scaler = StandardScaler()
    elif option == 2:
        scaler = RobustScaler()
    else:
        scaler = MinMaxScaler()
    # fit() returns the fitted estimator itself.
    return scaler.fit(df)

In [None]:
"""
# Tanh estimator : https://stackoverflow.com/questions/43061120/tanh-estimator-normalization-in-python
m = np.mean(unnormalizedData, axis=0) # array([16.25, 26.25])
std = np.std(unnormalizedData, axis=0) # array([17.45530005, 22.18529919])

data = 0.5 * (np.tanh(0.01 * ((unnormalizedData - m) / std)) + 1)
"""

In [None]:
# Fit the scaler on the continuous features; option 3 selects MinMaxScaler.
transformer = data_scaler_fit(option=3, df=df_cont_variables)

In [None]:
# Scale the training and scoring features with the fitted transformer.
# From here on train_df_X / test_df_X hold the transformer's output rather
# than the original DataFrames.
train_df_X = transformer.transform(df_cont_variables)
# Select the scoring columns from the untouched `test_df` instead of from
# `test_df_X`: this cell overwrites `test_df_X`, so indexing it by column
# name made the cell fail when it was run a second time.
test_df_X = transformer.transform(test_df[df_cont_variables.columns])
# NOTE(review): the training features were clipped with remove_outliers before
# the scaler was fitted, but the scoring features are not clipped, so the two
# sets are not preprocessed identically - confirm whether this is intended.

# Alternative without scaling:
#train_df_X = df_cont_variables.to_numpy()
#test_df_X = test_df_X[df_cont_variables.columns].to_numpy()

In [None]:
# Use PCA for dimensionality reduction
"""
pca = PCA(0.95)
pca.fit(train_df_X)

train_df_X = pca.transform(train_df_X)
test_df_X = pca.transform(test_df_X)
"""

In [None]:
"""
model = XGBClassifier(objective = 'binary:logistic',eval_metric="auc",random_state=1542,use_label_encoder=False,tree_method = 'gpu_hist')


# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# params = {
  #      "colsample_bytree": uniform(0.7, 0.3),
   #     "gamma": uniform(0, 0.5),
    #    "learning_rate": uniform(0.03, 0.3), # default 0.1 
     #   "max_depth": randint(2, 8), # default 3
      #  "n_estimators": randint(100, 1000), # default 100
       # "subsample": uniform(0.6, 0.4)
  #  }


param_grid = {
    "eta":(0.01,0.2,'uniform') ,
    "learning_rate": (0.0001, 0.3, "log-uniform"),
    "n_estimators": (100,  1000) ,
    "max_depth": (2, 12) ,
    "colsample_bytree": (0.3, 0.7,'uniform'),
    "gamma": (0, 0.5,'uniform'),
    "subsample": (0.4, 1.0)
}


search = GridSearchCV(model, param_grid, cv=cv, verbose=1, n_jobs=1, scoring='roc_auc',return_train_score=True,refit=True)

search.fit(X_train, y_train,eval_set=[(X_test, y_test)])

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", search.best_estimator_)
print("\n The best score across ALL searched params:\n", search.best_score_)
print("\n The best parameters across ALL searched params:\n", search.best_params_)
"""

In [None]:
"""
def training_models(model_type):
    # Stochastic Gradient Descent
    if model_type == 'SGD':
        model = SGDClassifier(loss="log", penalty="l2", max_iter=100)
    
    # Multi-layer Perceptron
    if model_type == 'MLP':
        model = MLPClassifier(alpha=1e-5,solver = 'sgd',learning_rate = 'adaptive',warm_start=True,early_stopping = True,
                              max_iter=500,random_state=1)
    # Decision Tree
    if model_type == 'DTC':
        model = DecisionTreeClassifier(max_depth = 10,max_features = 'auto', random_state = 1)
        
    # Random Forest
    if model_type == 'RFC':
        model = RandomForestClassifier(n_jobs = -1,n_estimators=100, warm_start=True,random_state = 1)
    
    # Gradient Boosting Classifier
    if model_type == 'GBC':
        model = GradientBoostingClassifier(loss= 'exponential',learning_rate = 0.05,n_estimators=500,max_depth=10,criterion='squared_error')
        
    # XG Boost
    if model_type == 'XGB':
        model = XGBClassifier(objective = 'binary:logistic',n_estimators=1000,eval_metric="auc",random_state=1542,tree_method = 'gpu_hist',use_label_encoder=False)
    
    
    return model
"""

In [None]:
"""
def feature_selection(model_type,train_X,train_Y):
    embeded_selector = SelectFromModel(training_models(model_type), threshold='1.25*median')
    embeded_selector.fit(train_X,train_Y)
    embeded_support = embeded_selector.get_support()
    embeded_feature = pd.DataFrame(train_X).loc[:,embeded_support].columns.tolist()
    return embeded_feature

"""

In [None]:
# final_features = feature_selection('GBC',train_df_X, train_df_Y)

In [None]:
# Hold out 20% of the training rows as an interim test split; the fixed seed
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_df_X,
    train_df_Y,
    random_state=151,
    test_size=0.20,
)

In [None]:
"""
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

model = MLPClassifier(warm_start=True,early_stopping = True,max_iter=500,random_state=1)

search = GridSearchCV(model, parameter_space, cv=3, verbose=1, n_jobs=-1, scoring='roc_auc',return_train_score=True,refit=True)

search.fit(X_train, y_train)


print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", search.best_estimator_)
print("\n The best score across ALL searched params:\n", search.best_score_)
print("\n The best parameters across ALL searched params:\n", search.best_params_)

model = search
"""

In [None]:
# Feed-forward binary classifier built layer by layer:
# 512 -> 128 -> 64 ReLU units, with 50% dropout after the first two hidden
# layers, ending in a single sigmoid output unit.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[X_train.shape[1],]))
model.add(keras.layers.Dense(512, activation="relu"))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(128, activation="relu"))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(64, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))

In [None]:
# Print the layer-by-layer summary of the network defined above.
model.summary()

In [None]:
# Stop training once the validation loss has not improved for 5 epochs and
# roll the model back to its best epoch. Validation data is passed to fit(),
# so `val_loss` is available; monitoring the training `loss` instead would
# not stop training when the model starts to overfit.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
# Upper bound on the number of epochs; early stopping may end training sooner.
epochs = 100

# Binary cross-entropy with plain SGD, tracking ROC AUC. The metric is named
# explicitly so the history keys stay 'auc' / 'val_auc' even when the cell is
# re-run in the same session.
model.compile(
    loss="binary_crossentropy",
    optimizer="sgd",
    metrics=[tf.keras.metrics.AUC(name='auc')],
)

In [None]:
# Train on the training split, evaluating on the held-out split after every
# epoch; the early-stopping callback can end training before `epochs` is hit.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    callbacks=[callback],
)

In [None]:
#Train Model

#model = training_models('MLP',X_train,y_train,X_test, y_test)
#del(model)

In [None]:
# List the names of all per-epoch series recorded in the training history.
print(history.history.keys())

In [None]:
# Summarize history for AUC.
# The metric series are looked up by name instead of by position in the
# history dict, because the ordering of the logged keys is not guaranteed to
# be (loss, metric, val_loss, val_metric) across Keras versions.
# Keras names an unnamed AUC metric 'auc', 'auc_1', ... so match on the prefix.
auc_key = next(key for key in history.history if key.startswith('auc'))
plt.plot(history.history[auc_key])
plt.plot(history.history['val_' + auc_key])
plt.title('Model AUC')
plt.ylabel('AUC')
plt.xlabel('Epoch')
# 'test' here refers to the held-out split passed as validation_data.
plt.legend(['train', 'test'], loc='lower right')
plt.show()
# Summarize history for loss.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper right')
plt.show()

In [None]:
# Score the interim test split and turn the sigmoid outputs into hard 0/1
# labels by rounding, stored in a one-column DataFrame.
target_predict = pd.DataFrame(
    data=model.predict(X_test).round().astype(int),
    columns=['pred_target'],
)

In [None]:
# Confusion matrix of the interim-test predictions, normalised over the true
# labels, rendered with scikit-learn's display helper.
conf_metrix = confusion_matrix(
    y_true=y_test,
    y_pred=target_predict.to_numpy(),
    normalize='true',
)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_metrix)
disp.plot()
plt.show()

In [None]:
# Report the accuracy on the interim test split as a percentage.
print(
    'Accuracy: ',
    str(accuracy_score(y_test, target_predict.to_numpy()) * 100),
    '%',
    sep='',
)

### Train Model on Whole data before predicting actual test data

In [None]:
#Train Model
#model = training_models('MLP',train_df_X,train_df_Y.to_numpy(),X_test, y_test)

### Predict and Submit to leaderboard

In [None]:
# Score the real test set; the raw model outputs are kept as-is (no rounding)
# in a single 'target' column.
probability = pd.DataFrame(
    data=model.predict(test_df_X),
    columns=['target'],
)

In [None]:
# Place the saved ID column next to the predictions and write the result to
# disk without the DataFrame index.
submission = pd.concat(objs=[test_df_id, probability], axis='columns')
submission.to_csv('submission.csv', index=False)