In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import statsmodels.formula.api as smf                # logistic regression
from sklearn.model_selection import train_test_split # train/test split
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.linear_model import Lasso
from sklearn.metrics import confusion_matrix         # confusion matrix
from sklearn.metrics import roc_auc_score            # auc score
from sklearn.neighbors import KNeighborsClassifier   # KNN for classification
from sklearn.preprocessing import StandardScaler     # standard scaler

# CART model packages
from sklearn.tree import DecisionTreeClassifier      # classification trees
from sklearn.tree import export_graphviz             # exports graphics
from six import StringIO                             # saves objects in memory
from IPython.display import Image                    # displays on frontend

# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV  # hyperparameter tuning
from sklearn.metrics import make_scorer                 # customizable scorer

# Ensemble Modeling
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier # gbm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the read-only Kaggle input
# directory (in os.walk order), then echo each one.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import data as dataframe

In [None]:
# Read the competition CSVs; each file's `id` column becomes the row index
# instead of being kept as a regular column.
train_df = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-nov-2021/train.csv',
    index_col='id',
)
test_df = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-nov-2021/test.csv',
    index_col='id',
)

## Exploratory Data Analysis

In [None]:
# Preview the first rows of the training frame (rich display as the cell's last expression)
train_df.head()

In [None]:
# Preview the first rows of the test frame (rich display as the cell's last expression)
test_df.head()

In [None]:
# Report (n_rows, n_columns) for the training frame first, then the test frame
for frame in (train_df, test_df):
    print(frame.shape)

Train data count: 600,000 rows / Test data count: 540,000 rows

In [None]:
# Print the dtype of every column in the training frame
print(train_df.dtypes)

In [None]:
# Print the dtype of every column in the test frame
print(test_df.dtypes)

All feature columns are float64; the target column is an integer.

In [None]:
# Count the missing values in each column of the training frame
# (axis = 0 sums the null flags down the rows, giving one total per column).
train_df.isnull().sum(axis = 0)

In [None]:
# Count the missing values in each column of the test frame (one total per column)
test_df.isnull().sum(axis = 0)

There are no missing values in either dataframe.

In [None]:
# Show how many rows fall into each target class, as a bar chart and as raw counts.

y = train_df["target"]

# Pass the series by keyword: seaborn's plotting functions expect `data` as the
# first positional argument (older releases only warn when a vector is passed
# positionally), so `x=` makes the intent explicit on every version.
# The trailing semicolon hides the Axes repr in the cell output.
sns.countplot(x = y);

# Same information as numbers
target_temp = y.value_counts()

print(target_temp)

In [None]:
# Plot the kernel-density estimate of every column of train_df, one panel per column.
FEATS = list(train_df.columns)

# Five panels per row; ceil-divide so the grid always has room for every column.
n_cols = 5
n_rows = -(-len(FEATS) // n_cols)

# Start from a bare figure: plt.subplots() would also create one full-figure
# Axes that is never used and ends up underneath the grid of panels.
fig = plt.figure(figsize = (15, 70))

for i, val in enumerate(FEATS):
    # Draw on an explicit Axes rather than relying on pyplot's "current axes" state
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.kdeplot(data = train_df, x = val, ax = ax)

In [None]:
# Print the pairwise correlation matrix of all columns in train_df
# (pandas' default method; the `target` column is still part of the frame here).
print(train_df.corr())

## Scaling feature values

### MinMaxScaler Transform

<ul>
<li> Normalization is a rescaling of the data from the original range so that all values are within the new range of 0 and 1. </li> 
<li> Data scaling is a recommended pre-processing step when working with many machine learning algorithms. (<a href="https://machinelearningmastery.com/standardscaler-and-minmaxscaler-transforms-in-python/">Jason Brownlee</a>, 2020) </li> 
</ul>

In [None]:
# Min-max scale the training frame (this is MinMaxScaler, not a robust scaler).
# The scaler is fit on every column of train_df, which at this point still
# includes `target`.
# NOTE: `train_df` is rebound to the scaler's output; the next cells convert it
# back to a DataFrame and re-assign the column names by position.
from sklearn.preprocessing import MinMaxScaler
trans = MinMaxScaler()
train_df = trans.fit_transform(train_df)

In [None]:
# `DataFrame` is imported here and used again later when the test set is scaled.
from pandas import DataFrame
# Wrap the scaled values in a DataFrame again. No column names are passed, so the
# columns get default integer labels until they are renamed in the next cell.
train_df = DataFrame(train_df)
# Summary statistics per column, to check the result of the scaling
print(train_df.describe())

In [None]:
# Restore the column labels positionally: 'f0' ... 'f99' for the 100 features,
# followed by 'target' as the last column.
feature_names = [f'f{idx}' for idx in range(100)]
train_df.columns = feature_names + ['target']

In [None]:
# Instantiate a statsmodels logit model of `target` on the scaled features.
# The formula lists f1-f99 except f38, f52, f72 and f92; f0 is not listed either.
# NOTE(review): the reason these five features are left out is not shown in the
# notebook - confirm the omissions (f0 in particular) are intentional.
logistic_full = smf.logit(formula = """  target ~ 
                                         f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8 + f9 + f10 +
                                         f11 + f12 + f13 + f14 + f15 + f16 + f17 + f18 + f19 + f20 + 
                                         f21 + f22 + f23 + f24 + f25 + f26 + f27 + f28 + f29 + f30 +
                                         f31 + f32 + f33 + f34 + f35 + f36 + f37 + f39 + f40 +
                                         f41 + f42 + f43 + f44 + f45 + f46 + f47 + f48 + f49 + f50 +
                                         f51 + f53 + f54 + f55 + f56 + f57 + f58 + f59 + f60 + 
                                         f61 + f62 + f63 + f64 + f65 + f66 + f67 + f68 + f69 + f70 +
                                         f71 + f73 + f74 + f75 + f76 + f77 + f78 + f79 + f80 +
                                         f81 + f82 + f83 + f84 + f85 + f86 + f87 + f88 + f89 + f90 +
                                         f91 + f93 + f94 + f95 + f96 + f97 + f98 + f99 """,
                                         data = train_df)

# Fit the model on the full (scaled) training frame
results_full = logistic_full.fit()


# Display the fitted model's summary table as the cell output
results_full.summary()

## Experiment with the validation split

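In the following code cell, the target column is separated from the features and `train_test_split` is called with `test_size = 0.1`. The `test_size` argument specifies the proportion of the original training set that will serve as the validation set, so 10% of the rows are held out for validation and the remaining 90% are used for training.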

Each of the modeling cells after the split builds a model, trains it on the training set, and evaluates the built model on both:

- The training set.
- And the validation set.

In [None]:
# Candidate feature subset: every feature except f38, f52, f72 and f92.
# (Despite the name this is a list; the name is kept so existing references work.)
candidate_dict = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
                  'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20',
                  'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30',
                  'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f39', 'f40',
                  'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50',
                  'f51', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60',
                  'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70',
                  'f71', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80',
                  'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90',
                  'f91', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99']

# Target vector: the `target` column (the last column of train_df).
y_train = train_df['target']

# Feature matrix: ALL feature columns.
# The previous version first selected `candidate_dict` and then immediately
# overwrote that selection with the full feature set, so the subset never took
# effect. The dead assignment is removed; the full set is kept on purpose because
# the test-set prediction at the end of the notebook feeds all 100 columns to the
# network. To model on the subset instead, select `candidate_dict` here AND on
# the test frame.
X_train = train_df.drop('target', axis=1)

# Hold out 10% of the rows for validation; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.1, random_state=1) 

## Logistic Regression with Default Hyperparameters in scikit-learn

In [None]:
# INSTANTIATING a logistic regression model
logreg = LogisticRegression(solver ='liblinear',
                            C = 1.0,
                            warm_start = True,
                            random_state = 1)

# FITTING the training data
logreg_fit = logreg.fit(X_train, y_train)


# PREDICTING on the validation set
# hard class labels -> accuracy and confusion matrix
logreg_pred = logreg_fit.predict(X_val)
# positive-class probabilities -> AUC. ROC AUC ranks continuous scores; feeding it
# hard 0/1 labels collapses the curve to a single threshold and misreports the AUC.
logreg_proba = logreg_fit.predict_proba(X_val)[:, 1]


# SCORING the results
# built-in round() works whether the scorer returns a NumPy scalar or a plain float
print('Training ACCURACY:', round(logreg_fit.score(X_train, y_train), 4))
print('Testing  ACCURACY:', round(logreg_fit.score(X_val, y_val), 4))


# SCORING with AUC
print('AUC Score        :', round(roc_auc_score(y_true  = y_val,
                                                y_score = logreg_proba), 4))

# computing the confusion matrix once and unpacking it (order: tn, fp, fn, tp)
logreg_cm = confusion_matrix(y_true = y_val, y_pred = logreg_pred)
logreg_tn, logreg_fp, logreg_fn, logreg_tp = logreg_cm.ravel()


# printing each result one-by-one
print(f"""
True Negatives : {logreg_tn}
False Positives: {logreg_fp}
False Negatives: {logreg_fn}
True Positives : {logreg_tp}
""")

# printing the full confusion matrix
print(logreg_cm)

## Classification Trees (CART Models)

In [None]:
# INSTANTIATING a classification tree object
pruned_tree = DecisionTreeClassifier(max_depth = 5,
                                     min_samples_leaf = 17,
                                     criterion = 'entropy',
                                     random_state = 1)

# FITTING the training data
pruned_tree_fit  = pruned_tree.fit(X_train, y_train)


# PREDICTING on the validation set
# hard class labels -> accuracy and confusion matrix
pruned_tree_pred = pruned_tree_fit.predict(X_val)
# positive-class probabilities -> AUC. ROC AUC ranks continuous scores; feeding it
# hard 0/1 labels collapses the curve to a single threshold and misreports the AUC.
pruned_tree_proba = pruned_tree_fit.predict_proba(X_val)[:, 1]


# SCORING the model
# built-in round() works whether the scorer returns a NumPy scalar or a plain float
print('Training ACCURACY:', round(pruned_tree_fit.score(X_train, y_train), 4))
print('Testing  ACCURACY:', round(pruned_tree_fit.score(X_val, y_val), 4))
print('AUC Score        :', round(roc_auc_score(y_true  = y_val,
                                                y_score = pruned_tree_proba), 4))

# computing the confusion matrix once and unpacking it (order: tn, fp, fn, tp)
pruned_tree_cm = confusion_matrix(y_true = y_val, y_pred = pruned_tree_pred)
pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp = pruned_tree_cm.ravel()


# printing each result one-by-one
print(f"""
True Negatives : {pruned_tree_tn}
False Positives: {pruned_tree_fp}
False Negatives: {pruned_tree_fn}
True Positives : {pruned_tree_tp}
""")

# printing the full confusion matrix
print(pruned_tree_cm)

## Random Forest

In [None]:
# INSTANTIATING a random forest model with hyperparameters tuned values
# max_features: 'sqrt' is what 'auto' meant for classifiers; 'auto' is deprecated
# and rejected by current scikit-learn releases, 'sqrt' is accepted by all of them.
random_forest = RandomForestClassifier(n_estimators     = 350,
                                       criterion        = 'gini',
                                       max_depth        = 7,
                                       max_features     = 'sqrt',
                                       min_samples_leaf = 1,
                                       bootstrap        = True,
                                       warm_start       = True,
                                       random_state     = 1)

# FITTING the training data
random_forest_fit = random_forest.fit(X_train, y_train)


# PREDICTING on the validation set
# hard class labels -> accuracy and confusion matrix
random_forest_fit_pred = random_forest_fit.predict(X_val)
# positive-class probabilities -> AUC. ROC AUC ranks continuous scores; feeding it
# hard 0/1 labels collapses the curve to a single threshold and misreports the AUC.
random_forest_fit_proba = random_forest_fit.predict_proba(X_val)[:, 1]


# SCORING the results
# built-in round() works whether the scorer returns a NumPy scalar or a plain float
print('Training ACCURACY:', round(random_forest_fit.score(X_train, y_train), 4))
print('Testing  ACCURACY:', round(random_forest_fit.score(X_val, y_val), 4))


# AUC score
print('AUC Score        :', round(roc_auc_score(y_true  = y_val,
                                                y_score = random_forest_fit_proba), 4))

# computing the confusion matrix once and unpacking it (order: tn, fp, fn, tp)
rf_cm = confusion_matrix(y_true = y_val, y_pred = random_forest_fit_pred)
rf_tn, rf_fp, rf_fn, rf_tp = rf_cm.ravel()


# printing each result one-by-one
print(f"""
True Negatives : {rf_tn}
False Positives: {rf_fp}
False Negatives: {rf_fn}
True Positives : {rf_tp}
""")

# printing the full confusion matrix
print(rf_cm)

## Gradient Boosted Machines

In [None]:
# INSTANTIATING a Gradient Boosted Machines
# `loss` is left at its default on purpose: the default is the binomial log-loss
# under both its old name ('deviance') and its new name ('log_loss'), whereas
# passing 'deviance' explicitly is rejected by current scikit-learn releases.
gbm = GradientBoostingClassifier(learning_rate = 0.1,
                                 n_estimators  = 100,
                                 criterion     = 'friedman_mse',
                                 max_depth     = 2,
                                 warm_start    = False,
                                 random_state  = 1)

# FITTING the training data
gbm_fit = gbm.fit(X_train, y_train)


# PREDICTING on the validation set
# hard class labels -> accuracy and confusion matrix
gbm_pred = gbm_fit.predict(X_val)
# positive-class probabilities -> AUC. ROC AUC ranks continuous scores; feeding it
# hard 0/1 labels collapses the curve to a single threshold and misreports the AUC.
gbm_proba = gbm_fit.predict_proba(X_val)[:, 1]


# SCORING the results
# built-in round() works whether the scorer returns a NumPy scalar or a plain float
print('Training ACCURACY:', round(gbm_fit.score(X_train, y_train), 4))
print('Testing ACCURACY :', round(gbm_fit.score(X_val, y_val), 4))
print('AUC Score        :', round(roc_auc_score(y_true  = y_val,
                                                y_score = gbm_proba), 4))

# computing the confusion matrix once and unpacking it (order: tn, fp, fn, tp)
gbm_cm = confusion_matrix(y_true = y_val, y_pred = gbm_pred)
gbm_tn, gbm_fp, gbm_fn, gbm_tp = gbm_cm.ravel()


# printing each result one-by-one
print(f"""
True Negatives : {gbm_tn}
False Positives: {gbm_fp}
False Negatives: {gbm_fn}
True Positives : {gbm_tp}
""")

# printing the full confusion matrix
print(gbm_cm)

## Binary Classification with Keras NN

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

In [None]:
# Construct the Sequential model: four ReLU hidden layers (128 -> 64 -> 32 -> 16),
# each of the last three followed by 20% dropout, and a single sigmoid output unit.

n_inputs = X_train.shape[1]  # one input per feature column

model = Sequential([
    # Hidden Layer 1 - receives the input from the Input Layer
    Dense(128, activation="relu", input_shape = (n_inputs,)),

    # Hidden Layer 2
    Dense(64, activation="relu"),
    Dropout(0.2),

    # Hidden Layer 3
    Dense(32, activation="relu"),
    Dropout(0.2),

    # Hidden Layer 4
    Dense(16, activation="relu"),
    Dropout(0.2),

    # Output Layer
    Dense(1, activation="sigmoid"),
])

model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric
model.compile(
    optimizer = 'adam',
    loss      = "binary_crossentropy",
    metrics   = ['accuracy'],
)

In [None]:
# Train the network on the training split: 100 passes over the data in mini-batches of 64
model.fit(
    x          = X_train,
    y          = y_train,
    batch_size = 64,
    epochs     = 100,
)

In [None]:
# Score the trained network on the held-out validation split
validation_loss, validation_accuracy = model.evaluate(X_val, y_val, batch_size=32)

# Report both numbers rounded to three decimals
for label, value in (("Loss: ", validation_loss), ("Accuracy: ", validation_accuracy)):
    print(label, np.round(value, 3), sep="")

In [None]:
# Predict with the Keras NN model on the test set

# Scale the test features with the min/max learned from the TRAINING data.
# Re-fitting the scaler on the test set (as before) maps the two sets onto
# different scales, so the network would see inputs that do not match what it was
# trained on. MinMaxScaler's transform is `X * scale_ + min_`; the parameters are
# sliced to the test frame's width because the scaler may have been fit on a
# frame that also carried the trailing `target` column.
# NOTE: run this cell once per kernel session, after the training-data scaling
# cell - it rebinds `test_df` to the scaled frame, so re-running would scale twice.
n_features = test_df.shape[1]
test_scaled = test_df.to_numpy() * trans.scale_[:n_features] + trans.min_[:n_features]

# Rebuild a DataFrame with the positional feature labels f0 ... f{n-1}
# (the same labels the training frame was given).
test_df = pd.DataFrame(test_scaled, columns=[f'f{i}' for i in range(n_features)])

X_test = test_df

# Sigmoid outputs come back with shape (n_rows, 1); flatten to one probability per row
y_predict = model.predict(X_test)
y_predict = np.ravel(y_predict)

In [None]:
# Save prediction to submission.csv
# Only the `id` column is needed to label the predictions, so read just that
# column instead of parsing the whole test file a second time.
test_df_temp = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv', usecols=['id'])

# Column order follows the dict order: id first, then the predicted probability
output = pd.DataFrame({'id': test_df_temp.id, 'target': y_predict})

output.to_csv('submission.csv', index = False)