In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import

In [None]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble        import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.svm             import LinearSVC
from sklearn.naive_bayes     import GaussianNB
from sklearn.linear_model    import LogisticRegression
from sklearn.neural_network  import MLPClassifier
from sklearn.tree            import DecisionTreeClassifier
from sklearn.svm             import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model    import LogisticRegression
from lightgbm                import LGBMClassifier
from sklearn.model_selection import KFold,StratifiedKFold

from sklearn.model_selection import train_test_split
from sklearn.calibration     import calibration_curve
from sklearn.pipeline        import make_pipeline
from sklearn.metrics         import plot_confusion_matrix
from sklearn.metrics         import classification_report, confusion_matrix, accuracy_score
%matplotlib inline

# Setting parameters

In [None]:
# Number of stratified folds each classifier is scored on.
N_SPLITS = 2
# Share of the training rows kept for the run (speed knob during dev & test).
# It is compared against uniform draws in [0, 1), so 1 keeps every row.
DATA_PERC = 1
# Seed for NumPy's global random generator.
SEED = 1

# Read data file

In [None]:
# Load the competition's train and test tables (train first, then test).
all_train, all_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-mar-2021/train.csv",
        "../input/tabular-playground-series-mar-2021/test.csv",
    )
)

# Review data

## Review NaN impact

In [None]:
print("+-------------------- train ----------------------------+ +-------------------- test ----------------------------+")


def rf(df):
    """Return one formatted summary line per column of ``df``.

    Each line shows the column name, its non-null count, its NaN count
    and the name of its dtype.
    """
    return [
        "{:>10s} - Count: {:4d} - Nan: {:4d} - type:{:10s}".format(
            col, df[col].count(), df[col].isna().sum(), df.dtypes[col].name
        )
        for col in df.columns
    ]


print("\n")
# Show the train summary next to the test summary, pairing columns by position.
# zip stops at the shorter list, so a trailing column present in only one frame is not shown.
for tr, te in zip(rf(all_train), rf(all_test)):
    print(tr + te)


## Training data

In [None]:
# Summary statistics, one row per described column; the 'mean'..'max' columns
# are shaded with a colour gradient on a fixed 0-1 scale.
train_stats = all_train.describe().T
train_stats.style.background_gradient(
    cmap='YlOrRd', vmin=0, vmax=1, subset=pd.IndexSlice[:, 'mean':'max']
)

## Test data

In [None]:
# Summary statistics, one row per described column; the 'mean'..'max' columns
# are shaded with a colour gradient on a fixed 0-1 scale.
test_stats = all_test.describe().T
test_stats.style.background_gradient(
    cmap='YlOrRd', vmin=0, vmax=1, subset=pd.IndexSlice[:, 'mean':'max']
)

## Translate categorical feature
Encode across the whole data set (train and test combined) to ensure the encoded features are consistent.

In [None]:
def build_feature(no_hot_array, hot_array, dataset):
    """Build a feature frame from pass-through columns and one-hot encoded columns.

    Parameters
    ----------
    no_hot_array : list of column names copied from ``dataset`` unchanged.
    hot_array : list of column names to one-hot encode with ``pd.get_dummies``;
        each generated column is prefixed with its source column name.
    dataset : DataFrame holding all of the columns named above.

    Returns
    -------
    DataFrame with the pass-through columns first, followed by the dummy
    columns of each encoded column in the order given by ``hot_array``.
    """
    # Gather every piece first and concatenate once, instead of re-copying
    # the growing frame for each encoded column.
    pieces = [dataset[no_hot_array]]
    pieces.extend(pd.get_dummies(dataset[a_hot], prefix=a_hot) for a_hot in hot_array)
    return pd.concat(pieces, axis=1)
    
# Feature columns are picked by name: 'cont' marks continuous ones, 'cat' categorical ones.
cont_columns = [name for name in all_train.columns if 'cont' in name]
cat_columns = [name for name in all_train.columns if 'cat' in name]

# Stack train on top of test so both are encoded together, then cut the
# encoded frame back into its train and test parts by row position.
n_train_rows = all_train.shape[0]
all_data = pd.concat([all_train, all_test]).reset_index(drop=True)
all_data_encoded = build_feature(cont_columns, cat_columns, all_data)
all_train_encoded = all_data_encoded.iloc[:n_train_rows]
all_test_encoded = all_data_encoded.iloc[n_train_rows:]
print("all_data.shape          : ", all_data.shape, " - all_data_encoded.shape : ", all_data_encoded.shape, " ")
print("all_train_encoded.shape : ", all_train_encoded.shape, " - all_test_encoded.shape : ", all_test_encoded.shape, " ")


## Select a subset of the training data set for a shorter runtime during notebook development

In [None]:
# Seed the global generator, draw one uniform number per training row and
# keep the rows whose draw does not exceed DATA_PERC.
np.random.seed(SEED)
row_draws = np.random.rand(len(all_train_encoded))
mask = row_draws <= DATA_PERC
train_encoded = all_train_encoded[mask]

# Review each classifier

In [None]:
# Candidate models to compare. Entries left as comments are alternatives that
# are currently switched off.
classifiers = [
    LGBMClassifier(),
    # KNeighborsClassifier(4),
    # SVC(probability=True),
    # DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=20, random_state=0),
    # AdaBoostClassifier(),
    # GradientBoostingClassifier(),
    GaussianNB(),
    MLPClassifier(
        solver='adam',
        alpha=0.314,
        random_state=1,
        max_iter=4000,
        early_stopping=True,
        hidden_layer_sizes=[40, 40, 40],
    ),
    LinearSVC(C=1.0),
    LinearDiscriminantAnalysis(),
    # QuadraticDiscriminantAnalysis(),
    # LogisticRegression(max_iter=4000)
]

# Feature matrix and labels for the selected training rows.
# The explicit float cast keeps X numeric whatever dtype the one-hot columns have
# (NOTE(review): newer pandas emits boolean dummies, which would otherwise give an object array - confirm).
X = train_encoded.to_numpy(dtype=float)
y = all_train.loc[mask, "target"].values
log_res = []
# The splitter is not shuffled, so one instance yields the same folds for every classifier.
skf = StratifiedKFold(n_splits=N_SPLITS)
for clf in classifiers:
    print("\n----------------------------------------------------------")
    print("Classifier: {:20s} - # fold: {:2d}".format(clf.__class__.__name__, N_SPLITS))
    accuracy = []
    fold_times = []
    for fold_no, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
        tic = time.perf_counter()
        print("   fold: {:2d} -".format(fold_no), end=" ")
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        toc = time.perf_counter()
        print("accuracy: {:2.2f} time: {:0.1f} sec".format(100 * acc, toc - tic))
        accuracy.append(acc)
        fold_times.append(toc - tic)
    # One row per classifier: the estimator (as fitted on the last fold), its mean
    # accuracy and mean fit+predict time over all folds, and the mean of the
    # target in the last fold's training part.
    log_entry = [clf, np.mean(accuracy), np.mean(fold_times), y_train.mean()]
    log_res.append(log_entry)

log_cols = ["Classifier", "Accuracy", "Time", "target distrib. %"]
log = pd.DataFrame(log_res, columns=log_cols)
print(log.sort_values(['Accuracy'], ascending=False))

# Get best classifier
# sort_values returns a new frame and leaves `log` in insertion order, so row
# label 0 is just the first classifier in the list. Look the winner up by its
# accuracy instead.
best_clf = log.loc[log["Accuracy"].idxmax(), "Classifier"]
print("\nBest classifier :", best_clf)


# Prepare submission
Use the best performing classifier

In [None]:
if DATA_PERC == 1:
    # Train best_clf on the entire train set
    best_clf.fit(X, y)
    # Predict from plain float arrays, the same kind of input the classifier
    # is fitted on (X is a NumPy array), rather than from DataFrames.
    y_train_pred = best_clf.predict(all_train_encoded.to_numpy(dtype=float))
    # accuracy_score takes the true labels first, then the predictions.
    acc = accuracy_score(y, y_train_pred)
    print(" Accuracy against the all train set : {:.4f}".format(acc))

    # Test prediction
    y_test_pred = best_clf.predict(all_test_encoded.to_numpy(dtype=float))
    print(log)

    # Saving the file
    sub = pd.DataFrame({'id': all_test['id'].values, 'target': y_test_pred})
    sub.to_csv('sub.csv', index=False)
else:
    # A submission is only written when the full training set was used.
    print("Set DATA_PERC to 1 to save submission")