In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

%matplotlib inline

In [None]:
# Load the competition's train and test CSVs from the Kaggle input mount.
df_train = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/train.csv')
df_test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/test.csv')

# Name of the label column and the training labels themselves.
target = 'target'
y_train = df_train[target]
# Test ids are kept aside so they can be paired with predictions in the submission files.
df_test_ids = df_test['id']

# Data Exploration

In [None]:
# Mean of the training labels (for a 0/1 label this is the share of class 1).
y_train.mean()

In [None]:
# (rows, columns) of the train and test frames.
print(df_train.shape)
print(df_test.shape)

In [None]:
# Display the test-set id column that will be used in the submissions.
df_test_ids

In [None]:
# Stack the train features (label column removed) on top of the test rows with a fresh 0..n-1 index,
# so rows [0, len(df_train)) are train and the remainder are test.
df = pd.concat([df_train.drop(columns=target), df_test], ignore_index = True)

In [None]:
# Column dtypes, non-null counts and memory usage of the combined frame.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Summary (count / unique / top / freq) of the object-dtype columns.
df.describe(include=['O'])

In [None]:
# Distribution of the target column.
# `sns.distplot` is deprecated; `histplot` on a density scale with a KDE overlay
# is its replacement and is already the histogram API used later in this notebook.
sns.histplot(df_train[target], kde=True, stat="density");

In [None]:
# Skewness and kurtosis of the target column.
target_skew = df_train[target].skew()
target_kurt = df_train[target].kurt()
print("Skewness: %f" % target_skew)
print("Kurtosis: %f" % target_kurt)

In [None]:
# Correlation matrix of the one-hot encoded training frame, restricted to its first 5000 rows.
# Approach taken from https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
df_dummy = pd.get_dummies(df_train).iloc[:5000]
corrmat = df_dummy.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [None]:
# Heatmap of the k numeric columns most correlated with the target.
k = 10 #number of variables for heatmap

# Only numeric columns go into .corr(): the training frame also holds string
# categoricals, which recent pandas versions refuse to correlate instead of
# silently dropping. The sample is seeded so the selected columns are repeatable.
numeric_train = df_train.select_dtypes(include='number')
corrmat = numeric_train.sample(5000, random_state=0).corr()

f, ax = plt.subplots(figsize=(12, 9))
cols = corrmat.nlargest(k, target)[target].index
# Column ranking comes from the sample; the plotted coefficients use all training rows.
cm = np.corrcoef(df_train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values, ax=ax)
plt.show()

Relation between the continuous variables and the target:

In [None]:
def correlation_heatmap(df, k):
    """Plot an annotated correlation heatmap of the columns most correlated with the target.

    Ranks the columns of ``df.corr()`` by their correlation with the notebook-level
    ``target`` column, keeps the ``k`` largest, and draws the correlation matrix of
    those columns on a new 14x12 figure.

    Parameters
    ----------
    df : DataFrame that contains the ``target`` column.
    k : number of columns to keep.
    """
    top_cols = df.corr().nlargest(k, target)[target].index
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    _fig, axis = plt.subplots(figsize=(14, 12))
    sns.heatmap(
        df[top_cols].corr(),
        cmap=palette,
        square=True,
        cbar_kws={'shrink': .9},
        ax=axis,
        annot=True,
        linewidths=0.1,
        vmax=1.0,
        linecolor='white',
        annot_kws={'fontsize': 12},
    )

    plt.title('Pearson Correlation of Features', y=1.05, size=15)

# One-hot encode the training frame and plot the 7 strongest columns using its first 100 rows.
correlation_heatmap(pd.get_dummies(df_train)[:100], 7)

## Continuous Variables

In [None]:
# Bivariate view: scatter of one continuous feature against the target.
var = 'cont8'
data = df_train[[target, var]]
data.plot.scatter(x=var, y=target);

Relation between categorical variable and the target:

## Categorical Variables

In [None]:
#https://www.kaggle.com/gaetanlopez/tps-complete-eda-single-lgb-tuning-strategy
# One grid row per numeric feature (id and target excluded):
# histogram | box plot | violin plot split by target.
cols = df_train.select_dtypes(include='number').drop(columns=['id',target]).columns

fig = plt.figure(figsize=(30,50))
for row, cont in enumerate(cols):
    # Subplot indices are 1-based and run row-major across the 3 columns.
    first = 3 * row + 1

    plt.subplot(len(cols), 3, first)
    sns.histplot(df_train[cont])

    plt.subplot(len(cols), 3, first + 1)
    plt.boxplot(x = df_train[cont])

    plt.subplot(len(cols), 3, first + 2)
    sns.violinplot(data = df_train, x = target, y = cont)

# Lay the grid out once, after every axes exists, rather than on each iteration.
plt.tight_layout()
plt.show()

In [None]:
# Box plot of the target grouped by the levels of one categorical feature.
var = 'cat16'
data = df_train[[target, var]]
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=var, y=target, data=data, ax=ax)  # note: `fig` holds the object returned by sns.boxplot

In [None]:
# Count of each target value within every level of the chosen categorical feature.
var = 'cat5'
df_train.groupby(var)[target].value_counts()

# Classifier Test
Borrowed code from [this notebook](https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy).

In [None]:
# Candidate estimators, all with default hyper-parameters unless shown otherwise.
# MLA_test iterates over this list, and a later cell selects one entry by position,
# so the order of the entries matters.
MLA = [
    # Ensemble methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian processes
    # Consumes a lot of RAM and may cause the notebook to fail because of the 16 GB limit.
    # https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessClassifier.html
    # "Note that this class thus does not implement a true multi-class Laplace approximation."
    gaussian_process.GaussianProcessClassifier(),
    
    # Generalized linear models
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    # Nearest neighbours
    neighbors.KNeighborsClassifier(),
    
    # Support vector machines
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),
    
    # Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    
    # Discriminant analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    
    # XGBoost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier()    
    ]

In [None]:
# One-hot encode the combined train+test frame and drop the id column.
# Row order is unchanged, so the first len(df_train) rows are still the training rows.
df_dummies = pd.get_dummies(df).drop(columns='id')

In [None]:
def MLA_test(X_initial, y_initial, f, t, verbose=0):
    """Cross-validate every estimator in the notebook-level ``MLA`` list on rows ``[f:t]``.

    Each estimator is scored with ``cross_validate`` on 10 shuffled splits
    (60% train / 30% test, 10% intentionally left out, ``random_state=0``).

    Parameters
    ----------
    X_initial : feature table, sliced as ``X_initial[f:t]``.
    y_initial : labels, sliced as ``y_initial[f:t]``.
    f, t : slice bounds applied to both inputs.
    verbose : when equal to 1, print one progress line per estimator.

    Returns
    -------
    pandas.DataFrame
        One row per estimator with its name, parameters, mean train/test score,
        3x the standard deviation of the test score and the mean fit time,
        sorted by mean test score in descending order.
    """
    X = X_initial[f:t]
    y = y_initial[f:t]

    # Splitter docs: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
    cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 )

    MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']

    # Collect one record per estimator and build the table once at the end,
    # rather than growing a DataFrame cell by cell (slow, and leaves object-dtype columns).
    records = []
    for position, alg in enumerate(MLA, start=1):
        MLA_name = alg.__class__.__name__

        # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
        cv_results = model_selection.cross_validate(alg, X, y, cv  = cv_split, n_jobs=-1, verbose=0, return_train_score=True)

        training_score = cv_results['train_score'].mean()
        test_score = cv_results['test_score'].mean()
        if verbose == 1:
            print('{}/{}'.format(position, len(MLA)), MLA_name, " - ", training_score, test_score)

        records.append({
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train Accuracy Mean': training_score,
            'MLA Test Accuracy Mean': test_score,
            # For an unbiased random sample, +/-3 standard deviations around the mean
            # should cover about 99.7% of the subsets: a worst-case spread.
            'MLA Test Accuracy 3*STD': cv_results['test_score'].std()*3,
            'MLA Time': cv_results['fit_time'].mean(),
        })

    MLA_compare = pd.DataFrame(records, columns = MLA_columns)

    # Best mean test score first; the original row positions are kept as the index.
    return MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False)

In [None]:
# Benchmark every estimator in MLA on rows [0:5000], printing one progress line per estimator,
# then display the resulting comparison table.
MLA_compare = MLA_test(df_dummies, y_train, 0, 5000, verbose=1)
MLA_compare

In [None]:
# First row of the comparison table as a raw array of its column values.
MLA_compare.values[0]

In [None]:
# Optional one-off cross-validation of a single estimator; skipped while `model` is None.
# NOTE(review): `X` and `y` are not assigned anywhere above this cell in the visible notebook,
# so they must be defined before this branch is enabled on a fresh kernel.
model = None
if model is not None:  # identity check: never routes through a custom __ne__
    # Run the model 10x with a 60/30 split, intentionally leaving out 10%.
    cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 )
    cv_results = model_selection.cross_validate(model, X, y, cv  = cv_split, n_jobs=-1, verbose=0, return_train_score=True)

    training_score = cv_results['train_score'].mean()
    test_score = cv_results['test_score'].mean()
    print(model.__class__.__name__, " - ", training_score, test_score)

In [None]:
# Fit one estimator from the MLA list on rows [f:t] of the encoded features.
f = 0
t = 10000
model = MLA[4]  # position 4 in the MLA list is ensemble.RandomForestClassifier()
model.fit(df_dummies[f:t], y_train[f:t])

In [None]:
# Train rows were stacked first in df_dummies, so everything from len(df_train) onward is test data.
X_test = df_dummies[df_train.shape[0]:]
# NOTE(review): predict() looks like it yields class labels rather than scores; if the competition
# is scored on ROC AUC, predict_proba(X_test)[:, 1] would presumably rank better — confirm the metric.
results = model.predict(X_test)
df_results = pd.DataFrame({'id':df_test_ids, 'target':results})
df_results

In [None]:
# Write the classifier predictions to the working directory, without the DataFrame index.
df_results.to_csv('classifierSubmission.csv', index=False)

Even though the classifiers achieved up to 84% accuracy on the evaluation data, this success didn't translate to the test data. The highest submission score was 76%.

# Post-training Evaluation

Borrowed code from [this notebook](https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets).

In [None]:
# Confusion matrix of the current `model` on training rows [100000:150000].
f = 100000
t = 150000

# labels=[0, 1] keeps the matrix 2x2 (row/column 0 = class 0) even if a class is absent from the slice.
conf_mat = confusion_matrix(y_true=y_train[f:t], y_pred=model.predict(df_dummies[f:t]), labels=[0, 1])
print('Confusion matrix:\n', conf_mat)

labels = ['Class 0', 'Class 1']
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
fig.colorbar(cax)
# Pin one tick to each cell centre before labelling. Prepending '' to the labels only
# lines up when the default locator happens to put its first tick at -1.
tick_positions = list(range(len(labels)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('Expected')
plt.show()

We can see that the model misclassifies about one third of the class 1 samples. Given this imbalance, I decided to learn about resampling.

## Resampling

"A widely adopted technique for dealing with highly unbalanced datasets is called resampling. It consists of removing samples from the majority class (under-sampling) and / or adding more examples from the minority class (over-sampling)"

In [None]:
# Class count
# value_counts() orders its result by frequency, so unpacking it positionally would give
# "largest, second largest" rather than "class 0, class 1". Look each class up by label.
class_counts = df_train[target].value_counts()
count_class_0 = class_counts.get(0, 0)
count_class_1 = class_counts.get(1, 0)

# Divide by class
df_class_0 = df_train[df_train['target'] == 0]
df_class_1 = df_train[df_train['target'] == 1]

## Random under-sampling


In [None]:
# Draw as many class-0 rows as there are class-1 rows, then recombine with class 1.
# The fixed seed makes the draw identical on every run of the notebook.
df_class_0_under = df_class_0.sample(count_class_1, random_state=0)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under.target.value_counts())

df_test_under.target.value_counts().plot(kind='bar', title='Count (target)');

## Random over-sampling

In [None]:
# Draw class-1 rows with replacement until they match the class-0 count, then recombine with class 0.
# The fixed seed makes the draw identical on every run of the notebook.
df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=0)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over.target.value_counts())

df_test_over.target.value_counts().plot(kind='bar', title='Count (target)');

When I used resampling, accuracy dropped to 70%.

# TF Model

In [None]:
import tensorflow as tf

In [None]:
# Rows [0:300000] of the encoded features and the matching labels, used to train the network.
f = 0
t = 300000
X = df_dummies[f:t]
y = y_train[f:t]

In [None]:
# Small fully-connected network: one hidden ReLU layer of 20 units and a sigmoid output unit.
model = tf.keras.Sequential()
# The input width is read from the feature table instead of being hard-coded, so the model
# keeps working if the one-hot encoding produces a different number of columns.
model.add(tf.keras.layers.Dense(20, activation='relu', input_shape=(X.shape[1],)))
#model.add(tf.keras.layers.Dense(20, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
# NOTE(review): 'mse' is kept as in the original run; 'binary_crossentropy' is the more usual
# loss for a sigmoid output on 0/1 labels — worth trying.
model.compile(optimizer='sgd', loss='mse', metrics=['accuracy'])
# Train for 10 epochs with 30% of the rows held out for validation.
history = model.fit(X, y, epochs=10, validation_split=0.3, shuffle=True)
#history = model.fit(pd.get_dummies(df_test_under).drop(columns=['target','id']), df_test_under['target'], epochs=10, validation_split=0.3, shuffle=True)

In [None]:
# Training vs. validation accuracy per epoch.
for series in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[series])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
#Sanity check: re-score the network on rows [100000:150000].
# NOTE(review): these rows lie inside the [0:300000] range that was passed to model.fit,
# so this is presumably not a held-out score — confirm before quoting it.
f = 100000
t = 150000
model.evaluate(df_dummies[f:t], y_train[f:t])

In [None]:
# Predict on the test rows, i.e. everything after the first len(df_train) rows of df_dummies.
y_pred = model.predict(df_dummies[df_train.shape[0]:])

In [None]:
# Reshape the predictions to a 1-D array (one value per row) and pair them with the test ids.
df_results = pd.DataFrame({'id':df_test_ids, 'target':y_pred.reshape(y_pred.shape[0])})

In [None]:
# Preview the first rows of the submission frame.
df_results.head()

In [None]:
# Write the network's predictions to the working directory, without the DataFrame index.
df_results.to_csv('sequentialNNSubmission.csv', index=False)

Submission of TF model achieved 86.915% accuracy.

# LGBM + Optuna

- https://www.kaggle.com/dmitryuarov/catboost-vs-xgb-vs-lgbm-tps-mar-21
- https://www.kaggle.com/calebyenusah/lgbm-and-optuna-tps-march-2021


In [None]:
import optuna
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
# Rows [0:300000] of the encoded features and the matching labels, used for tuning and K-fold training.
f = 0
t = 300000
X = df_dummies[f:t]
y = y_train[f:t]

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective: validation ROC AUC of an LGBMClassifier for one sampled configuration.

    Parameters
    ----------
    trial : Optuna trial used to sample the hyper-parameters.
    data : feature table; defaults to the notebook-level ``X`` bound when this cell runs.
    target : labels aligned with ``data``; defaults to the notebook-level ``y``.

    Returns
    -------
    float
        ROC AUC on a fixed 20% hold-out split of ``data`` / ``target``.
    """
    # Split the arguments rather than the globals, so a caller passing its own
    # data/target actually gets them used.
    X_trn, X_val, y_trn, y_val = train_test_split(data, target, test_size = 0.2, random_state = 0)

    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 11, 333),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.05, 0.005, 0.1]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 50, 3000),
        'random_state': 42,
        'boosting_type': 'gbdt',
        'metric': 'AUC',
        #'device': 'gpu'
    }

    model = LGBMClassifier(**params)
    # NOTE(review): newer LightGBM releases appear to expect early stopping and logging to be
    # configured through `callbacks` instead of these fit() keywords — confirm against the installed version.
    model.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 222, verbose = False)

    # Score the positive-class probability on the hold-out split.
    val_scores = model.predict_proba(X_val)[:,1]
    return roc_auc_score(y_val, val_scores)

In [None]:
# Run 50 trials that maximise the value returned by `objective`, then report the best trial.
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

In [None]:
# Objective value of each trial over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Best sampled hyper-parameters plus the fixed settings that `objective` also used.
paramsLGBM = {
    **study.best_trial.params,
    'boosting_type': 'gbdt',
    'metric': 'AUC',
    'random_state': 42,
}

In [None]:
from sklearn.model_selection import KFold

# 10-fold training with the tuned parameters; the test prediction is the mean
# of the positive-class probabilities produced by the ten fold models.
folds = KFold(n_splits = 10, shuffle = True, random_state = 42)

# Derive the test rows here instead of relying on a variable left over from a much
# earlier cell: the test rows follow the training rows in df_dummies.
X_test = df_dummies[df_train.shape[0]:]
predictions = np.zeros(len(X_test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):

    # Fold-local names: binding `y_train` here would overwrite the notebook-level
    # training labels and break earlier cells when they are re-run.
    X_trn, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_trn, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = LGBMClassifier(**paramsLGBM)

    model.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], eval_metric = 'auc', verbose = False, early_stopping_rounds = 222)

    # Each fold contributes 1/n_splits of the final averaged probability.
    predictions += model.predict_proba(X_test)[:,1] / folds.n_splits

In [None]:
# Pair the fold-averaged probabilities with the test ids and write the submission file.
submission = pd.DataFrame({'id': df_test_ids, 'target': predictions})
submission.to_csv('submissionLGBM.csv', index = False)

Achieved 89.249% with the LGBM + Optuna.

# To Do and Resources:

- [X] Check how much time XGBClassifier takes. It took unusually long, 84 seconds. Next highest was 4.1 seconds. Remove it if necessary
- [X] Find out the score over time in best performing MLAs. See if there is overfitting.
- [ ] https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
- [ ] https://towardsdatascience.com/optimizing-hyperparameters-in-random-forest-classification-ec7741f9d3f6
- [ ] https://www.datacamp.com/community/tutorials/random-forests-classifier-python
- [ ] Check if the nxn variable relation graph can be implemented (In notebook 4)

### Statistics:

- [ ] https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/41037
- [ ] https://scikit-learn.org/stable/modules/cross_validation.html
- [ ] https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
- [ ] https://en.wikipedia.org/wiki/Cross-validation_(statistics)

### Imbalanced Data (!!!!!)

- [X] https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets

### EDA Notebooks:

- [X] https://www.kaggle.com/sudalairajkumar/winning-solutions-of-kaggle-competitions
- [X] https://www.kaggle.com/kanncaa1/data-sciencetutorial-for-beginners
- [X] https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy
- [X] https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
- [X] https://www.kaggle.com/calebyenusah/lgbm-and-optuna-tps-march-2021

### LGBM Notebooks:
- [ ] https://www.kaggle.com/gaetanlopez/tps-complete-eda-single-lgb-tuning-strategy
- [ ] https://www.kaggle.com/rmiperrier/lgbm-optuna
- [ ] https://www.kaggle.com/ekozyreff/tps-2021-03-lightgbm-optuna-10-folds