# CIE 632: Machine Learning Fundamentals - Spring 2021
# Project - Due Date: June 26, 2021
# Name: Muhammad Hamdy AlAref

## Exploring the data

In [None]:
from sklearn import *
import pandas as pd

# Read the competition's training and test tables (train first, then test).
data_train, data_test = map(pd.read_csv, (
    '../input/tabular-playground-series-jun-2021/train.csv',
    '../input/tabular-playground-series-jun-2021/test.csv',
))

In [None]:
# Preview the first rows of the training frame.
data_train.head()

In [None]:
# Summary statistics for every column, regardless of dtype (include='all').
data_train.describe(include='all')

In [None]:
# Positional split of the frames into NumPy arrays: the first column is
# skipped in both frames, and the last training column becomes y_train.
X_train, y_train = data_train.iloc[:, 1:-1].to_numpy(), data_train.iloc[:, -1].to_numpy()
X_test = data_test.iloc[:, 1:].to_numpy()

# Sanity check: one shape per line for the three arrays just built.
print(*(arr.shape for arr in (X_train, y_train, X_test)), sep='\n')

This is a classification problem with 200K samples, i.e. $n = 200{,}000$, each having 75 features (from `feature_0` to `feature_74`), i.e. $p = 75$.
There are $9$ possible classes for each observation.

In the following, models are scored with scikit-learn's `neg_log_loss` (the negated multi-class logarithmic loss, so a higher score is better), because the competition evaluates submissions using multi-class logarithmic loss.

## Standardizing the features

In [None]:
# The scaler is fitted on the training matrix only; the statistics it learns
# there are then applied unchanged to the test matrix.
# NOTE: X_train / X_test are overwritten, so re-running this cell alone
# would scale already-scaled data.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Streamlining models evaluations

In [None]:
def choose_model(models, X, y, verbose=False, scoring='neg_log_loss'):
    """Grid-search every candidate model and return the best one.

    Parameters
    ----------
    models : iterable of dict
        Each entry provides ``'name'`` (label used in the verbose output),
        ``'clf'`` (the estimator handed to ``GridSearchCV``) and ``'params'``
        (its parameter grid; may be empty).
    X, y
        Training data forwarded to ``GridSearchCV.fit``.
    verbose : bool, default False
        When true, print each candidate's best score (and its best
        parameters when the grid is not empty).
    scoring : str, default 'neg_log_loss'
        Scoring passed to ``GridSearchCV``; larger values are treated as better.

    Returns
    -------
    tuple
        ``(best_estimator, best_score)`` across all candidates, or
        ``(None, None)`` when ``models`` is empty.
    """
    import math

    best_clf, best_score = None, None
    for model in models:
        search = model_selection.GridSearchCV(
            model['clf'], model['params'], scoring=scoring, n_jobs=-1
        ).fit(X, y)
        score = search.best_score_

        # A NaN score compares False against everything, so a plain ``<``
        # would let a NaN-scored first candidate block every later model.
        # Treat NaN as worse than any real score; a NaN-scored model is only
        # kept as a last-resort fallback when nothing else has been seen.
        if best_score is None:
            is_better = True
        elif math.isnan(best_score):
            is_better = not math.isnan(score)
        else:
            is_better = score > best_score
        if is_better:
            best_score, best_clf = score, search.best_estimator_

        if verbose:
            print(f"{model['name']} got score {search.best_score_}" + (f" with parameters {search.best_params_}" if model['params'] else ""))
    return best_clf, best_score

## Trying different models with all the features

In [None]:
# Candidate classifiers for the full-feature search. Each entry holds a
# display name, an unfitted estimator and the parameter grid given to the
# grid search (an empty grid leaves the estimator at its constructor settings).
models = (
    dict(name='Gaussian Naive Bayes', clf=naive_bayes.GaussianNB(), params={}),
    # Three logistic-regression variants that differ only in the penalty;
    # all of them use the saga solver.
    dict(
        name='Logistic Regression',
        clf=linear_model.LogisticRegression(),
        params={'penalty': ('none',), 'solver': ('saga',)},
    ),
    dict(
        name='Lasso',
        clf=linear_model.LogisticRegression(),
        params={'penalty': ('l1',), 'solver': ('saga',)},
    ),
    dict(
        name='Ridge',
        clf=linear_model.LogisticRegression(),
        params={'penalty': ('l2',), 'solver': ('saga',)},
    ),
    dict(
        name='Linear Discriminant Analysis',
        clf=discriminant_analysis.LinearDiscriminantAnalysis(),
        params={},
    ),
    dict(
        name='Quadratic Discriminant Analysis',
        clf=discriminant_analysis.QuadraticDiscriminantAnalysis(),
        params={},
    ),
    # Disabled -- takes forever!
    # The algorithm is pinned to prevent brute force from exploding; BallTree
    # is selected as it is considered better than KDTree in high dimensions.
    # dict(
    #     name='K Nearest Neighbors',
    #     clf=neighbors.KNeighborsClassifier(),
    #     params={'algorithm': ('ball_tree',), 'n_neighbors': range(1, 10, 2)},
    # ),
    dict(
        name='Decision Tree',
        clf=tree.DecisionTreeClassifier(),
        params={'max_depth': range(10, 101, 10)},
    ),
    dict(
        name='Random Forests',
        clf=ensemble.RandomForestClassifier(),
        params={'max_depth': range(10, 51, 10)},
    ),
    dict(name='AdaBoost', clf=ensemble.AdaBoostClassifier(), params={}),
    # Disabled -- takes forever and ever!!! I guess SVM's bad reputation of
    # bad scaling is well-earned!
    # dict(
    #     name='Support Vector Machines',
    #     clf=svm.SVC(),
    #     params={'kernel': ('linear', 'poly', 'rbf'), 'C': range(1, 10, 2)},
    # ),
    dict(
        name='Multi-layer Perceptron',
        clf=neural_network.MLPClassifier(),
        params={'hidden_layer_sizes': range(10, 100, 10)},
    ),
)

In [None]:
# Grid-search every candidate on the full feature set and keep the overall
# winner; verbose=True prints each candidate's best score along the way.
best_clf, best_score = choose_model(models, X_train, y_train, verbose=True)

In [None]:
# Re-tuning random forests as its optimum lies on the border.
# The result is kept in its own variables and only promoted when it beats the
# winner of the full search above; assigning it straight to best_clf/best_score
# would discard a better model found earlier.
rf_clf, rf_score = choose_model(
    ({
        'name'   : 'Random Forests',
        'clf'    : ensemble.RandomForestClassifier(),
        'params' : {'max_depth' : range(1, 11)}
    },), X_train, y_train, verbose=True)

if best_score is None or rf_score > best_score:
    best_clf, best_score = rf_clf, rf_score

Evidently, Ridge (Logistic Regression with `l2` penalty), LDA, Random Forests and MLP (not-so-deep neural network) yield the best results with Random Forests being slightly better.

## Re-trying the best models in a lower-dimensional setting
*MLP is excluded as it is probably best if tuned separately with dedicated neural network libraries supporting deeper models and accelerators (GPUs and TPUs).*

In [None]:
# Short list of candidates that is re-evaluated on the PCA-reduced features.
# Same entry layout as used by choose_model: name, estimator, parameter grid.
best_models = (
    dict(
        name='Ridge',
        clf=linear_model.LogisticRegression(),
        params={'penalty': ('l2',), 'solver': ('saga',)},
    ),
    dict(
        name='Linear Discriminant Analysis',
        clf=discriminant_analysis.LinearDiscriminantAnalysis(),
        params={},
    ),
    dict(
        name='Random Forests',
        clf=ensemble.RandomForestClassifier(),
        params={'max_depth': range(1, 11)},
    ),
)

In [None]:
# Fit PCA on the (already standardized) training matrix. n_components='mle'
# asks scikit-learn to pick the number of components itself (Minka's MLE,
# per the scikit-learn PCA documentation).
pca = decomposition.PCA(n_components='mle').fit(X_train)

In [None]:
# Number of components the fitted PCA ended up keeping.
pca.n_components_

In [None]:
# Fraction of the total variance explained by each kept component.
pca.explained_variance_ratio_

In [None]:
# Total fraction of the variance retained by the kept components.
pca.explained_variance_ratio_.sum()

Looks like the variability in the data is spread across all the dimensions!

In [None]:
# Project the training matrix onto the fitted PCA components, then repeat the
# grid search for the short-listed models in that reduced space. The results
# go into separate *_reduced variables so the full-feature winner is kept.
X_reduced = pca.transform(X_train)
best_clf_reduced, best_score_reduced = choose_model(best_models, X_reduced, y_train, verbose=True)

## Writing predictions

In [None]:
# Pick whichever setting (all features vs. PCA-reduced) scored higher, so the
# notebook can run end-to-end as a script without manual intervention.
use_reduced = best_score_reduced > best_score
print('Choosing the low-dimensional setting...' if use_reduced else 'Sticking with the high-dimensional setting...')
if use_reduced:
    best_clf, best_score = best_clf_reduced, best_score_reduced
    # The reduced-space model expects PCA-projected inputs, so the test matrix
    # is projected with the same fitted PCA.
    X_test = pca.transform(X_test)

print(f"Estimated score = {best_score}")

In [None]:
# The sample submission serves as the template for the output file.
data_predict = pd.read_csv('../input/tabular-playground-series-jun-2021/sample_submission.csv')

# Per-class probabilities from the selected model.
y_predict = best_clf.predict_proba(X_test)

# Positional fill of every column after the first.
# NOTE(review): assumes the template's class columns are in the same order as
# best_clf.classes_ -- confirm against the sample submission header.
data_predict.iloc[:, 1:] = y_predict
data_predict.to_csv('submission.csv', index=False)