# Project: Sklearn London

# Overview

The aim of this project is to explore scikit-learn (sklearn) by using it.

# Data Description

- This is a synthetic data set of 40 features, representing objects from two classes (labeled as 0 or 1). The training set has 1000 samples and the testing set has 9000.


- This is a binary classification task. You are evaluated on classification accuracy (the percentage of labels you predict correctly). The training set has 1000 samples and the testing set has 9000. Your prediction should be a 9000 x 1 vector of ones or zeros. You also need an Id column (1 to 9000) and should include a header. The format looks like this:

Id,Solution
1,0
2,1
3,1
...
9000,0

# Import modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import missingno as mi

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.mixture import GaussianMixture

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [None]:
# NOTE(review): this silences every warning for the whole session, which
# also hides scikit-learn fit-failure and convergence warnings.
warnings.filterwarnings("ignore")

# Constants

In [None]:
RANDOM_STATE = 12345

# Methods

## Visual

In [None]:
def values_distribution(df, xmin=-20, xmax=20):
    """Draw one horizontal box plot per column of ``df``.

    Parameters
    ----------
    df : DataFrame
        Data whose columns are plotted.
    xmin, xmax : number
        Limits applied to the value (x) axis.
    """
    _, axes = plt.subplots(figsize=(15, 20))

    # The axes created above are also the current axes, so drawing on them
    # explicitly targets the same figure.
    sns.boxplot(data=df, orient='h', ax=axes)

    axes.set(xlabel='Values', ylabel='Columns', xlim=(xmin, xmax))

    plt.show()

In [None]:
def class_count(labels):
    """Show a bar chart with the number of rows per value of 'labels'.

    ``labels`` must provide a column named 'labels'.
    """
    _, axes = plt.subplots(figsize=(10, 5))
    sns.countplot(x='labels', data=labels, ax=axes)
    plt.show()

In [None]:
def corr_matrix(df, annot=False, plot=True, clip=0):
    """Compute the lower-triangular correlation matrix of ``df``.

    The diagonal and the upper triangle are replaced with NaN, as is every
    remaining entry whose absolute value is below ``clip``.

    Parameters
    ----------
    df : DataFrame
        Data to correlate column by column.
    annot : bool
        Forwarded to the heatmap to print the values in the cells.
    plot : bool
        Draw the heatmap when True.
    clip : float
        Absolute-correlation threshold; weaker entries are blanked.

    Returns
    -------
    DataFrame
        The masked correlation matrix.
    """
    # True on the diagonal and above it: those cells are hidden.
    hidden = np.triu(np.ones((df.shape[1], df.shape[1]), dtype=bool)) \
        if False else None
    full = df.corr()
    hidden = np.triu(np.ones(full.shape, dtype=bool))

    corr = full.where(~hidden)
    # Keep only entries that are not weaker than the threshold
    # (NaN cells compare False and therefore stay NaN).
    corr = corr.where(~(corr.abs() < clip))

    if not plot:
        return corr

    _, _ = plt.subplots(figsize=(15, 13))

    labels = corr.columns
    sns.heatmap(corr,
                xticklabels=labels,
                yticklabels=labels,
                cmap='vlag',
                center=0,
                annot=annot,
                vmin=-1, vmax=1)

    plt.title('Correlation matrix')
    plt.grid()
    plt.show()

    return corr

In [None]:
def plot_feature_importance(importance, names, model_type):
    """Plot feature importances as a horizontal bar chart, largest first.

    Parameters
    ----------
    importance : array-like
        Importance score of every feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    model_type : str
        Model label used as the prefix of the chart title.
    """
    fi_df = pd.DataFrame({'feature_names': np.array(names),
                          'feature_importance': np.array(importance)})

    # Sort in decreasing importance; the fresh 0..n-1 index is the bar rank.
    fi_df = fi_df.sort_values(by=['feature_importance'], ascending=False)\
                 .reset_index(drop=True)

    fig, ax = plt.subplots(figsize=(10, 20))

    # Bars are positioned by rank; the real feature names are attached as
    # tick labels right after.
    sns.barplot(x=fi_df['feature_importance'],
                y=fi_df.index,
                orient='h',
                ax=ax)
    ax.set_yticklabels(fi_df['feature_names'])

    # A separating space keeps the model label and the caption apart.
    ax.set_title(f'{model_type} FEATURE IMPORTANCE')
    ax.set_xlabel('FEATURE IMPORTANCE')
    ax.set_ylabel('FEATURE NAMES')

    # Render explicitly, like the other plotting helpers in this file.
    plt.show()

## Train

In [None]:
def train_model(X_train, X_val, y_train, y_val, estimator, show=True):
    """Fit ``estimator`` and score it on the training and validation data.

    Parameters
    ----------
    X_train, y_train
        Features and labels used for fitting.
    X_val, y_val
        Features and labels used for validation metrics.
    estimator
        Object exposing ``fit`` and ``predict``.
    show : bool
        Display the score table when True.

    Returns
    -------
    tuple
        ``(estimator, res)``: the fitted estimator and a one-row DataFrame
        (indexed by the estimator's class name) with the rounded scores.
    """
    estimator.fit(X_train, y_train)

    predict_train = estimator.predict(X_train)
    predict_val = estimator.predict(X_val)

    # scikit-learn metrics expect (y_true, y_pred). Accuracy is symmetric,
    # but swapping the arguments exchanges precision with recall and scores
    # ROC-AUC with the roles of truth and prediction reversed.
    # ROC-AUC is computed from the hard class predictions returned by
    # predict(), not from probabilities.
    scores = {
        'Accuracy train': round(accuracy_score(y_train, predict_train), 2),
        'Accuracy valid': round(accuracy_score(y_val, predict_val), 2),
        'Precision': round(precision_score(y_val, predict_val), 2),
        'Recall': round(recall_score(y_val, predict_val), 2),
        'ROC-AUC': round(roc_auc_score(y_val, predict_val), 2),
    }

    res = pd.DataFrame(data=scores, index=[type(estimator).__name__])

    if show:
        display(res)

    return estimator, res

## Submit results

In [None]:
def test_submission(estimator, X_test, path='/kaggle/working/predictions.csv'):
    """Predict labels for ``X_test`` and write them as a submission CSV.

    The file has a header and two columns: ``Id`` (1-based row number) and
    ``Solution`` (the predicted label).

    Parameters
    ----------
    estimator
        Fitted object exposing ``predict``.
    X_test
        Features to predict on.
    path : str
        Destination of the CSV file; defaults to the Kaggle working
        directory used by this notebook.
    """
    y_pred = estimator.predict(X_test)

    # Derive the Ids from the number of predictions instead of assuming
    # exactly 9000 rows; for the competition test set this is still 1..9000.
    submission = pd.DataFrame(data={'Id': range(1, len(y_pred) + 1),
                                    'Solution': y_pred})

    submission.to_csv(path, index=False)

# Data Loading

## Train

In [None]:
# The CSV has no header row (header=None), so pandas numbers the columns.
train = pd.read_csv('/kaggle/input/data-science-london-scikit-learn/train.csv', header=None)
train.info()
display(train)

## Test

In [None]:
# Read the same way as the training features: no header row in the file.
test = pd.read_csv('/kaggle/input/data-science-london-scikit-learn/test.csv', header=None)
test.info()
display(test)

## Train labels

In [None]:
# The labels file has no header row either; its single column is named
# 'labels' because class_count() looks the column up by that name.
train_labels = pd.read_csv('/kaggle/input/data-science-london-scikit-learn/trainLabels.csv', header=None)
train_labels.columns = ['labels']
train_labels.info()
display(train_labels)

# Data Analysis

## Train data

### Check missing values

In [None]:
mi.bar(train, figsize=(10,5))

- No missing values

### Check data distributions

#### Train features

In [None]:
values_distribution(train)

- All features have median values around 0 with few outliers

- Minimum / maximum values are in the range [-20, 20]

#### Train labels

In [None]:
class_count(train_labels)

- Train classes are balanced

### Check correlations

In [None]:
corr = corr_matrix(pd.concat([train, train_labels], axis=1), plot=True)

- Only a few features are correlated, and only weakly

- Features 12 and 14 correlate the most with the target labels

In [None]:
print('Top 10 correlations:')
round(abs(corr).max().sort_values(ascending=False).head(10), 2)

- Maximum correlation is around 0.6

- None of the features are collinear

## Test data

### Check missing values

In [None]:
# This section inspects the test set, so check `test` (not `train` again).
mi.bar(test, figsize=(10,5))

- No missing values

### Check distribution

#### Test features

In [None]:
values_distribution(test)

- Distributions of features values are the same for test data

# Train model

## Split data

In [None]:
X = train
y = train_labels

- Split data to train and validate datasets (80:20)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                  test_size=0.2, 
                                                  shuffle=True,
                                                  random_state=RANDOM_STATE)

## Baseline model

- Baseline model will produce random predictions

In [None]:
dummy_clf = DummyClassifier(strategy='uniform', random_state=RANDOM_STATE)

In [None]:
dummy_clf, _ = train_model(X_train, X_val, y_train, y_val, dummy_clf)

- As expected, accuracy is almost 0.5

## Simple Classifiers

- Let's compare different types of classifiers

In [None]:
# Seed every estimator that accepts a random_state so the comparison is
# reproducible between runs (KNeighborsClassifier has no such parameter).
classifiers = [GradientBoostingClassifier(random_state=RANDOM_STATE),
               RandomForestClassifier(random_state=RANDOM_STATE),
               SGDClassifier(random_state=RANDOM_STATE),
               LinearSVC(random_state=RANDOM_STATE),
               KNeighborsClassifier(),
               MLPClassifier(random_state=RANDOM_STATE),
               GaussianProcessClassifier(random_state=RANDOM_STATE),
               AdaBoostClassifier(random_state=RANDOM_STATE)
              ]

In [None]:
# Score every classifier, then stack the one-row result tables in a single
# concat instead of growing a DataFrame inside the loop.
res_combine = pd.concat(
    [train_model(X_train, X_val, y_train, y_val, clf, show=False)[1]
     for clf in classifiers]
)

In [None]:
res_combine.sort_values(by='Accuracy valid', ascending=False)

- KNeighborsClassifier shows the best score on validation data

#### KNeighborsClassifier

In [None]:
knn_clf = KNeighborsClassifier()
knn_clf, _ = train_model(X_train, X_val, y_train, y_val, knn_clf, show=False)   

#### RandomForestClassifier

In [None]:
rf_clf = RandomForestClassifier()
rf_clf, _ = train_model(X_train, X_val, y_train, y_val, rf_clf, show=False)

## Feature Importance

In [None]:
plot_feature_importance(rf_clf.feature_importances_, X.columns, 'RANDOM FOREST')

- As we noticed during correlation analysis - features 12 and 14 have the most impact on predictions

## Scale features

In [None]:
scaler = StandardScaler()

In [None]:
# Fit on the training split only, so validation rows do not leak into the
# scaling statistics. The split of X_scale below reuses the same
# random_state and test_size, so it selects the same rows as X_train.
scaler.fit(X_train)

In [None]:
X_scale = scaler.transform(X)

In [None]:
X_scale_train, X_scale_val, y_scale_train, y_scale_val = train_test_split(X_scale, y, 
                                                          test_size=0.2, 
                                                          shuffle=True,
                                                          random_state=RANDOM_STATE)

### Check results with simple classifiers

In [None]:
# Repeat the comparison on the standardized features; collect the one-row
# result tables and concatenate them once.
res_combine = pd.concat(
    [train_model(X_scale_train, X_scale_val,
                 y_scale_train, y_scale_val, clf, show=False)[1]
     for clf in classifiers]
)

In [None]:
res_combine.sort_values(by='Accuracy valid', ascending=False)

- Result is worse with scaling

## Tuning Hyper-parameters

- For tuning hyper-parameters we will use Grid Search with a stratified cross-validation strategy

In [None]:
cv = StratifiedKFold(n_splits=3)

### KNeighborsClassifier 

In [None]:
estimator = KNeighborsClassifier()

In [None]:
# Search grid for KNeighborsClassifier.
# NOTE(review): 'wminkowski', 'seuclidean' and 'mahalanobis' normally need
# extra metric_params (w / V / VI), and 'minkowski' with the default p=2
# duplicates 'euclidean' - confirm these candidates actually fit with the
# installed scikit-learn version instead of silently scoring NaN.
parameters = {'n_neighbors' : list(range(3, 20)),
              'weights' : ['uniform', 'distance'],
              'metric':['euclidean',
                        'manhattan',
                        'chebyshev',
                        'minkowski',
                        'wminkowski',
                        'seuclidean',
                        'mahalanobis']
             }

In [None]:
knn_clf_grid = GridSearchCV(estimator, parameters, cv=cv, scoring='accuracy')

In [None]:
%%time
knn_clf_grid.fit(X, y)

In [None]:
display(pd.DataFrame(knn_clf_grid.cv_results_)[['params', 'mean_test_score']]\
                     .sort_values(by='mean_test_score', ascending=False)\
                     .head(5))

print('Best KNN parameters:\n', knn_clf_grid.best_params_)

In [None]:
knn_clf = KNeighborsClassifier(**knn_clf_grid.best_params_)
knn_clf, _ = train_model(X_train, X_val, y_train, y_val, knn_clf, show=True)   

### MLPClassifier

In [None]:
estimator = MLPClassifier()

In [None]:
# Single hidden layer, width swept from 10 to 190 in steps of 10.
parameters = {'hidden_layer_sizes' : list(range(10, 200, 10)),
              }

In [None]:
mlp_clf_grid = GridSearchCV(estimator, parameters, cv=cv, scoring='accuracy')

In [None]:
%%time
mlp_clf_grid.fit(X, y)

In [None]:
# Five best MLP candidates by mean cross-validated accuracy.
display(pd.DataFrame(mlp_clf_grid.cv_results_)[['params', 'mean_test_score']]\
                     .sort_values(by='mean_test_score', ascending=False)\
                     .head(5))

# Label matches the model that was searched (this is the MLP grid).
print('Best MLP parameters:\n', mlp_clf_grid.best_params_)

In [None]:
mlp_clf = MLPClassifier(**mlp_clf_grid.best_params_)
mlp_clf, _ = train_model(X_train, X_val, y_train, y_val, mlp_clf, show=True)   

### Random Forest Classifier

In [None]:
estimator = RandomForestClassifier()

In [None]:
parameters = {'max_depth' : [None, 5, 10, 20], 
              'n_estimators' : [100, 200, 300, 400, 500]
             }

In [None]:
rf_clf_grid = GridSearchCV(estimator, parameters, cv=cv, scoring='accuracy')

In [None]:
%%time
rf_clf_grid.fit(X, y)

In [None]:
# Five best Random Forest candidates by mean cross-validated accuracy.
display(pd.DataFrame(rf_clf_grid.cv_results_)[['params', 'mean_test_score']]\
                     .sort_values(by='mean_test_score', ascending=False)\
                     .head(5))

# Label matches the model that was searched (this is the Random Forest grid).
print('Best Random Forest parameters:\n', rf_clf_grid.best_params_)

In [None]:
rf_clf = RandomForestClassifier(**rf_clf_grid.best_params_)
rf_clf, _ = train_model(X_train, X_val, y_train, y_val, rf_clf, show=True)

## Gaussian Mixture

- To reduce dimensions we can try to use a Gaussian Mixture

- Set parameters to test

In [None]:
estimator = GaussianMixture(random_state=RANDOM_STATE)

In [None]:
# Search grid for GaussianMixture: 1 to 9 components, every covariance type.
parameters = {
    'n_components': list(range(1, 10)),
    'covariance_type': ['spherical', 'tied', 'diag', 'full']
}

In [None]:
gm_grid = GridSearchCV(estimator, parameters, cv=5)

In [None]:
gm_grid.fit(pd.concat([train, test]))

In [None]:
gm = gm_grid.best_estimator_

## Tuning Hyper-parameters using Gaussian Mixture

- Prepare data with Gaussian Mixture

In [None]:
X_gm = gm.predict_proba(X)

- Grid search best parameters

In [None]:
estimator = KNeighborsClassifier()

In [None]:
# Search grid for KNeighborsClassifier on the Gaussian-mixture features.
# NOTE(review): 'wminkowski', 'seuclidean' and 'mahalanobis' normally need
# extra metric_params (w / V / VI), and 'minkowski' with the default p=2
# duplicates 'euclidean' - confirm these candidates actually fit with the
# installed scikit-learn version instead of silently scoring NaN.
parameters = {'n_neighbors' : list(range(3, 20)),
              'weights' : ['uniform', 'distance'],
              'metric':['euclidean',
                        'manhattan',
                        'chebyshev',
                        'minkowski',
                        'wminkowski',
                        'seuclidean',
                        'mahalanobis']
             }

In [None]:
knn_clf_grid = GridSearchCV(estimator, parameters, cv=cv, scoring='accuracy')

In [None]:
%%time
knn_clf_grid.fit(X_gm, y)

In [None]:
display(pd.DataFrame(knn_clf_grid.cv_results_)[['params', 'mean_test_score']]\
                     .sort_values(by='mean_test_score', ascending=False)\
                     .head(5))

print('Best KNN parameters:\n', knn_clf_grid.best_params_)

In [None]:
knn_clf = knn_clf_grid.best_estimator_

- Result is not good enough with KNN estimator

# Submit Test predictions

In [None]:
# Transform the test features with the same Gaussian mixture whose
# predict_proba output the final KNN grid search was fitted on.
test_gm = gm.predict_proba(test)

# Predict with the tuned KNN and write the submission CSV.
test_submission(knn_clf, test_gm)