In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input tree,
# then print them one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Loading

Let's first load and look at the data

In [None]:
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")

# Locations of the competition CSVs: train features, test features, train labels.
csv_paths = (
    "/kaggle/input/data-science-london-scikit-learn/train.csv",
    "/kaggle/input/data-science-london-scikit-learn/test.csv",
    "/kaggle/input/data-science-london-scikit-learn/trainLabels.csv",
)

# The files are read with header=None, so columns get integer labels.
X_train, X_test, y_train = (pd.read_csv(path, header=None) for path in csv_paths)

In [None]:
# Summarize column dtypes, non-null counts and memory use of the training features;
# memory_usage='deep' asks pandas for an introspected figure rather than an estimate.
X_train.info(memory_usage='deep')

In [None]:
# Descriptive statistics for each training column, shown as the cell output.
X_train.describe()

The data look very clean! We won't need to remove anything or impute any missing values.

# Trying to fit a few models

### Imports and scaling 

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import validation_curve, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import GradientBoostingClassifier

# Standardize the features: the scaler is fitted on the training set only and the
# same transformation is then applied to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Flatten the labels to a 1-D array. np.asarray accepts both the DataFrame read
# from CSV and an already-flattened ndarray, so re-running this cell no longer
# fails with "'numpy.ndarray' object has no attribute 'to_numpy'".
y_train = np.asarray(y_train).ravel()


### Fitting

In [None]:
# Candidate classifiers, each paired with the single hyperparameter swept for it.
estimators = {
    GradientBoostingClassifier(random_state=0): {'learning_rate': np.linspace(0.001, 1, 10)},
    DecisionTreeClassifier(random_state=0): {'max_depth': np.arange(1, 11)},
    RandomForestClassifier(random_state=0): {'max_depth': np.arange(1, 11)},
    KNeighborsClassifier(): {'n_neighbors': np.arange(1, 31, 3)},
    SVC(random_state=0): {'gamma': np.linspace(0.001, 0.1, 10)},
    LogisticRegression(): {'C': np.linspace(0.001, 0.1, 10)},
    SGDClassifier(random_state=0): {'alpha': np.linspace(0.00001, 0.1, 10)}
}

fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(14, 24))
axs = axs.ravel()

for i, (estimator, params) in enumerate(estimators.items()):

    print(f'Training {estimator.__class__.__name__}')
    param_name = list(params)[0]
    param_range = params[param_name]

    # One grid search yields both curves: return_train_score=True stores the mean
    # train score of every candidate next to its mean validation score, so the
    # models do not have to be cross-validated a second time with validation_curve.
    grid = GridSearchCV(
        estimator, params, cv=3, scoring="accuracy", n_jobs=-1, return_train_score=True
    )
    grid.fit(X_train, y_train)

    # With a single swept parameter, cv_results_ rows follow the order of param_range.
    validation_score = grid.cv_results_['mean_test_score']
    train_score = grid.cv_results_['mean_train_score']

    axs[i].set_title(estimator.__class__.__name__)
    axs[i].plot(param_range, validation_score, label='Validation_score')
    axs[i].plot(param_range, train_score, label='Train_score')
    axs[i].scatter(grid.best_params_[param_name], grid.best_score_, c='r', label='GridSearchCV hyperparameter choice')
    axs[i].set_xlabel(param_name)
    axs[i].set_ylabel('accuracy')
    axs[i].legend()

# The 4x2 grid has more panels than there are estimators; hide the unused ones.
for unused_ax in axs[len(estimators):]:
    unused_ax.set_visible(False)

Random forest and k-NN seem to give the best results.

# Preprocessing with Gaussian Mixtures

In fact, most of the features seem to be normally distributed.

In [None]:
# Start again from the raw CSVs: X_train, X_test and y_train were replaced by
# NumPy arrays in the scaling cell above.
read_options = dict(header=None)
X_train = pd.read_csv("/kaggle/input/data-science-london-scikit-learn/train.csv", **read_options)
X_test = pd.read_csv("/kaggle/input/data-science-london-scikit-learn/test.csv", **read_options)
y_train = pd.read_csv("/kaggle/input/data-science-london-scikit-learn/trainLabels.csv", **read_options)

# One box per feature column to compare their spreads at a glance.
X_train.boxplot(figsize=(15, 8))

In [None]:
from scipy.stats import norm

# Pick one feature column. The fixed seed keeps the chosen column (and therefore
# the figure) identical across re-runs, in line with the random_state=0 used for
# the models in this notebook.
random_feature = X_train.sample(n=1, axis=1, random_state=0)

# Work on the column as a 1-D Series so the statistics below are plain scalars
# rather than one-element Series.
feature_values = random_feature.iloc[:, 0]

min_ = feature_values.min()
max_ = feature_values.max()
mean = feature_values.mean()
std = feature_values.std()

# Evaluate the normal density with the sample mean/std on a grid spanning the
# observed range; the grid is built once and reused for the plot.
x_grid = np.linspace(min_, max_, 1000)
fitted_gaussian = norm.pdf(x_grid, loc=mean, scale=std)

plt.figure(figsize=(8, 6))
# density=True normalizes the histogram so it is comparable with the pdf curve.
plt.hist(feature_values, bins=100, density=True, label='Feature data')
plt.plot(x_grid, fitted_gaussian, lw=5, label='Gaussian fit')
plt.legend()
plt.title(random_feature.columns[0])
plt.show()

Our assumption here is that the data come from a collection of Gaussian distributions (which seems likely given the shape of the features!). So let's try replacing each sample with its probabilities of belonging to each cluster of a Gaussian mixture. The fitting step will determine the number of clusters and their shape.

In [None]:
# Mixture hyperparameters to explore: number of components x covariance structure.
param_grid = dict(
    n_components=range(1, 11),
    covariance_type=['spherical', 'tied', 'diag', 'full'],
)

# The mixture is fitted without labels, on the train and test features stacked together.
all_features = pd.concat([X_train, X_test])
gm_grid = GridSearchCV(GaussianMixture(random_state=0), param_grid=param_grid, cv=5, n_jobs=-1)
gm_grid.fit(all_features)

# Re-encode the training samples as the best mixture's per-component membership
# probabilities, then standardize those new features.
best_mixture = gm_grid.best_estimator_
scaler = StandardScaler()
X_train = scaler.fit_transform(best_mixture.predict_proba(X_train))

# Labels as a flat 1-D array for the classifiers.
y_train = y_train.to_numpy().ravel()

Let's try to fit the best models found in the first part

In [None]:
# The two classifiers retained from the first part, each with its swept hyperparameter.
estimators = {
    RandomForestClassifier(random_state=0): {'max_depth': np.arange(1, 11)},
    KNeighborsClassifier(): {'n_neighbors': np.arange(1, 11)},
}

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
axs = axs.ravel()

for i, (estimator, params) in enumerate(estimators.items()):

    print(f'Training {estimator.__class__.__name__}')
    param_name = list(params)[0]
    param_range = params[param_name]

    # One grid search yields both curves: return_train_score=True stores the mean
    # train score of every candidate next to its mean validation score, so the
    # models do not have to be cross-validated a second time with validation_curve.
    grid = GridSearchCV(
        estimator, params, cv=3, scoring="accuracy", n_jobs=-1, return_train_score=True
    )
    grid.fit(X_train, y_train)

    # With a single swept parameter, cv_results_ rows follow the order of param_range.
    validation_score = grid.cv_results_['mean_test_score']
    train_score = grid.cv_results_['mean_train_score']

    axs[i].set_title(estimator.__class__.__name__)
    axs[i].plot(param_range, validation_score, label='Validation_score')
    axs[i].plot(param_range, train_score, label='Train_score')
    axs[i].scatter(grid.best_params_[param_name], grid.best_score_, c='r', label='GridSearchCV hyperparameter choice')
    axs[i].set_xlabel(param_name)
    axs[i].set_ylabel('accuracy')
    axs[i].legend()

# Training the best model

In [None]:
# Put the test set through the same pipeline as the training data: mixture
# membership probabilities, scaled with the scaler fitted on the training set.
X_test = gm_grid.best_estimator_.predict_proba(X_test)
X_test = scaler.transform(X_test)

# NOTE(review): max_depth=7 is hard-coded — presumably read off the random-forest
# validation curve above; confirm it still matches the grid-search choice.
forest = RandomForestClassifier(random_state=0, max_depth=7)

forest.fit(X_train, y_train)
predictions = forest.predict(X_test)

# Ids are 1-based row numbers. Deriving them from the number of predictions
# (instead of the literal range(1, 9001)) keeps the frame valid for any test-set size.
submission = pd.DataFrame({
    'Id': range(1, len(predictions) + 1),
    'Solution': predictions,
})

submission.to_csv("submission.csv", index=False)