# Data Science London + Scikit-learn
This is a synthetic data set of 40 features, representing objects from two classes (labeled as 0 or 1). The training set has 1000 samples and the testing set has 9000.
More information can be found [here](https://www.kaggle.com/c/data-science-london-scikit-learn/data)

In [None]:
# The competition data is mounted read-only under the "../input/" directory.
# Walk that tree and print the full path of every file it contains, so the
# exact file names are visible before anything is loaded.
import os

for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import random as rn

# Define plot style
plt.style.use('ggplot')
%matplotlib inline

In [None]:
# Load the feature tables; header=None makes pandas treat the first row as data
train = pd.read_csv('../input/data-science-london-scikit-learn/train.csv', header=None)
test = pd.read_csv('../input/data-science-london-scikit-learn/test.csv', header=None)

# Load the labels that belong to the training table
train_label = pd.read_csv('../input/data-science-london-scikit-learn/trainLabels.csv', header=None)

# Sanity check: report the (rows, columns) shape of each table
for caption, frame in (('Train data:', train),
                       ('Test data:', test),
                       ('Train label', train_label)):
    print(caption, frame.shape)

In [None]:
# Print out data information (column dtypes, non-null counts, memory usage).
# DataFrame.info() writes its report straight to stdout and returns None, so
# it must not be wrapped in print() -- that would emit a stray "None" line.
train.info()

In [None]:
# Tabulate the summary statistics of the training features
summary = train.describe()
print(summary)

# Draw the same information as a box plot on a wide canvas
fig, ax = plt.subplots(figsize=[13, 5])  # matplotlib's default figsize is [6.4, 4.8]
ax.boxplot(train)
plt.show()

In [None]:
# Hold out 20% of the labelled data as a validation set
from sklearn.model_selection import train_test_split

# Flatten the label table into a 1-D array before splitting
train_label = np.ravel(train_label)
Xtrain, Xval, ytrain, yval = train_test_split(
    train, train_label, test_size=0.2, random_state=42
)
for part in (Xtrain, Xval):
    print(part.shape)

# Import various classification models
In this notebook, we will try four different classifiers:
* k-nearest neighbors (KNN)
* Random forest (RF)
* Support vector machine (SVM) with linear and rbf kernels

We will first train the four classifiers on the raw data. Then, we will do some feature engineering using Gaussian Mixture Models (GMM) to obtain new features, and re-train the four classifiers on the new data.

In [None]:
# Import classifiers from sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score


# Option 1: Raw data
## kNN

In [None]:
# Candidate values of k and the accuracy curves collected for each of them
num_neighs = np.arange(1, 26)
train_accuracy, val_accuracy = [], []
best_accuracy_knn = 0.0

# Sweep k, remembering the model with the highest validation accuracy
for k in num_neighs:
    model = KNeighborsClassifier(n_neighbors=k).fit(Xtrain, ytrain)
    acc_train = model.score(Xtrain, ytrain)
    acc_val = model.score(Xval, yval)
    train_accuracy.append(acc_train)
    val_accuracy.append(acc_val)
    print("kNN (k={}): train accuracy={:.5f}, validation accuracy={:.5f}"
          .format(k, acc_train, acc_val))
    # Strict '>' keeps the smallest k when several values tie
    if acc_val > best_accuracy_knn:
        best_accuracy_knn, best_knn, best_num_neighs = acc_val, model, k

# Accuracy as a function of k for both splits
for curve, curve_label in ((train_accuracy, 'Train'), (val_accuracy, 'Validation')):
    plt.plot(num_neighs, curve, label=curve_label)
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

print('Best validation accuracy (k={}): {:.5f}'.format(best_num_neighs, best_accuracy_knn))

## RF

In [None]:
# Hyperparameter grid searched for the random forest
estimators = [25, 50, 75, 100]  # number of estimators
max_depths = [20, 30, 40, 50]
splits = [5, 10, 15]
param = {'n_estimators': estimators,
         'max_depth': max_depths,
         'min_samples_split': splits}

# Exhaustive grid search with 10-fold cross-validation on the training split.
# (The accuracy variables are assigned as plain floats below, so no list
# initialisation is needed here.)
rf = RandomForestClassifier(random_state=100)
RandomForestCV = GridSearchCV(estimator=rf, param_grid=param, cv=10)
RandomForestCV.fit(Xtrain, ytrain)

# score() on the fitted search evaluates the best parameter combination found
train_accuracy = RandomForestCV.score(Xtrain, ytrain)
best_accuracy_rf = RandomForestCV.score(Xval, yval)

# Print results
print(RandomForestCV.best_params_)
print("RF: train accuracy={:.5f}, validation accuracy={:.5f}"
      .format(train_accuracy, best_accuracy_rf))

## SVM (linear)

In [None]:
# Linear-kernel SVM trained on the raw training split
svm = SVC(kernel='linear', C=1, random_state=100).fit(Xtrain, ytrain)

# Accuracy on the data it was fitted on, then on the held-out validation data
train_accuracy = svm.score(Xtrain, ytrain)
best_accuracy_svml = svm.score(Xval, yval)
print("SVM (linear): train accuracy={:.5f}, validation accuracy={:.5f}"
      .format(train_accuracy, best_accuracy_svml))


## SVM (rbf)

In [None]:
# RBF-kernel SVM trained on the raw training split
svm = SVC(kernel='rbf', gamma='auto', C=1, random_state=100).fit(Xtrain, ytrain)

# Accuracy on the data it was fitted on, then on the held-out validation data
train_accuracy = svm.score(Xtrain, ytrain)
best_accuracy_svmr = svm.score(Xval, yval)
print("SVM (rbf): train accuracy={:.5f}, validation accuracy={:.5f}"
      .format(train_accuracy, best_accuracy_svmr))

Let's compare the final results when the classifiers were trained on the raw data

In [None]:
# Validation accuracy of each classifier trained on the raw features
raw_scores = (
    ('KNN: {:.5f}', best_accuracy_knn),
    ('RF: {:.5f}', best_accuracy_rf),
    ('SVM (linear): {:.5f}', best_accuracy_svml),
    ('SVM (rbf): {:.5f}', best_accuracy_svmr),
)
for template, score in raw_scores:
    print(template.format(score))

# Option 2: Feature engineering
## Gaussian mixture model (GMM)
GMMs can have difficulty converging in a high dimensional space, so we will do dimensionality reduction on the data. Here, we will use a straightforward PCA, asking it to preserve 95% of the variance in the data

In [None]:
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

# A fractional n_components asks PCA to keep enough components to explain
# that share (95%) of the variance; whitening is switched on as well.
pca = PCA(n_components=0.95, whiten=True)

# Learn the projection from the training table, then apply it to the test table
train_pc = pca.fit_transform(train)
test_pc = pca.transform(test)
for projected in (train_pc, test_pc):
    print(projected.shape)

In [None]:
# Let's fit GMM with various number of components
n_components = np.arange(1, 11)

# Fit every candidate exactly once (fit() returns the estimator itself) and
# reuse the fitted model for both criteria. Fitting separately for AIC and
# for BIC doubles the work without changing the result, because the models
# are seeded with a fixed random_state.
models = [GaussianMixture(n, covariance_type='full', random_state=100).fit(train_pc)
          for n in n_components]
aics = [model.aic(train_pc) for model in models]
bics = [model.bic(train_pc) for model in models]

# Lower AIC/BIC is better; plot both against the number of mixture components
plt.plot(n_components, aics, label='AIC')
plt.plot(n_components, bics, label='BIC')
plt.xlabel('Number of Gaussians')
plt.legend()
plt.show()

It appears that around 4 components minimize both AIC and BIC. We can use the four cluster probabilities as new features.

In [None]:
# Fit the final 4-component mixture. The seed matches the one used for the
# AIC/BIC sweep, so this is the same model that was evaluated there and the
# engineered features are reproducible from run to run.
best_gmm = GaussianMixture(n_components=4, covariance_type='full',
                           random_state=100).fit(train_pc)

# Let's apply the model to get the per-component membership probabilities,
# which become the new feature columns
train_gmm = best_gmm.predict_proba(train_pc)
test_gmm = best_gmm.predict_proba(test_pc)
print(train_gmm.shape)
print(test_gmm.shape)

# Split the new data into train and validation set (same seed and ratio as
# the raw-data split, so both feature sets share the same partition)
Xtrain_gmm, Xval_gmm, ytrain_gmm, yval_gmm = train_test_split(train_gmm, train_label, random_state=42, test_size=0.2)

## kNN

In [None]:
# Candidate values of k and the accuracy curves collected for each of them
num_neighs = np.arange(2, 26)
train_accuracy, val_accuracy = [], []
best_accuracy_knn_gmm = 0.0

# Sweep k on the GMM features, remembering the best model on validation data
for k in num_neighs:
    model = KNeighborsClassifier(n_neighbors=k).fit(Xtrain_gmm, ytrain_gmm)
    acc_train = model.score(Xtrain_gmm, ytrain_gmm)
    acc_val = model.score(Xval_gmm, yval_gmm)
    train_accuracy.append(acc_train)
    val_accuracy.append(acc_val)
    print("kNN (k={}): train accuracy={:.5f}, validation accuracy={:.5f}"
          .format(k, acc_train, acc_val))
    # Strict '>' keeps the smallest k when several values tie
    if acc_val > best_accuracy_knn_gmm:
        best_accuracy_knn_gmm, best_knn, best_num_neighs = acc_val, model, k

# Accuracy as a function of k for both splits
for curve, curve_label in ((train_accuracy, 'Train'), (val_accuracy, 'Validation')):
    plt.plot(num_neighs, curve, label=curve_label)
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

print('Best validation accuracy (k={}): {:.5f}'.format(best_num_neighs, best_accuracy_knn_gmm))

## RF

In [None]:
# Random Forest: hyperparameter grid to search
estimators = [25, 50, 75, 100]  # number of estimators
max_depths = [20, 30, 40, 50]
splits = [5, 10, 15]
param = {
    'n_estimators': estimators,
    'max_depth': max_depths,
    'min_samples_split': splits,
}

# 10-fold cross-validated grid search on the GMM training split
# (GridSearchCV would use cv=5 if left at its default)
rf = RandomForestClassifier(random_state=100)
RandomForestCV = GridSearchCV(rf, param, cv=10).fit(Xtrain_gmm, ytrain_gmm)

train_accuracy = RandomForestCV.score(Xtrain_gmm, ytrain_gmm)
best_accuracy_rf_gmm = RandomForestCV.score(Xval_gmm, yval_gmm)

# Report the winning parameter combination and its accuracies
print(RandomForestCV.best_params_)
print("RF: train accuracy={:.5f}, validation accuracy={:.5f}"
      .format(train_accuracy, best_accuracy_rf_gmm))

## SVM (linear)

In [None]:
# Linear-kernel SVM trained on the GMM features
svm = SVC(kernel='linear', C=1, random_state=100).fit(Xtrain_gmm, ytrain_gmm)

# Accuracy on the data it was fitted on, then on the held-out validation data
train_accuracy = svm.score(Xtrain_gmm, ytrain_gmm)
best_accuracy_svml_gmm = svm.score(Xval_gmm, yval_gmm)
print("SVM (linear): train accuracy={:.5f}, validation accuracy={:.5f}"
      .format(train_accuracy, best_accuracy_svml_gmm))

## SVM (rbf)

In [None]:
# RBF-kernel SVM trained on the GMM features
svm = SVC(kernel='rbf', gamma='auto', C=1, random_state=100).fit(Xtrain_gmm, ytrain_gmm)

# Accuracy on the data it was fitted on, then on the held-out validation data
train_accuracy = svm.score(Xtrain_gmm, ytrain_gmm)
best_accuracy_svmr_gmm = svm.score(Xval_gmm, yval_gmm)
print("SVM (rbf): train accuracy={:.5f}, validation accuracy={:.5f}"
      .format(train_accuracy, best_accuracy_svmr_gmm))

# Summary
Let's compare all the results we have gotten so far

In [None]:
# Validation accuracies per classifier, in the same order as the row labels
classifier_names = ['kNN', 'RF', 'SVM (linear)', 'SVM (rbf)']
acc_raw = [best_accuracy_knn, best_accuracy_rf,
           best_accuracy_svml, best_accuracy_svmr]
acc_gmm = [best_accuracy_knn_gmm, best_accuracy_rf_gmm,
           best_accuracy_svml_gmm, best_accuracy_svmr_gmm]

# One row per classifier, one column per feature set
res = pd.DataFrame({'Raw': acc_raw, 'GMM': acc_gmm}, index=classifier_names)
print(res)

# Submission
From the above, we can see that applying GMM increased the classification accuracy for all four classifiers. We will re-train the four classifiers on the entire training set and submit the results.

In [None]:
# Refit the selected kNN model on all labelled GMM features and predict the test set
best_knn.fit(train_gmm, train_label)
ypred = best_knn.predict(test_gmm)

# Ids are 1-based row numbers of the test set
submission = pd.DataFrame({'Id': np.arange(1, len(ypred) + 1),
                           'Solution': ypred})
submission.to_csv('submission_knn.csv', index=False)

In [None]:
# Rebuild the random forest with the hyperparameters selected by the grid
# search on the GMM features instead of hard-coding them, so the submitted
# model always matches what was validated. The seed is the same one used
# during the search, which keeps the submission reproducible.
rf = RandomForestClassifier(random_state=100, **RandomForestCV.best_params_)

# Train on all labelled GMM features and predict the test set
rf.fit(train_gmm, train_label)
ypred = rf.predict(test_gmm)

# Ids are 1-based row numbers of the test set
submission = {'Id': np.arange(1, ypred.shape[0]+1),
             'Solution': ypred}
submission = pd.DataFrame(submission)
submission.to_csv('submission_rf.csv', index=False)

In [None]:
# Linear SVM trained on all labelled GMM features, then applied to the test set
svm = SVC(kernel='linear', C=1, random_state=100).fit(train_gmm, train_label)
ypred = svm.predict(test_gmm)

# Ids are 1-based row numbers of the test set
submission = pd.DataFrame({'Id': np.arange(1, len(ypred) + 1),
                           'Solution': ypred})
submission.to_csv('submission_svm_linear.csv', index=False)

In [None]:
# RBF SVM for the submission. gamma='auto' is set explicitly so the model is
# configured exactly like the one whose validation accuracy was measured;
# without it SVC uses its default gamma and a different model would be submitted.
svm = SVC(kernel='rbf', gamma='auto', random_state=100, C=1)

# Train on all labelled GMM features and predict the test set
svm.fit(train_gmm, train_label)
ypred = svm.predict(test_gmm)

# Ids are 1-based row numbers of the test set
submission = {'Id': np.arange(1, ypred.shape[0]+1),
             'Solution': ypred}
submission = pd.DataFrame(submission)
submission.to_csv('submission_svm_rbf.csv', index=False)