## Data Science London + Scikit-learn

### Some preprocessing and useful functions

In [1]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
import pandas as pd

# Directory prefixes. Both end with a slash, so file names are appended
# directly (no extra "/" in between).
input_path = "../input/data-science-london-scikit-learn/"
output_path = "../working/"

# header=None: the first line of each CSV is read as data, not as column names.
train_data = pd.read_csv(f"{input_path}train.csv", header=None)
train_labels = pd.read_csv(f"{input_path}trainLabels.csv", header=None)
test_data = pd.read_csv(f"{input_path}test.csv", header=None)

# Fail early if the files do not line up: every training row needs a label,
# and train/test must share the same number of feature columns.
assert len(train_data) == len(train_labels), "train.csv and trainLabels.csv row counts differ"
assert train_data.shape[1] == test_data.shape[1], "train.csv and test.csv column counts differ"

def get_svc_fitted_model(x_train, y_train):
    """Train an RBF-kernel SVC on the given data and return the fitted estimator.

    Args:
        x_train: Feature matrix handed to ``SVC.fit``.
        y_train: Target labels handed to ``SVC.fit``.

    Returns:
        The fitted ``SVC`` (``kernel='rbf'``, ``gamma='auto'``, ``C=1.0``).
    """
    # fit() returns the estimator itself, so construction and training chain.
    return SVC(C=1.0, kernel='rbf', gamma='auto').fit(x_train, y_train)

def get_train_test_data(train_data, labels=None, test_size=0.3, random_state=1337):
    """Split features and labels into a training part and a hold-out part.

    Args:
        train_data: Feature matrix to split.
        labels: Targets aligned row-by-row with ``train_data``. When omitted,
            the module-level ``train_labels`` is used.
        test_size: Forwarded to ``train_test_split``. The 0.3 default follows
            the rule of thumb that the test set is 15-30% of the whole dataset.
        random_state: Forwarded to ``train_test_split`` so that every call
            produces the same split.

    Returns:
        The ``train_test_split`` result, unpacked by callers as
        ``x_train, x_test, y_train, y_test``.
    """
    if labels is None:
        # Only reading the module-level name, so no `global` statement is needed.
        labels = train_labels
    return train_test_split(
        train_data, labels, test_size=test_size, random_state=random_state
    )

### Naive approach

In [2]:
# Baseline: split the raw features and fit the SVC on the training part.
x_train, x_test, y_train, y_test = get_train_test_data(train_data)
svc_model = get_svc_fitted_model(x_train, y_train.values.ravel())

# Two estimates of quality: mean 10-fold CV accuracy over the whole labelled
# set, and accuracy of the fitted model on the hold-out split.
naive_cv_mean = cross_val_score(
    svc_model, train_data, train_labels.values.ravel(), cv=10
).mean()
naive_holdout_accuracy = accuracy_score(y_test, svc_model.predict(x_test))

print(f"SVC cross_val_score: {naive_cv_mean}")
print(f"SVC accuracy_score: {naive_holdout_accuracy}")

SVC cross_val_score: 0.915
SVC accuracy_score: 0.8866666666666667


## Approach with data normalizing

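(it scores worse than the naive approach, most likely because `Normalizer` rescales each sample (row) to unit norm rather than scaling each feature, so the overall magnitude of every sample is discarded)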

In [3]:
from sklearn.preprocessing import Normalizer

# Normalizer works row by row, so transforming before the split is safe here.
normalizer = Normalizer()
normalized_train_data = normalizer.fit_transform(train_data)

x_train, x_test, y_train, y_test = get_train_test_data(normalized_train_data)
svc_model = get_svc_fitted_model(x_train, y_train.values.ravel())

# Same two metrics as the baseline, computed on the normalized features.
norm_cv_mean = cross_val_score(
    svc_model, normalized_train_data, train_labels.values.ravel(), cv=10
).mean()
norm_holdout_accuracy = accuracy_score(y_test, svc_model.predict(x_test))

print(f"SVC cross_val_score (NORM): {norm_cv_mean}")
print(f"SVC accuracy_score (NORM): {norm_holdout_accuracy}")

SVC cross_val_score (NORM): 0.808
SVC accuracy_score (NORM): 0.7766666666666666


## PCA Approach

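(small improvement on the hold-out split: 0.92 vs 0.887 for the naive approach; the 10-fold cross-validation score is slightly lower, 0.907 vs 0.915)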

In [5]:
from sklearn.decomposition import PCA

# Project the features onto 12 principal components.
# NOTE(review): PCA is fitted on every labelled row before the split, so the
# hold-out rows influence the projection. Consider fitting on x_train only.
pca = PCA(n_components=12)
pca_train_data = pca.fit_transform(train_data)

x_train, x_test, y_train, y_test = get_train_test_data(pca_train_data)
svc_model = get_svc_fitted_model(x_train, y_train.values.ravel())

# Same two metrics as the baseline, computed on the PCA-reduced features.
pca_cv_mean = cross_val_score(
    svc_model, pca_train_data, train_labels.values.ravel(), cv=10
).mean()
pca_holdout_accuracy = accuracy_score(y_test, svc_model.predict(x_test))

print(f"SVC cross_val_score (PCA): {pca_cv_mean}")

print(f"SVC accuracy_score (PCA): {pca_holdout_accuracy}")

SVC cross_val_score (PCA): 0.907
SVC accuracy_score (PCA): 0.92


### Trying to find the best parameters for the model (using Gaussian-mixture features instead of PCA)

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.mixture import GaussianMixture
import numpy as np

# The mixture model is unsupervised, so it is fitted on the train and test
# features stacked row-wise.
x_all = np.r_[train_data, test_data]

# Model selection by BIC: try every covariance type with 1..6 components and
# keep the fitted mixture with the lowest score.
lowest_bic = np.inf  # np.infty was removed in NumPy 2.0
bic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # A fixed seed makes the EM initialisation, and therefore the selected
        # model, reproducible across runs.
        gmm = GaussianMixture(
            n_components=n_components,
            covariance_type=cv_type,
            random_state=1337,
        )
        gmm.fit(x_all)
        bic.append(gmm.bic(x_all))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

# best_gmm is already fitted on x_all. Fitting it again would only re-run EM
# and could replace the very model that BIC selected, so it is used as is.
# The per-component membership probabilities become the new feature set.
gmm_train = best_gmm.predict_proba(train_data)
gmm_test = best_gmm.predict_proba(test_data)


# Search space: 4 values of C x 5 values of gamma x 4 kernels = 80 candidates.
param_grid = dict(
    C=[0.01, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf', 'poly', 'sigmoid', 'linear'],
)

# Exhaustive 10-fold search on the GMM features; refit=True retrains the
# winning configuration on all of gmm_train once the search is done.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    n_jobs=-1,
    verbose=1,
)
grid.fit(gmm_train, train_labels.values.ravel())

print(f"Best params: {grid.best_params_}")
print(f"Best estimator: {grid.best_estimator_}")
print(f"Best score: {grid.best_score_}")

svc_best = grid.best_estimator_

# Cross-validate the selected estimator once more on the same features.
gmm_cv_mean = cross_val_score(
    svc_best, gmm_train, train_labels.values.ravel(), cv=10
).mean()
print(f"SVC cross_val_score (GMM): {gmm_cv_mean}")

Fitting 10 folds for each of 80 candidates, totalling 800 fits
Best params: {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}
Best estimator: SVC(C=0.01, gamma=1)
Best score: 0.9960000000000001
SVC cross_val_score (GMM): 0.9960000000000001


### Use the best estimator to predict the test set

In [15]:
# Predict the test-set labels from the GMM features with the tuned SVC.
model = svc_best
predictions = model.predict(gmm_test)

# Two-column submission: 1-based row ids next to the predicted labels.
submission = pd.DataFrame(
    {
        'Id': range(1, len(predictions) + 1),
        'Solution': predictions,
    }
)
# The file name has no placeholders, so a plain string literal is enough.
submission.to_csv("submission_lab_1.csv", index=False)