### Import Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

### Import Data

In [None]:
# Load the competition CSVs. header=None makes pandas treat the first row as
# data, so columns are labelled by integer position in all three frames.
train_df = pd.read_csv('/kaggle/input/data-science-london-scikit-learn/train.csv', header=None)
trainLabels_df = pd.read_csv('/kaggle/input/data-science-london-scikit-learn/trainLabels.csv', header=None)
test_df = pd.read_csv ('/kaggle/input/data-science-london-scikit-learn/test.csv', header=None)

In [None]:
# Preview the first rows of the training features.
train_df.head()

In [None]:
# (rows, columns) of the training features, test features and training labels.
tuple(frame.shape for frame in (train_df, test_df, trainLabels_df))

In [None]:
# Per-column summary statistics of the training features.
train_df.describe()

### Pre-Processing

In [None]:
# Features and labels exactly as loaded; y stays a one-column DataFrame.
X, y = train_df, trainLabels_df

# Train Test Split: hold out 25% of the rows, with a fixed seed so the
# partition is the same on every run.
split = train_test_split(X, y, test_size=0.25, random_state=1)
X_train, X_test, y_train, y_test = split
tuple(part.shape for part in split)

### Classification Models

In [None]:
# Baseline classifiers, each fitted on the training split and scored on the
# 25% hold-out split.
# The label frames have a single column; flatten them once so that every
# model below is fitted and scored against the same 1-D label vector.
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

## SVM ##
from sklearn.svm import SVC
svc_clf = SVC(random_state=0, gamma='auto', C=1).fit(X_train, y_train_1d)
y_svc_predict = svc_clf.predict(X_test)
print('SVM: ', accuracy_score(y_test_1d, y_svc_predict))

## Decision Tree ##
from sklearn.tree import DecisionTreeClassifier
# max_depth=5 comes from the validation run below (kept for reference,
# not executed):
#
# param_range = np.arange(3, 10)
# train_scores, test_scores = validation_curve(DecisionTreeClassifier(random_state=0),
#                                              X, y,
#                                              param_name='max_depth', param_range=param_range,
#                                              cv=5)
# max_depth_best = param_range[np.argmax(test_scores.mean(axis=1), axis=0)] # 5

dt_clf = DecisionTreeClassifier(max_depth=5, random_state=0).fit(X_train, y_train_1d)
y_dt_predict = dt_clf.predict(X_test)
print('Decision Tree: ', accuracy_score(y_test_1d, y_dt_predict))

## KNeighborsClassifier ##
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train_1d)
y_knn_predict = knn_clf.predict(X_test)

print('KNN',accuracy_score(y_test_1d, y_knn_predict))

### Feature Scaling (Per-Sample Normalization)

In [None]:
from sklearn.preprocessing import StandardScaler, Normalizer

# Only Normalizer is used in this cell (StandardScaler is imported but not
# applied). Normalizer rescales each sample (row) rather than each feature.
# The labels are flattened to a 1-D array for the scorers that follow.
norm = Normalizer()
X_norm, y_norm = norm.fit_transform(X), y.values.ravel()

In [None]:
## SVM / Decision Tree / KNeighborsClassifier ##
# Same three model configurations as the hold-out baselines, now scored by
# mean 10-fold cross-validation accuracy on the normalized features.
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

svc_clf = SVC(random_state=0, gamma='auto', C=1)
dt_clf = DecisionTreeClassifier(max_depth=5, random_state=0)
knn_clf = KNeighborsClassifier(n_neighbors=5)

for label, model in (('SVC: ', svc_clf),
                     ('Decision Tree: ', dt_clf),
                     ('KNN: ', knn_clf)):
    print(label, cross_val_score(model, X_norm, y_norm, cv=10).mean())

### Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

# Labels as a flat 1-D array for the cross-validation calls that follow.
y_pca = y.values.ravel()

# Reduce the training features to 12 principal components.
pca = PCA(n_components=12)
X_pca = pca.fit_transform(X)

X_pca.shape

In [None]:
## SVM / Decision Tree / KNeighborsClassifier ##
# Mean 10-fold cross-validation accuracy of the three models on the
# PCA-reduced features.
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

svc_clf = SVC(random_state=0, gamma='auto', C=1)
dt_clf = DecisionTreeClassifier(max_depth=5, random_state=0)
knn_clf = KNeighborsClassifier(n_neighbors=5)

for label, model in (('SVC: ', svc_clf),
                     ('Decision Tree: ', dt_clf),
                     ('KNN: ', knn_clf)):
    print(label, cross_val_score(model, X_pca, y_pca, cv=10).mean())

### Gaussian Mixture & Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.mixture import GaussianMixture

# Stack the training and test feature rows into one array so the mixture
# model below is fitted on all available (unlabelled) samples.
# NOTE: despite the name, np.r_ returns a NumPy array, not a DataFrame.
full_df = np.r_[train_df, test_df]
full_df.shape

In [None]:
## Gaussian Mixture Model ##
# Fit a mixture for every covariance type / component count on the stacked
# train+test features and keep the one with the lowest information criterion.
# NOTE: the variables are named "bic", but the score that is computed and
# compared is the AIC (gmm.aic). The names are kept so later code that may
# read them keeps working.
lowest_bic = np.inf  # np.infty was removed in NumPy 2.0
bic = []
n_components_range = range(1,7)
cv_types = ['spherical', 'tied', 'diag', 'full']
best_gmm = None
for cv_type in cv_types:
    for n_components in n_components_range:
        # Seed the (random) EM initialisation so the selected model is the
        # same on every run, matching the seeded classifiers in this file.
        gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type,
                              random_state=0)
        gmm.fit(full_df)
        bic.append(gmm.aic(full_df))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
print(best_gmm)

# best_gmm is already fitted on full_df. It is deliberately NOT re-fitted
# here: a second fit would discard the fit that was actually selected.
# Each sample's per-component posterior probabilities become its features.
gmm_train = best_gmm.predict_proba(train_df)
gmm_test = best_gmm.predict_proba(test_df)

In [None]:
# Tune SVM and KNN on the GMM posterior features with a 10-fold grid search
# scored by accuracy. Labels are flattened once and shared by both searches.
y_flat = y.values.ravel()

## SVM ##
from sklearn.svm import SVC
svc = SVC(kernel='rbf')

param_grid = [
    {'kernel': ['linear'], 'C': [1, 10, 100]},
    {'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': [0.0001, 0.001, 0.01, 0.1]},
]
grid_search_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv=10,
                               n_jobs=-1, scoring='accuracy')
grid_search_svc.fit(gmm_train, y_flat)
svc_clf = grid_search_svc.best_estimator_

print('SVM Best Score: ', grid_search_svc.best_score_)
print('SVM Best Params: ', grid_search_svc.best_params_)
svc_cv_scores = cross_val_score(svc_clf, gmm_train, y_flat, cv=10)
print('SVM: ', svc_cv_scores.mean())
print()

## KNeighborsClassifier ##
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

# Odd neighbour counts 3, 5, 7, 9.
n_neighbors = np.arange(3, 10, 2)
param_grid = {'n_neighbors': n_neighbors}
grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10,
                               n_jobs=-1, scoring='accuracy')
grid_search_knn.fit(gmm_train, y_flat)
knn_clf = grid_search_knn.best_estimator_

print('KNN Best Score: ', grid_search_knn.best_score_)
print('KNN Best Params: ',grid_search_knn.best_params_)
knn_cv_scores = cross_val_score(knn_clf, gmm_train, y_flat, cv=10)
print('KNN: ', knn_cv_scores.mean())

### Submission

In [None]:
# Fit the tuned KNN on all training rows and label the test set.
knn_clf.fit(gmm_train, y.values.ravel())
predict = knn_clf.predict(gmm_test)

# Ids are 1-based and sized from the predictions themselves rather than a
# hard-coded 9000, so the frame is built correctly for any test-set size.
predict_df = pd.DataFrame(predict, columns=['Solution'],
                          index=np.arange(1, len(predict) + 1))
predict_df.index.name = 'Id'
# Turn the Id index into a regular column: final columns are Id, Solution.
predict_df.reset_index(drop=False, inplace=True)

predict_df.head()

In [None]:
# Write the Id/Solution table to submission.csv; index=False keeps the
# DataFrame's positional index out of the file.
predict_df.to_csv('submission.csv', index=False)