In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# in the order os.walk visits them, then print one path per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output 
# when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
# The two feature files are read with header=None, so their first row is treated
# as data and the columns receive integer labels.
full_train, full_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        '/kaggle/input/data-science-london-scikit-learn/train.csv',
        '/kaggle/input/data-science-london-scikit-learn/test.csv',
    )
)

# The label file is read as a single column that is given the name 'y'.
trainLabels = pd.read_csv(
    '/kaggle/input/data-science-london-scikit-learn/trainLabels.csv',
    names=['y'],
)

# 1. **Some EDA**

In [None]:
# Preview the first rows of the training features (columns carry integer labels
# because the CSV was read with header=None).
full_train.head()

In [None]:
# (rows, columns) of the training features, test features and training labels, in that order
tuple(frame.shape for frame in (full_train, full_test, trainLabels))

In [None]:
# Check for missing values: print the per-column summary of the training features,
# then a 50-character separator line, then the same summary for the test features.
full_train.info()
print('-'*50)
full_test.info()

In [None]:
# Check whether the classes are imbalanced: count how often each label occurs.
# pd.Series.value_counts is used instead of the top-level pd.value_counts, which
# is deprecated since pandas 2.1; the displayed table is the same.
trainLabels.apply(pd.Series.value_counts)

# 2. **Explore baseline models**

In [None]:
# Seed for the estimators that draw random numbers while fitting, so that the
# cross-validation scores below are identical after "Restart & Run All".
RANDOM_STATE = 42

# Candidate classifiers keyed by the short label used in the score tables and
# on the x-axis of the comparison plot. Apart from the seed, every model uses
# its scikit-learn defaults.
models = {
    'LR': LogisticRegression(),
    'LDA': LinearDiscriminantAnalysis(),
    'KNN': KNeighborsClassifier(),
    'DT': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'RF': RandomForestClassifier(random_state=RANDOM_STATE),
    'NB': GaussianNB(),
    'SVM': SVC(),
}

In [None]:
# 5-fold cross-validated accuracy of every untuned model on the raw features.
# The splitter is built once: it is constructed without shuffling arguments, so
# every model is scored on the same folds.
kfold = KFold(n_splits=5)

# Per-fold accuracy arrays, one per model, in the insertion order of `models`.
results = [
    cross_val_score(estimator, full_train, np.ravel(trainLabels), cv=kfold, scoring='accuracy')
    for estimator in models.values()
]

# Mean accuracy per model label, rounded to three decimals.
baseline_mean_acc = {
    label: round(fold_scores.mean(), 3)
    for label, fold_scores in zip(models, results)
}

baseline_mean_acc

# **3. Feature Engineering**

# Feature Scaling

Two approaches are shown below:

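- The StandardScaler works on each ***feature*** (column) independently: it subtracts the feature's mean and divides by its standard deviation, so that every feature ends up centred around 0 with a standard deviation of 1. It is most meaningful when the values within each feature are roughly normally distributed.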

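- The Normalizer works on each ***sample*** (row) independently: it divides every value in the row by the row's magnitude (its L2 norm by default) in n-dimensional space, where n is the number of features, so that every sample ends up with unit length.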

In [None]:
from sklearn.preprocessing import StandardScaler, Normalizer

# One transformer per scaling approach described above.
std, norm = StandardScaler(), Normalizer()

# Fit each transformer on the training features, then transform those same
# features; the results feed the cross-validation runs below.
std_train_data = std.fit(full_train).transform(full_train)
norm_train_data = norm.fit(full_train).transform(full_train)

In [None]:
from sklearn.pipeline import make_pipeline

# 5-fold cross-validated accuracy of every model on standardised features.
#
# The scaler is chained in front of each model and the pipeline is scored on the
# raw features. cross_val_score clones the pipeline for every fold, so the mean
# and standard deviation are learned from that fold's training rows only.
# Scoring a matrix that was standardised on all rows up front would leak
# statistics of the validation rows into training.
kfold = KFold(n_splits=5)
std_mean_acc = {}
results_std = []
for model in models:
    # `std` only serves as a template here; its fitted state is discarded by the clone.
    scaled_model = make_pipeline(std, models[model])
    cv_results_std = cross_val_score(scaled_model, full_train, np.ravel(trainLabels), cv=kfold, scoring='accuracy')
    results_std.append(cv_results_std)
    std_mean_acc[model] = round(cv_results_std.mean(), 3)

std_mean_acc

In [None]:
# 5-fold cross-validated accuracy of every model on the row-normalised features.
kfold = KFold(n_splits=5)

# Per-fold accuracy arrays, one per model, in the insertion order of `models`.
results_norm = [
    cross_val_score(estimator, norm_train_data, np.ravel(trainLabels), cv=kfold, scoring='accuracy')
    for estimator in models.values()
]

# Mean accuracy per model label, rounded to three decimals.
norm_mean_acc = {
    label: round(fold_scores.mean(), 3)
    for label, fold_scores in zip(models, results_norm)
}

norm_mean_acc

# **Principal Component Analysis (PCA)**

In [None]:
from sklearn.decomposition import PCA

# Project the training features onto their principal components.
# Per scikit-learn's PCA docs, a float between 0 and 1 as the first argument
# keeps just enough components to explain that fraction of the variance (85% here);
# whiten=True rescales the projected components to unit variance.
pca = PCA(0.85, whiten=True)
pca_train_data = pca.fit_transform(full_train)
# The second dimension of the shape is the number of components that were kept.
print(pca_train_data.shape,'\n')

# Fraction of the total variance explained by each kept component.
explained_variance = pca.explained_variance_ratio_ 
print(explained_variance)

In [None]:
from sklearn.pipeline import make_pipeline

# 5-fold cross-validated accuracy of every model on PCA-projected features.
#
# The PCA step is chained in front of each model and the pipeline is scored on
# the raw features. cross_val_score clones the pipeline for every fold, so the
# projection is learned from that fold's training rows only. Scoring a matrix
# that was projected with a PCA fitted on all rows would leak information about
# the validation rows into training.
kfold = KFold(n_splits=5)
pca_mean_acc = {}
results_pca = []
for model in models:
    # `pca` only supplies the hyper-parameters; its fitted state is discarded by the clone.
    pca_model = make_pipeline(pca, models[model])
    cv_results_pca = cross_val_score(pca_model, full_train, np.ravel(trainLabels), cv=kfold, scoring='accuracy')
    results_pca.append(cv_results_pca)
    pca_mean_acc[model] = round(cv_results_pca.mean(), 3)

pca_mean_acc

# **Visualisation**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Mean CV accuracy per model for each preprocessing approach, keyed by legend label.
accuracy_by_approach = {
    'Baseline': baseline_mean_acc,
    'Std_scale': std_mean_acc,
    'Norm_scale': norm_mean_acc,
    'PCA': pca_mean_acc,
}

plt.figure(figsize=[12,7])

# One line per approach; the x positions are the model indices.
for approach_label, mean_acc in accuracy_by_approach.items():
    plt.plot(range(len(mean_acc)), list(mean_acc.values()), label=approach_label)

plt.legend(loc='lower right')
plt.title('Approach comparison')
plt.xlabel('Models')
plt.ylabel('Accuracy')

# Label the x positions with the model names and use a fine 0.005 grid on the accuracy axis.
model_labels = list(baseline_mean_acc.keys())
plt.xticks(range(len(model_labels)), model_labels)
plt.yticks(np.arange(0.75, 0.93, 0.005))
plt.grid()
plt.show()

# Gaussian Mixture and Grid Search

Let's take the two algorithms above (**KNN and SVM**), which gave the highest accuracy, for further analysis.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.mixture import GaussianMixture

In [None]:
# Stack the training rows on top of the test rows into a single array.
X = np.concatenate((full_train.to_numpy(), full_test.to_numpy()), axis=0)
print('X shape :',X.shape)

In [None]:
# USING THE GAUSSIAN MIXTURE MODEL
# The Bayesian information criterion (BIC), like the AIC, can be used to select
# the number of components in a Gaussian Mixture in an efficient way.
# The candidate with the lowest BIC is kept.

lowest_bic = np.inf  # np.infty was removed in NumPy 2.0; np.inf is the supported spelling
bic = []
n_components_range = range(1, 10)

# The GaussianMixture comes with different options to constrain the covariance
# of the different classes estimated: spherical, diagonal, tied or full covariance.

cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # A fixed seed makes the fit, and therefore the BIC ranking, reproducible.
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type,
                              random_state=42)
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

# best_gmm is the very object that was fitted on X and scored above, so it is
# used as is. Refitting it would only repeat the work and, without a seed,
# could yield a different mixture from the one the BIC selected.
gmm_train = best_gmm.predict_proba(full_train)
gmm_test = best_gmm.predict_proba(full_test)

The `predict_proba` method takes in new data points and, for each one, predicts the probability that it came from each of the Gaussian distributions.

In [None]:
# KNN tuned by grid search on the GMM membership probabilities
knn = KNeighborsClassifier()

# Search space: odd neighbour counts 1, 3, 5, 7, 9 crossed with both weighting schemes.
param_grid_knn = dict(
    n_neighbors=range(1, 11, 2),
    weights=['uniform', 'distance'],
)

# 5-fold grid search scored by accuracy, using all available cores.
grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search_knn.fit(gmm_train, trainLabels.values.ravel())

knn_best = grid_search_knn.best_estimator_

print('KNN Best Score', grid_search_knn.best_score_)
print('KNN Best Params',grid_search_knn.best_params_)
# Re-score the selected estimator with a fresh 5-fold cross-validation.
knn_cv_scores = cross_val_score(knn_best, gmm_train, trainLabels.values.ravel(), cv=5)
print('KNN Accuracy',knn_cv_scores.mean())

In [None]:
#SVM tuned by grid search on the GMM membership probabilities
svc = SVC()

#USING GRID SEARCH
# One sub-grid per kernel: gamma is a parameter of the RBF kernel and is ignored
# by the linear kernel. Crossing it with 'linear' would fit the same linear model
# once per gamma value, which adds 20 duplicate candidates.
param_grid_svm = [
    {'kernel': ['linear'],
     'C': [1, 10, 100, 1000]},
    {'kernel': ['rbf'],
     'C': [1, 10, 100, 1000],
     'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
]

# 5-fold grid search scored by accuracy, using all available cores.
grid_search_svm = GridSearchCV(estimator=svc,
                               param_grid=param_grid_svm,
                               cv = 5, n_jobs=-1,
                               scoring='accuracy').fit(gmm_train, trainLabels.values.ravel())

svm_best = grid_search_svm.best_estimator_

print('SVM Best Score',grid_search_svm.best_score_)
print('SVM Best Params',grid_search_svm.best_params_)
# Re-score the selected estimator with a fresh 5-fold cross-validation.
print('SVM Accuracy',cross_val_score(svm_best,gmm_train, trainLabels.values.ravel(), cv=5).mean())

# **4. Submission**

In [None]:
# Fit the tuned SVM on all labelled rows (GMM features), then label the test rows.
# fit() returns the estimator itself, so the two steps can be chained.
pred = svm_best.fit(gmm_train, np.ravel(trainLabels)).predict(gmm_test)

In [None]:
# Assemble the submission table in one step: a 1-based row Id followed by the
# predicted label, then write it out without the DataFrame index.
submission = pd.DataFrame({
    'Id': np.arange(1, len(pred) + 1),
    'Solution': pred,
})
submission.to_csv('submission_with_GMM.csv', index=False)