In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition files. header=None stops pandas from consuming the first
# row as column names, so the columns get integer labels 0..n-1.
# The trailing shape notes come from the original author -- TODO confirm them
# against the loaded frames.
train = pd.read_csv('../input/data-science-london-scikit-learn/train.csv', header=None)  # (999, 40)
train_labels = pd.read_csv('../input/data-science-london-scikit-learn/trainLabels.csv', header=None)  # (999, 1)
test = pd.read_csv('../input/data-science-london-scikit-learn/test.csv', header=None)  # (8999, 40)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

# Flatten the single-column label frame into a 1-D array, the target shape
# scikit-learn estimators expect.
train_labels = np.ravel(train_labels)

# Hold out part of the labelled data (train_test_split's default test size) for a
# quick sanity check. The fixed random_state makes the split -- and every number
# derived from it below -- reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(train, train_labels, random_state=42)

In [None]:
# Show the container type of the split labels (rendered as the cell output).
type(y_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline model: 3-nearest-neighbours on the unscaled features.
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)

# score() predicts on X_test internally and returns the mean accuracy, so the
# earlier stand-alone predict() call (whose result was thrown away) is dropped.
clf.score(X_test, y_test)

In [None]:
# Candidate neighbourhood sizes and the number of cross-validation folds.
neighbors = np.arange(1, 20)
kfold = 10

# Accuracy curves (one entry per k) and the best model seen so far.
train_acc, val_acc = [], []
bestKnn, bestAcc = None, 0.0

for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_acc.append(knn.score(X_train, y_train))

    # cross_val_score fits clones of the estimator, so `knn` itself stays fitted
    # on the X_train split only.
    fold_scores = cross_val_score(knn, train, train_labels, cv=kfold)
    nominee = fold_scores.mean()
    val_acc.append(nominee)

    # Strict '>' keeps the smallest k when several values tie.
    if nominee > bestAcc:
        bestAcc, bestKnn = nominee, knn


In [None]:
import matplotlib.pyplot as plt

# Plot training vs. cross-validated accuracy for every k tried in the sweep.
fig, ax = plt.subplots(figsize=(13, 8))
ax.plot(neighbors, val_acc, label='Validation Accuracy')
ax.plot(neighbors, train_acc, label='Training Accuracy')
ax.legend()
ax.set_title('K values VS Accuracy')  # fixed typo: was 'Accuarcy'
ax.set_xlabel('Number of Neighbors')
ax.set_ylabel('Accuracy')
ax.set_xticks(neighbors)
plt.show()

# Report the winner of the sweep.
print("Best Accuracy without feature scaling: ", bestAcc)
print(bestKnn)

In [None]:
# Switch matplotlib to the 'ggplot' style sheet for the figures drawn after this cell.
plt.style.use('ggplot')

In [None]:
# Replace NaN / infinite entries in the test features with finite numbers.
test_fill = np.nan_to_num(test)

# The sweep leaves bestKnn fitted on the X_train split only. Refit it on the
# complete labelled set before predicting, matching what the scaled-feature
# submission cell further down does with its own best model.
bestKnn.fit(train, train_labels)

submission = pd.DataFrame(bestKnn.predict(test_fill))
print(submission.shape)

In [None]:
# Arrange the predictions into the two columns written to the submission file:
# a 1-based 'Id' followed by the predicted 'Solution'.
row_ids = np.arange(1, len(submission) + 1)
submission = (
    submission
    .set_axis(['Solution'], axis=1)
    .assign(Id=row_ids)
    .loc[:, ['Id', 'Solution']]
)
submission.head()

In [None]:
from subprocess import check_output

# Write the submission without the DataFrame index, then list the working
# directory so the new file shows up in the cell output.
submission.to_csv('submission_with_copy.csv', index=False)
working_listing = check_output(['ls', '../working'])
print(working_listing.decode('utf8'))

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer

# Three alternative preprocessors, each fitted on the full training features:
#   std  - per-feature standardisation (StandardScaler)
#   mms  - per-feature min-max rescaling (MinMaxScaler)
#   norm - per-sample normalisation (Normalizer)
std, mms, norm = StandardScaler(), MinMaxScaler(), Normalizer()

# fit(...).transform(...) is what fit_transform does for these transformers.
X_std = std.fit(train).transform(train)
X_mms = mms.fit(train).transform(train)
X_norm = norm.fit(train).transform(train)

In [None]:
# Scaled training matrices keyed by the short name of the scaler that produced them.
# Insertion order (std, mms, norm) is the order in which candidates are compared.
scaled_features = {'std': X_std, 'mms': X_mms, 'norm': X_norm}

val_acc = {name: [] for name in scaled_features}
bestScaling, bestKnn, bestAcc = None, None, 0.0

for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)

    for name, features in scaled_features.items():
        # Mean accuracy over `kfold` folds for this (k, scaler) combination.
        score = cross_val_score(knn, features, train_labels, cv=kfold).mean()
        val_acc[name].append(score)

        # Strict '>' means the earliest combination wins a tie.
        if score > bestAcc:
            bestAcc, bestKnn, bestScaling = score, knn, name

In [None]:
# Compare the cross-validated accuracy of every k under each of the three scalers.
plt.figure(figsize=(13, 8))
plt.plot(neighbors, val_acc['std'], label='Cross Validation Accuracy with Standard Scaler')
plt.plot(neighbors, val_acc['mms'], label='Cross Validation Accuracy with Min Max Scaler')  # fixed typo: was 'Scalser'
plt.plot(neighbors, val_acc['norm'], label='Cross Validation Accuracy with Normalizer')
plt.legend()
plt.title('Find best K')
plt.xlabel('# of neighbors')
plt.ylabel('Accuracy')
plt.xticks(neighbors)
plt.show()

# Report the winning (k, scaler) combination from the sweep.
print('Best Accuracy with feature scaling: ', bestAcc)
print('Best KNN Classifier: ', bestKnn)
print('Best Scaler: ', bestScaling)

In [None]:
# Refit the selected classifier on the full training set using the scaling that
# actually won the sweep (bestScaling), and transform the test set with that same
# already-fitted scaler. The Normalizer used to be hard-coded here, which silently
# mismatched the model whenever another scaler scored best.
scaled_train_by_name = {'std': X_std, 'mms': X_mms, 'norm': X_norm}
scaler_by_name = {'std': std, 'mms': mms, 'norm': norm}

# Fall back to the previous hard-coded choice if the sweep recorded no winner.
chosen_scaling = bestScaling if bestScaling is not None else 'norm'

bestKnn.fit(scaled_train_by_name[chosen_scaling], train_labels)
submission = pd.DataFrame(
    bestKnn.predict(scaler_by_name[chosen_scaling].transform(test_fill))
)
print(submission.shape)

In [None]:
# Arrange the predictions into the two columns written to the submission file:
# a 1-based 'Id' followed by the predicted 'Solution'.
row_ids = np.arange(1, len(submission) + 1)
submission = (
    submission
    .set_axis(['Solution'], axis=1)
    .assign(Id=row_ids)
    .loc[:, ['Id', 'Solution']]
)
submission.head()

In [None]:
# Write the scaled-feature submission without the DataFrame index, then list the
# working directory so the new file shows up in the cell output.
submission.to_csv('submission_with_normalize.csv', index=False)
working_listing = check_output(['ls', '../working'])
print(working_listing.decode('utf8'))

In [None]:
import seaborn as sns

In [None]:
# Pairwise correlation between the standardised features, drawn as an annotated heatmap.
feature_corr = pd.DataFrame(X_std).corr()

f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(feature_corr, annot=True, linewidths=.5, fmt='.1f', ax=ax)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

# 70/30 hold-out split of the standardised features. The fixed random_state keeps
# the split -- and the random-forest scores computed from it -- reproducible
# under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X_std, train_labels, test_size=.3, random_state=42
)

In [None]:
# Random forest with default hyper-parameters; fit() returns the estimator itself,
# so construction and training can be chained.
clf_rf = RandomForestClassifier().fit(X_train, y_train)

In [None]:
# Hold-out accuracy of the random forest.
rf_holdout_pred = clf_rf.predict(X_test)
ac = accuracy_score(y_test, rf_holdout_pred)
print("Accuracy: ", ac)

In [None]:
# Confusion matrix of the random forest on the hold-out split, shown as integer counts.
rf_holdout_pred = clf_rf.predict(X_test)
cm = confusion_matrix(y_test, rf_holdout_pred)
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV

In [None]:
# Recursive feature elimination with a linear SVC, repeated for several inner
# fold counts. For each run the selected feature subset is re-scored with a
# fresh 10-fold cross-validation.
kfold = 10
bestSVC = None       # fitted SVC (on the selected features) of the best run
bestSupport = None   # boolean feature mask belonging to bestSVC
bestAcc = 0.0
val_acc = []
cv_range = np.arange(5, 11)
n_features = []

for cv in cv_range:
    svc = SVC(kernel='linear')
    rfecv = RFECV(estimator=svc, step=1, cv=cv, scoring='accuracy')
    rfecv.fit(X_std, train_labels)

    # Keep only the columns RFECV selected and score them with `kfold` folds.
    selected_features = X_std[:, rfecv.support_]
    val_acc.append(np.mean(cross_val_score(svc, selected_features,
                                           train_labels, cv=kfold)))

    n_features.append(rfecv.n_features_)
    if val_acc[-1] > bestAcc:
        bestAcc = val_acc[-1]
        # bestSVC used to stay None forever; record the winning model together
        # with the feature mask needed to apply it.
        bestSVC = rfecv.estimator_
        bestSupport = rfecv.support_

In [None]:
# Accuracy of the RFECV-selected feature subsets, one point per inner fold count.
plt.figure(figsize=(13, 8))
plt.plot(cv_range, val_acc, label="CV Accuracy")

# Annotate each point with the number of features that run kept.
for fold_count, accuracy, kept in zip(cv_range, val_acc, n_features):
    plt.annotate(str(kept), xy=(fold_count, accuracy))

plt.legend()
plt.title('Cross Validation Accuracy')
plt.xlabel('K Fold')
plt.ylabel('Accuracy')
plt.show()

print("Best Accuracy with feature scaling and RFECV: ", bestAcc)

Summary

In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import VotingClassifier

# Start the summary pipeline from the raw frames again, as plain NumPy arrays;
# the labels are flattened to one dimension.
X_train = np.asarray(train)
y_train = np.asarray(train_labels).ravel()
X_test = np.asarray(test)

for caption, array in (
    ("Training Data Shape: ", X_train),
    ("Training Target Shape: ", y_train),
    ("Testing Data Shape: ", X_test),
):
    print(caption, array.shape)

In [None]:
# Stack the training rows on top of the test rows (row-wise concatenation).
X_all = np.concatenate((X_train, X_test), axis=0)
print("Whole Data Shape: ", X_all.shape)

In [None]:
from sklearn.mixture import GaussianMixture

# Select a Gaussian mixture over all rows (train + test) by trying every
# covariance type with 1..6 components and keeping the lowest criterion value.
# The mixture's posterior probabilities then replace the raw features.
GMM_SEED = 42  # fixed seed so the selected mixture is reproducible

lowest_bic = np.inf  # np.infty was removed in NumPy 2.0
bic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']

for cv_type in cv_types:
    for n_components in n_components_range:
        gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type,
                              random_state=GMM_SEED)
        gmm.fit(X_all)
        # NOTE(review): despite the `bic` / `lowest_bic` names, the criterion
        # recorded here is AIC (gmm.aic). Left unchanged on purpose -- confirm
        # which criterion is intended before switching to gmm.bic.
        bic.append(gmm.aic(X_all))

        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

# best_gmm is already fitted on X_all. It is deliberately NOT refitted: a second
# fit restarts EM and, without a seed, could yield a different mixture from the
# one whose criterion value was just selected.
#
# The features are taken from X_all (training rows first, then test rows) rather
# than from X_train / X_test, so re-running this cell does not feed the mixture
# its own output.
n_train_rows = len(y_train)
X_train = best_gmm.predict_proba(X_all[:n_train_rows])
X_test = best_gmm.predict_proba(X_all[n_train_rows:])

knn = KNeighborsClassifier()
rf = RandomForestClassifier()

# Empty grid: GridSearchCV evaluates exactly one candidate (the estimator's
# default hyper-parameters) with 10-fold cross-validation.
param_grid = dict()

# "Best score" now reports best_score_, the mean cross-validated accuracy of the
# selected candidate. It used to print best_estimator_.score(X_train, y_train),
# i.e. accuracy on the very data the estimator was refitted on.
grid_search_knn = GridSearchCV(knn, param_grid=param_grid, cv=10, scoring='accuracy').fit(X_train, y_train)
print("Best estimator KNN: ", grid_search_knn.best_estimator_,
      "Best score: ", grid_search_knn.best_score_)
knn_best = grid_search_knn.best_estimator_

grid_search_rf = GridSearchCV(rf, param_grid=dict(), verbose=3, cv=10, scoring='accuracy').fit(X_train, y_train)
print("Best estimator RandomForest: ", grid_search_rf.best_estimator_,
      "Best score: ", grid_search_rf.best_score_)
rf_best = grid_search_rf.best_estimator_

In [None]:
# Fit both selected models on the full (GMM-transformed) training set.
knn_best.fit(X_train, y_train)
rf_best.fit(X_train, y_train)

# Report the mean 10-fold accuracy for both models. The random forest used to be
# summarised with .max() (its single best fold), which is optimistic and not
# comparable with the KNN mean.
print("Score for KNN: ", cross_val_score(knn_best, X_train, y_train, cv=10, scoring='accuracy').mean())
print("Score for Random Forest: ", cross_val_score(rf_best, X_train, y_train, cv=10, scoring='accuracy').mean())

In [None]:
# Predict the test set with the selected KNN model and write the final submission:
# a 1-based 'Id' column followed by the predicted 'Solution'.
test_predictions = knn_best.predict(X_test)
n_predictions = len(test_predictions)

knn_best_pred = pd.DataFrame(
    {
        'Id': np.arange(1, n_predictions + 1),
        'Solution': test_predictions,
    },
    index=pd.RangeIndex(1, n_predictions + 1),  # index starts at 1, as before
)
knn_best_pred.to_csv('submission.csv', index=False)

In [None]:
# List the working directory so the written submission files show up in the output.
working_listing = check_output(['ls', '../working'])
print(working_listing.decode('utf8'))