In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
import warnings 
warnings.simplefilter("ignore")

In [None]:
# Read the three competition CSVs with header=None, so pandas assigns
# integer column labels instead of consuming the first row as a header.
trainLabels_df, train_df, test_df = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "/kaggle/input/data-science-london-scikit-learn/trainLabels.csv",
        "/kaggle/input/data-science-london-scikit-learn/train.csv",
        "/kaggle/input/data-science-london-scikit-learn/test.csv",
    )
)


In [None]:
# Report the (rows, columns) of each loaded frame.
for caption, frame in (
    ("trainLabels shape: ", trainLabels_df),
    ("train shape: ", train_df),
    ("test shape: ", test_df),
):
    print(caption, frame.shape)

In [None]:
train_df.head()

In [None]:
trainLabels_df.head()

In [None]:
test_df.head()

In [None]:
train_df.info()

In [None]:
train_df.isnull().sum()

In [None]:
train_df.describe()

In [None]:
trainLabels_df.isnull().sum()

In [None]:
# Name the single label column, then attach it to the features by joining
# the two frames on their row index.
trainLabels_df.columns = ["label"]
m_train_df = train_df.merge(trainLabels_df, left_index=True, right_index=True)
m_train_df.head(5)

In [None]:
plt.figure(figsize=(25,25))
sns.heatmap(m_train_df.corr(), annot=True)

In [None]:
train_df.hist(figsize=(20,20))

In [None]:
m_train_df

In [None]:
# Box plot of every feature split by class label.
# The grid is sized from the number of features instead of a hard-coded
# 10x4, so a different feature count neither raises IndexError nor leaves
# stray empty panels. For 40 features the layout is the same as before.
int_cols = train_df.columns
n_panel_cols = 4
n_panel_rows = max(1, -(-len(int_cols) // n_panel_cols))  # ceiling division
fig, ax = plt.subplots(
    nrows=n_panel_rows,
    ncols=n_panel_cols,
    figsize=(40, 2 * n_panel_rows),
    constrained_layout=True,
    squeeze=False,  # always get a 2-D array so flatten() works for any size
)
fig.suptitle('Feature distribution by label', size=20, weight='bold')
ax = ax.flatten()
for panel, feature in zip(ax, int_cols):
    sns.boxplot(data=m_train_df, y=feature, x='label', ax=panel)
    for side in ['left', 'right', 'top', 'bottom']:
        panel.spines[side].set_visible(False)

# Hide panels left over when the feature count is not a multiple of the row width.
for spare_panel in ax[len(int_cols):]:
    spare_panel.set_visible(False)

In [None]:
# Class balance: count rows per label and show each share as a pie slice.
values_freq = m_train_df["label"].value_counts()
print(values_freq)
pie_ax = values_freq.plot.pie(autopct='%1.1f%%')
pie_ax.set_title('values_freq')
pie_ax.axis('equal')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(train_df.values)

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_train, trainLabels_df.values, test_size=0.1, shuffle=True)

In [None]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

def print_score(y_pred, y_real):
    """Print accuracy, averaged precision/recall/F-score and the confusion matrix.

    Note the argument order: predictions first, ground truth second.

    Args:
        y_pred: Predicted labels.
        y_real: Ground-truth labels.

    Returns:
        None. All results are written to stdout.
    """
    print("Accuracy: ", accuracy_score(y_real, y_pred))

    # One stanza per averaging mode; the header names the mode actually used
    # (previously every header started with "Macro", whatever the mode).
    for average in ("macro", "micro", "weighted"):
        print()
        print(f"precision_recall_fscore_support ({average} average)")
        print(precision_recall_fscore_support(y_real, y_pred, average=average))

    print()
    print("Confusion Matrix")
    print(confusion_matrix(y_real, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

def get_trained_grid(model, grid_params, x_train, y_train ,refit=True, cv=10, verbose=1,
                     scoring=None, n_jobs=None):
    """Run a cross-validated grid search and return the fitted search object.

    Args:
        model: Unfitted estimator to tune.
        grid_params: Parameter grid (a dict, or a list of dicts) passed to GridSearchCV.
        x_train: Training features.
        y_train: Training labels; a single-column 2-D array is flattened to 1-D.
        refit: Forwarded to GridSearchCV.
        cv: Forwarded to GridSearchCV.
        verbose: Forwarded to GridSearchCV.
        scoring: Forwarded to GridSearchCV (None keeps the estimator's default scorer).
        n_jobs: Forwarded to GridSearchCV (None keeps the default parallelism).

    Returns:
        The fitted GridSearchCV instance.
    """
    # An (n, 1) label column makes many scikit-learn estimators emit a
    # DataConversionWarning on every fit; hand them a flat vector instead.
    y_array = np.asarray(y_train)
    if y_array.ndim == 2 and y_array.shape[1] == 1:
        y_train = y_array.ravel()

    grid = GridSearchCV(
        model,
        grid_params,
        refit=refit,
        cv=cv,
        verbose=verbose,
        scoring=scoring,
        n_jobs=n_jobs,
    )
    grid.fit(x_train, y_train)
    return grid

In [None]:
def get_grid_best_params(grid):
    """Print the best parameter set and the best estimator of a fitted search.

    `grid` must expose `best_params_` and `best_estimator_`. Nothing is returned.
    """
    for attribute in ("best_params_", "best_estimator_"):
        print(getattr(grid, attribute))

In [None]:
def print_grid_performance(grid, x_test, y_test):
    """Predict on `x_test` with `grid` and print the scores against `y_test`.

    `grid` can be any fitted object with a `predict` method.
    """
    print_score(y_pred=grid.predict(x_test), y_real=y_test)

In [None]:
from sklearn.linear_model import SGDClassifier

%time
grid_params = { "loss": ["hinge", "log", "modified_huber"],
               "penalty": ["l1", "l2", "elasticnet"]   
}

grid = get_trained_grid(SGDClassifier(), grid_params, x_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, x_test, y_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

%time
grid_params = { "n_neighbors": np.arange(1,50)}

grid = get_trained_grid(KNeighborsClassifier(), grid_params, x_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, x_test, y_test)

In [None]:
from sklearn.naive_bayes import GaussianNB

%time
grid_params = { "var_smoothing": [1e-09] }

grid = get_trained_grid(GaussianNB(), grid_params, x_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, x_test, y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

%time
grid_params = {'criterion': ["gini", "entropy"], 
              'splitter': ['best', 'random'], 
              'max_depth': [3,4,None], 
              'min_samples_split':[2, 4, 6],
              'min_samples_leaf':[1,2,3]}

grid = get_trained_grid(DecisionTreeClassifier(), grid_params, x_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, x_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

%time
grid_params = {'n_estimators': [10, 20, 50], 
              'max_features': ['auto', 'sqrt', 'log2'], 
              'bootstrap': [True, False], 
              'criterion':['entropy', 'gini']}

grid = get_trained_grid(RandomForestClassifier(), grid_params, x_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, x_test, y_test)

In [None]:
from sklearn import svm

%time
grid_params = { "kernel": ["linear", "poly", "rbf", "sigmoid"],
               "degree": [1, 2 ,3, 4, 5, 6] }

grid = get_trained_grid(svm.SVC(), grid_params, x_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, x_test, y_test)

In [None]:
from sklearn.naive_bayes import BernoulliNB

%time
grid_params = {'alpha': [0.25, 0.5, 1]}

grid = get_trained_grid(BernoulliNB(), grid_params, x_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, x_test, y_test)

In [None]:
from xgboost import XGBClassifier

%time
grid_params = {'learning_rate': [0.01, 0.05, 0.1], 
              'eval_metric': ['error']}

grid = get_trained_grid(XGBClassifier(), grid_params, x_train, y_train)
get_grid_best_params(grid)

In [None]:
print_grid_performance(grid, x_test, y_test)

### Best model: Random Forest, with accuracy between 88% and 90%
#### Best parameters: {'bootstrap': False, 'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 50}

In [None]:
from sklearn.ensemble import RandomForestClassifier

%time
clr = RandomForestClassifier(bootstrap=False, criterion="gini", max_features="log2", n_estimators=50)
clr.fit(x_train, y_train)

In [None]:
print_grid_performance(clr, x_train, y_train)

In [None]:
print_grid_performance(clr, x_test, y_test)

In [None]:
x_test = test_df.values
x_test = scaler.transform(x_test)
x_test[:5]

In [None]:
submit = pd.DataFrame(clr.predict(x_test))
submission = pd.DataFrame(submit)
submission.columns = ['Solution']
submission['Id'] = np.arange(1,submission.shape[0]+1)
submission = submission[['Id','Solution']]

In [None]:
submission

In [None]:
submission.to_csv('submission.csv', index=False)
print(submission.shape)