## Coding Exercise #0310

### 1. Classification with KNN:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics, preprocessing
warnings.filterwarnings(action='ignore')                  # Turn off the warnings.
%matplotlib inline

#### 1.1. Read in data:

The data and explanation can be found [here](https://www.kaggle.com/c/titanic/data) (requires sign in).

In [None]:
# Go to the directory where the data file is located. 
# os.chdir(r'~~')                # Please, replace the path with your own.   

In [None]:
# Load the Titanic data set; pandas infers the header row by default.
titanic_csv = '75. data_titanic.csv'
df = pd.read_csv(titanic_csv)

In [None]:
# (rows, columns) of the data frame as loaded.
df.shape

In [None]:
# Show the first three rows.
df.head(3)

#### 1.2. Missing value processing: 

In [None]:
# Count the missing entries in each column.
df.isna().sum()

In [None]:
# Fill the missing values in the Age variable.
# A name containing 'Mr' (this also matches 'Mrs') is treated as an adult and
# gets 30; every other passenger with a missing age is likely a child and gets 10.
# Boolean masks are used instead of a positional loop so the fill works for any
# row index, not only the default 0..n-1 one.
age_missing = df['Age'].isna()
# na=False: a missing name cannot match, so such a row falls into the "child" branch.
adult_title = df['Name'].str.contains('Mr', regex=False, na=False)
df.loc[age_missing & adult_title, 'Age'] = 30
df.loc[age_missing & ~adult_title, 'Age'] = 10

In [None]:
# Discard the columns that will not be used as features.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']
df = df.drop(unused_columns, axis=1)
df.head(3)

In [None]:
# Remove every row that still contains a missing value.
df = df.dropna()

In [None]:
# (rows, columns) after dropping the unused columns and the rows with missing values.
df.shape

#### 1.3. Exploratory data analysis:

In [None]:
# Bar chart of the number of passengers in each Survived class.
sns.countplot(data=df, x='Survived')
plt.show()

In [None]:
# Survival rate per age quartile.
df['AgeCategory'] = pd.qcut(df['Age'], q=4)     # Four quantile-based intervals.
sns.barplot(data=df, x='AgeCategory', y='Survived', ci=None)
plt.show()

In [None]:
# Mean of Survived for each SibSp value (no error bars).
sns.barplot(data=df, x='SibSp', y='Survived', ci=None)
plt.show()

In [None]:
# Mean of Survived for each Parch value (no error bars).
sns.barplot(data=df, x='Parch', y='Survived', ci=None)
plt.show()

In [None]:
# Mean of Survived for each Pclass value (no error bars).
sns.barplot(data=df, x='Pclass', y='Survived', ci=None)
plt.show()

In [None]:
# Mean of Survived for each Embarked value (no error bars).
sns.barplot(data=df, x='Embarked', y='Survived', ci=None)
plt.show()

In [None]:
# Mean of Survived for each Sex value (no error bars).
sns.barplot(data=df, x='Sex', y='Survived', ci=None)
plt.show()

#### 1.4. Feature engineering:

In [None]:
# Replace each categorical variable with its dummy columns (first level dropped).
# The age bins are encoded under the 'Age' prefix, and both the raw Age column
# and the AgeCategory column are removed.
age_dummies = pd.get_dummies(df.AgeCategory, drop_first=True, prefix='Age')
df = age_dummies.join(df.drop(columns=['Age', 'AgeCategory']))

# The remaining variables use their own name as prefix; the newest dummy
# columns are placed in front of the existing ones.
for column in ['Pclass', 'SibSp', 'Parch', 'Sex', 'Embarked']:
    dummies = pd.get_dummies(df[column], drop_first=True, prefix=column)
    df = dummies.join(df.drop(columns=[column]))
df.head(5)

In [None]:
# Save to an external file.
# df.to_csv('data_titanic_2.csv',index=False)

#### 1.5. KNN train and test:

In [None]:
# Separate the target (Survived) from the predictors.
Y = df['Survived']
X = df.drop('Survived', axis=1)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=1234,
    test_size=0.3,
)

In [None]:
# Print the shape of each piece of the split, in the order it was returned.
for part in (X_train, X_test, Y_train, Y_test):
    print(part.shape)

In [None]:
# Q: Instantiate, fit and predict a KNN with n_neighbours = 5 (https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
knn5 = ??
knn5.??
Y_pred = ??
print(metrics.confusion_matrix(Y_test,Y_pred))
print("------------------------")

#Q: Print the accuracy_score (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
print( "Accuracy : " + str(??)

In [None]:
# Instantiate, fit and predict a KNN with n_neighbours = 100
knn100 = ??
knn100.??
Y_pred = ??
print(metrics.confusion_matrix(Y_test,Y_pred))
print("------------------------")

#Q: Print the accuracy_score (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
print( "Accuracy : " + str(??)

#### 1.6. KNN bias-variance tradeoff as a function of *k*:

In [None]:
# Q: Instantiate, fit and predict a KNN with n_neighbors = k for every k in k_grid,
# and record the accuracy on the test set for each k.

accs = []
k_grid = range(1,100,1)
for k in k_grid:
    knn = ??
    knn.??
    Y_pred = ??
    accs.append(metrics.accuracy_score(Y_test,Y_pred))

In [None]:
# Scatter plot of the test accuracy obtained for each k.
plt.scatter(k_grid, accs, s=10, c='red', marker='o', alpha=0.6)
plt.title('Accuracy vs k')
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.show()

#### 1.7. KNN hyperparameter optimization:

In [None]:
# Q: Generate a parameter grid for k in [1,51] (https://numpy.org/doc/stable/reference/generated/numpy.arange.html)
k_grid = ??
parameters = {'n_neighbors':k_grid}

In [None]:
# Q: Optimize k (GridSearchCV with KNeighborsClassifier as the estimator, parameters as the grid, cv = 10 and n_jobs = -1 to use all the CPU cores)
gridCV = ??
gridCV.fit(X_train, Y_train)
best_k = gridCV.best_params_['n_neighbors']
print("Best k : " + str(best_k))

In [None]:
# Q: Instantiate, fit and predict with the best k.
KNN_best = ??
KNN_best.??
Y_pred = ??
print( "Best Accuracy : " + str(np.round(metrics.accuracy_score(Y_test,Y_pred),3)))