## Importing the required libraries


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

## Load dataset and understanding it by seeing the data

In [2]:

# Read the feature table and the label column. `header=0` together with
# `names` replaces each file's own header row with the column names used below.
df = pd.read_csv(
    "../input/ml-assignment-2/features.csv",
    names=["Height", "Weight"],
    header=0,
    low_memory=False,
)
target = pd.read_csv(
    "../input/ml-assignment-2/target.csv",
    names=["Gender"],
    header=0,
    low_memory=False,
)

# Plain-text preview of the first rows of each frame.
for frame in (df, target):
    print(frame.head())


## Checking for duplicate rows


In [3]:
# True when at least one feature row is an exact repeat of an earlier row.
bool(df.duplicated().any())

## Mapping the target variables, for Male as 1 and Female as 0.

### And checking the replacement done.

In [4]:
# Encode the label column numerically: Male -> 1, Female -> 0.
mapping_dict = {"Gender": dict(Male=1, Female=0)}
target.replace(mapping_dict, inplace=True)

# Print the column dtypes after the replacement, then preview the labels.
print(target.dtypes)
target.head()

## Checking the outliers present in the dataset by using Box and Whiskers Plot

In [5]:
# Box-and-whisker plot of the columns of df; rot=90 rotates the x tick labels.
df.boxplot(rot=90)

In [6]:
# Height-vs-Weight scatter with one colour/marker pair per encoded class.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Scatter Plot', fontsize=20)
ax.set_xlabel('Height', fontsize=15)
ax.set_ylabel('Weight', fontsize=15)

y_values = [1, 0]
colors = ['b', 'y']
markers = ['x', 'o']
for idx, t in enumerate(y_values):
    # Boolean mask selecting the rows whose label equals the current class.
    indicesToKeep = target['Gender'] == t
    subset = df.loc[indicesToKeep, ['Height', 'Weight']]
    ax.scatter(subset['Height'], subset['Weight'], c=colors[idx], s=50, marker=markers[idx])

# Legend entries are the raw class codes, in plotting order.
ax.legend(y_values)
ax.grid()

## Split our dataset into training and testing dataset in the ratio of 80-20 respectively.

In [7]:
# Hold out 20% of the rows for testing; random_state=1 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    df, target, test_size=0.2, random_state=1
)

# Sizes of the four splits, followed by a peek at the held-out rows.
print(*(len(part) for part in (x_train, x_test, y_train, y_test)))
print(x_test.head(), y_test.head())

## Check for class imbalance i.e. we have data belonging to both the classes in training and testing dataset

In [8]:
# Class-balance pies for the train and test label splits.
# value_counts() orders its result by frequency, not by class code, so a
# hard-coded ["Female", "Male"] label list can end up attached to the wrong
# slices. Derive each slice's label from the class code it actually holds.
class_names = {0: "Female", 1: "Male"}
fig, axes = plt.subplots(1, 2, figsize=(14, 7))
for ax, (split_name, labels_df) in zip(axes, [("Train", y_train), ("Test", y_test)]):
    counts = labels_df["Gender"].value_counts().sort_index()
    # Fall back to the raw code if an unexpected class value shows up.
    slice_labels = [class_names.get(code, code) for code in counts.index]
    counts.plot(kind='pie', ax=ax, autopct='%1.2f%%', labels=slice_labels)
    ax.set_title(f"{split_name} split")
plt.legend()
plt.show()

## As I can see from the plots, both the classes are split in appropriate proportions. So our model will not become biased after training.

## Question 1 A

In [9]:
# Baseline decision tree using the library's default hyperparameters.
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

# fit() hands back the estimator itself, so both names refer to one object.
fittedModel = model
fittedModel

In [10]:
# Predict on the held-out split and draw the confusion matrix as a heatmap.
predict = fittedModel.predict(x_test)

sns.set(font_scale=1.5)
fig, ax = plt.subplots(figsize=(4, 4))
# fmt="d" prints the cell counts as plain integers.
sns.heatmap(confusion_matrix(y_test, predict), annot=True, fmt="d", cbar=True, ax=ax)
# confusion_matrix(y_true, y_pred) puts true classes on the rows and predicted
# classes on the columns, so the heatmap's y-axis is "true" and its x-axis is
# "predicted" (the labels were previously swapped).
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.show()

## Printing the required evaluation metrics

In [11]:
# Evaluation metrics for the baseline tree on the held-out split.
print("Confusion Matrix : \n",confusion_matrix(y_test, predict))

scalar_metrics = (
    ("Accuracy Score : ", accuracy_score),
    ("Precision Score : ", precision_score),
    ("Recall Score : ", recall_score),
    ("F1 Score : ", f1_score),
)
for caption, metric in scalar_metrics:
    print(caption, metric(y_test, predict))

# Per-class report; names are listed in class-code order (0, 1).
target_names = ['Female', 'Male']
report = classification_report(y_test, predict, target_names=target_names)
print(report)

## As we can see from the evaluation metrics, the model has an accuracy of 100% even without hyperparameter tuning. Since this is a small and easy dataset, we were able to get a 100% accuracy. But this need not be true for complex datasets.

In [12]:
# Draw the fitted baseline tree and recolour its connecting arrows.
fig = plt.figure(figsize=(10, 10))
out = tree.plot_tree(
    fittedModel,
    feature_names=["Height", "Weight"],
    class_names=["Female", "Male"],
    filled=True,
)

# Entries whose arrow_patch is None are skipped.
arrows = [node.arrow_patch for node in out if node.arrow_patch is not None]
for arrow in arrows:
    arrow.set_edgecolor('yellow')
out

## This is the obtained Decision tree.
### GridSearchCV is used to tune the hyperparameters.
### 'min_impurity_decrease' is an efficient parameter to prevent overfitting of the model.



## Question 1 B

In [13]:
# DecisionTreeClassifier hyperparameters to search exhaustively
# (the estimator passed to GridSearchCV below is a decision tree, not a random forest).
grid_params = {
    "max_depth": [None, 1, 2, 4, 8],
    "min_samples_split": list(np.arange(2, 70, 10)),
    "min_samples_leaf": np.arange(1, 20, 2),
    "min_impurity_decrease": [0, 0.1, 0.2],
    "max_features": [1, 2],
}

# No `cv` or `scoring` is given, so GridSearchCV uses its library defaults.
grid_model = GridSearchCV(DecisionTreeClassifier(), param_grid=grid_params, verbose=True)
grid_model.fit(x_train, y_train)

In [14]:
# Display the hyperparameter combination selected by the grid search.
grid_model.best_params_

In [15]:
# Reuse the estimator that GridSearchCV already refit on the full training
# split with the winning hyperparameters (refit=True is the default and is not
# overridden where grid_model is built). Retraining a fresh, unseeded tree by
# copying each parameter by hand is redundant, has to be kept in sync with the
# grid, and, because max_features is part of the grid, may not reproduce the
# tree the search actually selected.
model = grid_model.best_estimator_
fittedModel = model


In [16]:
# Predict with the tuned tree, plot its confusion matrix and print the metrics.
predict = fittedModel.predict(x_test)

sns.set(font_scale=1.5)
fig, ax = plt.subplots(figsize=(4, 4))
# fmt="d" prints the cell counts as plain integers.
sns.heatmap(confusion_matrix(y_test, predict), annot=True, fmt="d", cbar=True, ax=ax)
# confusion_matrix(y_true, y_pred) puts true classes on the rows and predicted
# classes on the columns, so the heatmap's y-axis is "true" and its x-axis is
# "predicted" (the labels were previously swapped).
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")

# Accuracy, precision, recall and F1, in that order.
print(accuracy_score(y_test, predict))
print(precision_score(y_test, predict))
print(recall_score(y_test, predict))
print(f1_score(y_test, predict))
plt.show()

In [17]:
# Per-class precision/recall/F1 for the tuned tree; names follow class-code order (0, 1).
target_names = ['Female', 'Male']
report = classification_report(y_test, predict, target_names=target_names)
print(report)

## Here we got the same accuracy for both the trees. 
### But, for new complex realtime data, this hyperparameter tuned decision tree may perform better

In [18]:
# Draw the tuned tree and recolour its connecting arrows.
fig = plt.figure(figsize=(10, 10))
out = tree.plot_tree(
    fittedModel,
    feature_names=["Height", "Weight"],
    class_names=["Female", "Male"],
    filled=True,
)

# Entries whose arrow_patch is None are skipped.
for arrow in (node.arrow_patch for node in out):
    if arrow is None:
        continue
    arrow.set_edgecolor('yellow')


### Reshape y_train and y_test to a one dimensional numpy array to suit KNN input format (avoid warnings)

In [19]:
# Flatten the training labels to a 1-D NumPy array.
# np.asarray accepts both the original DataFrame and an already-flattened
# array, so re-running this cell is safe; `.values` raised AttributeError on
# the second run because an ndarray has no such attribute.
y_train = np.asarray(y_train).ravel()
y_train.shape

In [20]:
# Flatten the test labels to a 1-D NumPy array.
# np.asarray accepts both the original DataFrame and an already-flattened
# array, so re-running this cell is safe; `.values` raised AttributeError on
# the second run because an ndarray has no such attribute.
y_test = np.asarray(y_test).ravel()
y_test.shape

## Implementation of KNN
### 1. Import the necessary class from sklearn
### 2. Initialise model
### 3. Fit model on training dataset

## Question 2 A

In [21]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline KNN (k = 5) trained on the unscaled features.
# fit() returns the estimator, so the chained call keeps the fitted model.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
knn_model


## Make predictions on test split

In [22]:
# Predict class labels for the held-out features with the current knn_model.
y_pred = knn_model.predict(x_test)

## Evaluating the model-confusion matrix and classification report

In [23]:
# Confusion matrix first, then the per-class precision/recall/F1 report.
for summary in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(summary)

## Selecting best K value (number of nearest neighbors)

In [24]:
# Misclassification rate on the test split for every k from 1 to 59,
# training each model on the unscaled features.
errors = [
    np.mean(
        KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train).predict(x_test) != y_test
    )
    for k in range(1, 60)
]

In [25]:
# Print the recorded error for each k; errors[0] belongs to k = 1.
for k, err in enumerate(errors, start=1):
    print('Error is',err,', When k is',k)


In [26]:
# Error-vs-k curve for the models trained on the unscaled features.
line_style = dict(
    color='green',
    linestyle='dashed',
    marker='o',
    markerfacecolor='yellow',
    markersize=10,
)
plt.figure(figsize=(16, 8))
plt.plot(range(1, 60), errors, **line_style)
plt.title('Mean error vs K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')

## Creating a new model with n_neighbors parameter as 25

In [27]:
# Retrain KNN on the unscaled features with k = 25 and report on the test split.
knn_model = KNeighborsClassifier(n_neighbors=25).fit(x_train, y_train)
y_pred = knn_model.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

## Confusion matrix: 
### 9 labels that are predicted as 0 but true label is 1

In [28]:
# Confusion-matrix heatmap for the current KNN predictions in y_pred.
sns.set(font_scale=1.5)
fig, ax = plt.subplots(figsize=(4, 4))
# fmt="d" prints the cell counts as plain integers.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cbar=True, ax=ax)
# confusion_matrix(y_true, y_pred) puts true classes on the rows and predicted
# classes on the columns, so the heatmap's y-axis is "true" and its x-axis is
# "predicted" (the labels were previously swapped).
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.show()

## Visualising the dataset (data points under each target class) before normalisation

In [29]:
# Attach the encoded label to the features by index, then plot each class.
df_out = pd.merge(
    df,
    target['Gender'],
    how='right',
    left_index=True,
    right_index=True,
)

plt.figure(figsize=(10, 6))
# Plot class 1 first and class 0 second so the legend order below matches.
for code, colour in ((1, "yellow"), (0, "red")):
    is_class = df_out.Gender == code
    plt.scatter(df_out.Height[is_class], df_out.Weight[is_class], c=colour)
plt.title("Visualize dataset before normalisation for males and females")
plt.xlabel("Height")
plt.ylabel("Weight")
plt.legend(["Male", "Female"]);

In [30]:
# Preview the first training rows (still on the original, unscaled values).
x_train.head()

## Min-max normalisation
## Question 2 B
### Use the same scaler that was fitted on the train split to transform the test split

In [31]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply the
# same fitted scaler to both splits.
scaler = MinMaxScaler().fit(x_train)
scaled_train = scaler.transform(x_train)
scaled_test = scaler.transform(x_test)

In [32]:
# First five rows of the scaled training array.
scaled_train[:5]

## Visualising data points after normalisation

In [33]:
# Scatter of the min-max scaled training features
# (column 0 vs column 1 of scaled_train).
fig, ax = plt.subplots()
x = scaled_train[:, 0]
y = scaled_train[:, 1]
ax.scatter(x, y, color='lightgreen')
# Title typo fixed ("Visualze" -> "Visualize").
ax.set_title("Visualize min-max normalised data")
ax.set_xlabel("Height")
ax.set_ylabel("Weight")
plt.show()


## Finding the optimal K value by plotting the error vs K value, this time using the normalised dataset to train the model

In [34]:
# Misclassification rate on the scaled test split for every k from 1 to 59,
# training each model on the scaled training features.
errors = [
    np.mean(
        KNeighborsClassifier(n_neighbors=k).fit(scaled_train, y_train).predict(scaled_test) != y_test
    )
    for k in range(1, 60)
]

In [35]:
# Error-vs-k curve for the models trained on the scaled features.
line_style = dict(
    color='green',
    linestyle='dashed',
    marker='o',
    markerfacecolor='yellow',
    markersize=10,
)
plt.figure(figsize=(16, 8))
plt.plot(range(1, 60), errors, **line_style)
plt.title('Error vs K')
plt.xlabel('K Value')
plt.ylabel('Average Error')

In [36]:
# Print the recorded error for each k; errors[0] belongs to k = 1.
for k, err in enumerate(errors, start=1):
    print('Error is',err,', When k is',k)

### K=5 is the optimal value as obtained from the above graph. 
### Now I will create a new model with n_neighbors tuned to 5

In [37]:
# KNN with k = 5 trained on the min-max scaled features.
# fit() returns the estimator, so the chained call keeps the fitted model.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(scaled_train, y_train)
knn_model

## Prediction on the test dataset

In [38]:
# Predict on the scaled test features; the input must be scaled the same way
# as the data knn_model was fitted on.
y_pred = knn_model.predict(scaled_test)

## Classification Report and confusion matrix of the predicted values

In [39]:
# Per-class precision/recall/F1 for the current predictions in y_pred.
print(classification_report(y_test, y_pred))

In [40]:
# Confusion-matrix heatmap for the current KNN predictions in y_pred.
sns.set(font_scale=1.5)
fig, ax = plt.subplots(figsize=(4, 4))
# fmt="d" prints the cell counts as plain integers.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cbar=True, ax=ax)
# confusion_matrix(y_true, y_pred) puts true classes on the rows and predicted
# classes on the columns, so the heatmap's y-axis is "true" and its x-axis is
# "predicted" (the labels were previously swapped).
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.show()

### Again Using gridsearch to find the best parameters (like for n_neighbors)

In [41]:
# 5-fold grid search over k = 5, 10, ..., 55 on the scaled training features.
params = dict(n_neighbors=np.arange(5, 60, 5))
gs = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=params,
    cv=5,
    verbose=True,
)
gs.fit(scaled_train, y_train)

## Got best value for n_neighbors as 5 which matches with the same value obtained from the K vs error plot above

In [42]:
# Display the n_neighbors value selected by the grid search.
gs.best_params_

## Accuracy of 100% on the test split data is achieved

In [43]:
# Score the grid search's chosen model on the scaled test split. No `scoring`
# argument was passed when gs was built, so this uses the default scorer.
gs.score(scaled_test, y_test)

## ROC curves and AUC
## Question 2 C

In [44]:
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2.
# Fall back to RocCurveDisplay.from_estimator, which accepts the same
# (estimator, X, y) arguments, so the cells below run on either version.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay

    plot_roc_curve = RocCurveDisplay.from_estimator

In [45]:
# ROC curve for KNN with k = 5.
# The curve is evaluated on scaled_test, so the model must be trained on the
# identically scaled training features. It was previously fitted on the raw
# x_train, which mixed feature scales between fit and evaluation.
model_diff_neighbors = KNeighborsClassifier(n_neighbors=5)
model_diff_neighbors.fit(scaled_train, y_train)
plot_roc_curve(model_diff_neighbors, scaled_test, y_test)

## Got a slightly bent ROC curve (AUC not exactly one - rounded off in the plot) when using a different number of neighbours (n_neighbors=9), since the best hyperparameter value for n_neighbors is 5

In [46]:
# ROC curve for KNN with k = 9, trained and evaluated on the scaled features.
model_diff_neighbors = KNeighborsClassifier(n_neighbors=9).fit(scaled_train, y_train)
plot_roc_curve(model_diff_neighbors, scaled_test, y_test)

## Perfect ROC curve
### Perfect plot with well tuned hyperparameter(n_neighbors=5) and fitted on normalised data. AUC is one, hence this is the most optimum solution

In [47]:
# ROC curve for the fitted grid-search object on the scaled test split.
plot_roc_curve(gs, scaled_test, y_test)