In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing data and Python packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Single theme call: every sns.set(...) call resets the whole theme, so the
# earlier sns.set(style="white") was immediately overridden and had no effect.
sns.set(style="whitegrid", color_codes=True)

In [None]:
# Load the Social Network Ads dataset from the Kaggle input directory.
df = pd.read_csv(
    '/kaggle/input/cvdcvd-vd/Social_Network_Ads.csv'
)

# Data overview

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

# Exploratory Data Analysis

Exploration of gender

In [None]:
# Number of rows for each distinct Gender value.
df['Gender'].value_counts()

In [None]:
# Bar counts per gender, split by whether a purchase was made.
sns.countplot(data=df, x='Gender', hue='Purchased')
plt.show()

Exploration of Age

In [None]:
# Density of Age for buyers vs. non-buyers, drawn on one shared axis.
fig, ax = plt.subplots(figsize=(10, 6))
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
# Each curve carries its own `label`, so the legend cannot get out of sync
# with the plotting order; `.loc` avoids chained indexing on the frame.
sns.kdeplot(df.loc[df["Purchased"] == 1, "Age"], color="darkturquoise",
            fill=True, label="Purchased", ax=ax)
sns.kdeplot(df.loc[df["Purchased"] == 0, "Age"], color="lightcoral",
            fill=True, label="No purchase", ax=ax)
ax.legend()
ax.set(title='Density Plot of Age for people who made/did not make purchase',
       xlabel='Age')
plt.show()

Exploration of estimated salary

In [None]:
# Density of EstimatedSalary for buyers vs. non-buyers, drawn on one shared axis.
fig, ax = plt.subplots(figsize=(10, 6))
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
# Each curve carries its own `label`, so the legend cannot get out of sync
# with the plotting order; `.loc` avoids chained indexing on the frame.
sns.kdeplot(df.loc[df["Purchased"] == 1, "EstimatedSalary"], color="darkturquoise",
            fill=True, label="Purchased", ax=ax)
sns.kdeplot(df.loc[df["Purchased"] == 0, "EstimatedSalary"], color="lightcoral",
            fill=True, label="No purchase", ax=ax)
ax.legend()
ax.set(title='Density Plot of Estimated Salary for people who made/did not make purchase',
       xlabel='Estimated Salary')
plt.show()

Correlation map

In [None]:
# Pairwise correlation heatmap of the numeric columns.
figure = plt.figure(figsize=(10,8))
# 'Gender' is still a string column at this point. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# instead, so restrict the frame to numeric columns explicitly.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.show()

# Building model

In [None]:
#dropping unnecessary User ID column
# drop(..., errors='ignore') instead of `del` so that re-running this cell
# does not raise KeyError once the column is already gone.
df = df.drop(columns=['User ID'], errors='ignore')

In [None]:
#mapping gender
# Only map while the column still holds the string labels: running .map() a
# second time on the already-encoded 0/1 values would turn every entry into NaN.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})

In [None]:
#assigning variables: features are every column except the target
X = df.drop(columns=['Purchased'])
y = df['Purchased']

In [None]:
#dividing dataset into train and test data, then scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split BEFORE scaling: fitting the scaler on the full dataset leaks the test
# rows' mean/variance into training. The split arguments are unchanged
# (test_size=0.25, random_state=2), so the train/test partition is the same.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=2
)

# Fit the scaler on the training rows only, then apply it to both parts.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so the name X_scaled remains defined; it now uses training-set statistics.
X_scaled = scaler.transform(X)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline: KNN with default settings, scored by accuracy on the test split.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.score(X_test, y_test)

In [None]:
# Test-set misclassification rate for each n_neighbors from 1 to 20.
errors = []
for i in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    errors.append(np.mean(pred_i != y_test))

In [None]:
# Line plot of the error rate against the number of neighbours tried.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 21), errors, color='blue', marker='o',
        markerfacecolor='red', markersize=10)
ax.set_title('Error rate vs. number of neighbors')
ax.set_xlabel('n_neighbors')
ax.set_ylabel('Error rate')
plt.show()

# Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the KNN grid search.
# 'euclidean' is intentionally omitted: with the default p=2, 'minkowski' is
# the same distance, so listing both only repeated identical fits. 'minkowski'
# was listed first and therefore always won the tie, so the selected
# parameters are unaffected.
param_grid = { 'n_neighbors' : [9, 10, 11, 14, 16, 17],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski', 'manhattan', 'chebyshev']}

In [None]:
# 5-fold cross-validated grid search over param_grid for a KNN classifier.
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
)

In [None]:
# Run the grid search on the training split only.
gs.fit(X_train, y_train)

In [None]:
# Cross-validated score of the best parameter combination.
gs.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
gs.best_params_

Retraining model with best parameters found

In [None]:
# Build the final classifier from the grid-search result rather than from
# hand-copied values, so it stays in sync with gs.best_params_ if the data,
# the grid or the library version changes.
knn2 = KNeighborsClassifier(**gs.best_params_)
knn2.fit(X_train, y_train)
# Accuracy of the tuned model on the held-out test split.
knn2.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Predicted labels of the tuned model for the test split.
y_pred = knn2.predict(X_test)

In [None]:
# Confusion matrix of the tuned model on the test split.
# sklearn's convention: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label',
       title='Confusion matrix (test set)')
plt.show()