# Homework # 3 - K-Nearest Neighbor
Data file: social_network_ads.csv

### Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### Load data

In [2]:
# Read CSV into dataframe
! ls -l 'social_network_ads.csv'
! head 'social_network_ads.csv'

-rw-r--r--@ 1 sahitikovvuri  staff  10926 Oct 31 16:24 social_network_ads.csv
User ID,Gender,Age,EstimatedSalary,Purchased
15624510,Male,19,19000,0
15810944,Male,35,20000,0
15668575,Female,26,43000,0
15603246,Female,27,57000,0
15804002,Male,19,76000,0
15728773,Male,27,58000,0
15598044,Female,27,84000,0
15694829,Female,32,150000,1
15600575,Male,25,33000,0


In [3]:
df = pd.read_csv("social_network_ads.csv", sep=',')

In [4]:
# Display first few rows
df.head(6)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
5,15728773,Male,27,58000,0


### Drop unnecessary columns 'User ID' and 'Gender'

In [5]:
df.drop(['User ID'], axis = 1, inplace = True)
df.drop(['Gender'], axis = 1, inplace = True)

In [6]:
df.head(6)

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0
5,27,58000,0


### Explore data

In [7]:
# Display distribution (min, max) of values for Age and EstimatedSalary variables
print('The maximum values for the Age and EstimatedSalary columns are:\n' , df.max())
print('The minimum values for the Age and EstimatedSalary columns are:\n' , df.min())

The maximum values for the Age and EstimatedSalary columns are:
 Age                    60
EstimatedSalary    150000
Purchased               1
dtype: int64
The minimum values for the Age and EstimatedSalary columns are:
 Age                   18
EstimatedSalary    15000
Purchased              0
dtype: int64


#### Note the scale for column EstimatedSalary is much higher than for column Age

### Separate independent and dependent variables
* Independent variables: All except Purchased
* Dependent variable: Purchased

In [8]:
# Prepare data for training and testing
x = df.iloc[:,0:2]
x.head(6)

Unnamed: 0,Age,EstimatedSalary
0,19,19000
1,35,20000
2,26,43000
3,27,57000
4,19,76000
5,27,58000


In [9]:
y = df.iloc[:,2]
y.head(6)

0    0
1    0
2    0
3    0
4    0
5    0
Name: Purchased, dtype: int64

### Split data into training and test sets

In [10]:
# Split into training (70%) and test data (30%)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

### Scale Age and EstimatedSalary variables

In [11]:
# Use StandardScaler
scaler = StandardScaler()
scaler_transform = scaler.fit_transform(df)
print(scaler_transform)

[[-1.78179743 -1.49004624 -0.74593581]
 [-0.25358736 -1.46068138 -0.74593581]
 [-1.11320552 -0.78528968 -0.74593581]
 ...
 [ 1.17910958 -1.46068138  1.34059793]
 [-0.15807423 -1.07893824 -0.74593581]
 [ 1.08359645 -0.99084367  1.34059793]]


In [12]:
# Display distribution (min, max) of scaled values for Age and EstimatedSalary variables
print('The maximum values for the Age and EstimatedSalary scaled value columns are:\n' , scaler_transform.max())
print('The minimum values for the Age and EstimatedSalary scaled value columns are:\n' , scaler_transform.min())

The maximum values for the Age and EstimatedSalary scaled value columns are:
 2.3567499772898386
The minimum values for the Age and EstimatedSalary scaled value columns are:
 -1.8773105578331641


### Train KNeighborsClassifier (with default hyperparameters)

In [13]:
# Defaults: n_neighbors=5, weights='uniform' metric='Euclidean'
knn = KNeighborsClassifier(n_neighbors = 5, weights = 'uniform', metric = "euclidean")
knn.fit(x_train, y_train)

KNeighborsClassifier(metric='euclidean')

### Evaulate model performance

In [14]:
# Predict using the Test set results
y_predict = knn.predict(x_test)
y_predict

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0])

In [15]:
# Display the Confusion Matrix
cf_matrix = confusion_matrix(y_test, y_predict)
print(cf_matrix)

[[68 11]
 [11 30]]


In [16]:
# Generate and print model performance metrics (accuracy) on test set
from sklearn.metrics import accuracy_score

y_pred=knn.predict(x_test)
print("The accuracy of test set is", accuracy_score(y_test, y_pred) * 100)

The accuracy of test set is 81.66666666666667


In [17]:
# Generate classification report to evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86        79
           1       0.73      0.73      0.73        41

    accuracy                           0.82       120
   macro avg       0.80      0.80      0.80       120
weighted avg       0.82      0.82      0.82       120



### Train KNeighborsClassifier (change hyperparameter: n_neighbors)

In [18]:
# Defaults: n_neighbors=5, weights='uniform' metric='Euclidean'
knn = KNeighborsClassifier(n_neighbors = 9, weights = 'uniform', metric = "euclidean")
knn.fit(x_train, y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=9)

### Evaulate model performance

In [19]:
# Predict using the Test set results
y_pred = knn.predict(x_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0])

In [20]:
# Display the Confusion Matrix
cf_matrix = confusion_matrix(y_test, y_pred)
print(cf_matrix)

[[73  6]
 [12 29]]


In [21]:
# Generate and print model performance metrics (accuracy) on test set
y_pred = knn.predict(x_test)
print("The accuracy of the test set is", accuracy_score(y_test, y_pred) * 100)

The accuracy of the test set is 85.0


In [22]:
# Generate classification report to evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89        79
           1       0.83      0.71      0.76        41

    accuracy                           0.85       120
   macro avg       0.84      0.82      0.83       120
weighted avg       0.85      0.85      0.85       120

