<a href="https://colab.research.google.com/github/sauravkokane/Data-Science-Training/blob/master/K_Nearest_Neighbours.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set(style="darkgrid", color_codes=True, rc={'figure.figsize':(10,6)})
%matplotlib inline

In [19]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/Datasets/Regression/Student_Performance.csv')

In [4]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Target
0,7,99,Yes,9,1,91
1,4,82,No,4,2,65
2,8,51,Yes,7,2,45
3,5,52,Yes,5,2,36
4,7,75,No,8,5,66


In [5]:
df.columns

Index(['Hours Studied', 'Previous Scores', 'Extracurricular Activities',
       'Sleep Hours', 'Sample Question Papers Practiced', 'Target'],
      dtype='object')

In [7]:
df.shape

(10000, 6)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   Hours Studied                     10000 non-null  int64
 1   Previous Scores                   10000 non-null  int64
 2   Extracurricular Activities        10000 non-null  int64
 3   Sleep Hours                       10000 non-null  int64
 4   Sample Question Papers Practiced  10000 non-null  int64
 5   Target                            10000 non-null  int64
dtypes: int64(6)
memory usage: 468.9 KB


In [10]:
print("Value counts of Categorical variables")

for col in df.columns:
  if df[col].nunique() < 10:
    print(df[col].value_counts())
    print()

Value counts of Categorical variables
Hours Studied
1    1152
6    1133
7    1129
3    1119
9    1115
5    1094
8    1088
4    1085
2    1085
Name: count, dtype: int64

Extracurricular Activities
No     5052
Yes    4948
Name: count, dtype: int64

Sleep Hours
8    1804
7    1676
6    1673
9    1622
4    1619
5    1606
Name: count, dtype: int64



In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['Extracurricular Activities'] = le.fit_transform(df['Extracurricular Activities'])

In [12]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Target
0,7,99,1,9,1,91
1,4,82,0,4,2,65
2,8,51,1,7,2,45
3,5,52,1,5,2,36
4,7,75,0,8,5,66


In [15]:
X = df.drop('Target', axis=1)
y = df['Target']

In [16]:
X.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
0,7,99,1,9,1
1,4,82,0,4,2
2,8,51,1,7,2
3,5,52,1,5,2
4,7,75,0,8,5


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8000, 5), (2000, 5), (8000,), (2000,))

In [20]:

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)

In [21]:
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

In [22]:
mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)

(4.0865849999999995, 5.977620000000001)

In [23]:
r2_score(y_train, y_train_pred)

0.9889154200131887

In [24]:
r2_score(y_test, y_test_pred)

0.9838697785153357

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
knn = KNeighborsRegressor()

In [30]:
params = {
    'n_neighbors': list(range(1, 21)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
    'p': [1, 2],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
}

In [28]:
grid_search = GridSearchCV(knn, params, cv=5, scoring='neg_mean_squared_error',
                           n_jobs=-1)
grid_search.fit(X_train, y_train)

In [29]:
grid_search.best_params_

{'algorithm': 'brute',
 'leaf_size': 10,
 'metric': 'minkowski',
 'n_neighbors': 11,
 'p': 2,
 'weights': 'uniform'}

In [31]:
knn = KNeighborsRegressor(algorithm='brute', leaf_size=10, metric='minkowski',
                          n_neighbors=11, p=2, weights='uniform')
knn.fit(X_train, y_train)

In [32]:
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

In [33]:
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

In [34]:
print("Mean Squared Error (MSE) on Training Data:", mse_train)
print("Mean Squared Error (MSE) on Testing Data:", mse_test)

Mean Squared Error (MSE) on Training Data: 4.594677685950414
Mean Squared Error (MSE) on Testing Data: 5.37286776859504


In [35]:
print("R-squared (R2) Score on Training Data:", r2_score(y_train, y_train_pred))
print("R-squared (R2) Score on Testing Data:", r2_score(y_test, y_test_pred))

R-squared (R2) Score on Training Data: 0.9875372536424583
R-squared (R2) Score on Testing Data: 0.98550166335176
