In [37]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [7]:
train_df = pd.read_csv(r'C:\Users\admin\Downloads\Train.csv')
train_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [15]:
train_df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [24]:
train_df['Item_Weight'] = train_df['Item_Weight'].fillna(train_df['Item_Weight'].median(), inplace = True)

In [26]:
train_df['Outlet_Size'].value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [27]:
Out_size_df = pd.get_dummies(train_df['Outlet_Size'], drop_first = True, prefix = 'Outlet_size')

In [31]:
train_df.drop(['Outlet_Size'], axis = 1, inplace = True)

In [33]:
train_df = pd.concat([Out_size_df,train_df],axis = 1)

In [52]:
train_df.tail()

Unnamed: 0,Outlet_size_Medium,Outlet_size_Small,Item_Identifier,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
8518,0,0,370,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,Tier 3,Supermarket Type1,2778.3834
8519,0,0,897,Regular,0.046982,Baking Goods,108.157,OUT045,2002,Tier 2,Supermarket Type1,549.285
8520,0,1,1357,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Tier 2,Supermarket Type1,1193.1136
8521,1,0,681,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Tier 3,Supermarket Type2,1845.5976
8522,0,1,50,Low Fat,0.044878,Soft Drinks,75.467,OUT046,1997,Tier 1,Supermarket Type1,765.67


In [36]:
train_df['Item_Identifier'].nunique()

1559

In [41]:
label_enco = LabelEncoder()

In [42]:
train_df['Item_Identifier'] = label_enco.fit_transform(train_df['Item_Identifier'] )

In [44]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Outlet_size_Medium         8523 non-null   uint8  
 1   Outlet_size_Small          8523 non-null   uint8  
 2   Item_Identifier            8523 non-null   int32  
 3   Item_Weight                0 non-null      object 
 4   Item_Fat_Content           8523 non-null   object 
 5   Item_Visibility            8523 non-null   float64
 6   Item_Type                  8523 non-null   object 
 7   Item_MRP                   8523 non-null   float64
 8   Outlet_Identifier          8523 non-null   object 
 9   Outlet_Establishment_Year  8523 non-null   int64  
 10  Outlet_Location_Type       8523 non-null   object 
 11  Outlet_Type                8523 non-null   object 
 12  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(3), int32(1), int64(1), object(6), ui

In [47]:
train_df.drop(['Item_Weight'], axis = 1, inplace=True)

In [53]:
train_df['Item_Fat_Content'].replace({'Low Fat': 0, 'Regular': 1, 'low fat': 0, 'LF': 0, 'reg':1}, inplace = True)

In [55]:
train_df['Item_Type'].unique()

array(['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables',
       'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
       'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
       'Breads', 'Starchy Foods', 'Others', 'Seafood'], dtype=object)

In [56]:
Item_type_df = pd.get_dummies(train_df['Item_Type'], drop_first= True, prefix = 'Item_type')
Item_type_df

Unnamed: 0,Item_type_Breads,Item_type_Breakfast,Item_type_Canned,Item_type_Dairy,Item_type_Frozen Foods,Item_type_Fruits and Vegetables,Item_type_Hard Drinks,Item_type_Health and Hygiene,Item_type_Household,Item_type_Meat,Item_type_Others,Item_type_Seafood,Item_type_Snack Foods,Item_type_Soft Drinks,Item_type_Starchy Foods
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
8519,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8520,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8521,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [57]:
train_df = pd.concat([train_df,Item_type_df], axis = 1)
train_df.head()

Unnamed: 0,Outlet_size_Medium,Outlet_size_Small,Item_Identifier,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Location_Type,...,Item_type_Fruits and Vegetables,Item_type_Hard Drinks,Item_type_Health and Hygiene,Item_type_Household,Item_type_Meat,Item_type_Others,Item_type_Seafood,Item_type_Snack Foods,Item_type_Soft Drinks,Item_type_Starchy Foods
0,1,0,156,0,0.016047,Dairy,249.8092,OUT049,1999,Tier 1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,8,1,0.019278,Soft Drinks,48.2692,OUT018,2009,Tier 3,...,0,0,0,0,0,0,0,0,1,0
2,1,0,662,0,0.01676,Meat,141.618,OUT049,1999,Tier 1,...,0,0,0,0,1,0,0,0,0,0
3,0,0,1121,1,0.0,Fruits and Vegetables,182.095,OUT010,1998,Tier 3,...,1,0,0,0,0,0,0,0,0,0
4,0,0,1297,0,0.0,Household,53.8614,OUT013,1987,Tier 3,...,0,0,0,1,0,0,0,0,0,0


In [58]:
train_df.drop(['Item_Type'], axis = 1, inplace=True)

In [59]:
train_df.head()

Unnamed: 0,Outlet_size_Medium,Outlet_size_Small,Item_Identifier,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,...,Item_type_Fruits and Vegetables,Item_type_Hard Drinks,Item_type_Health and Hygiene,Item_type_Household,Item_type_Meat,Item_type_Others,Item_type_Seafood,Item_type_Snack Foods,Item_type_Soft Drinks,Item_type_Starchy Foods
0,1,0,156,0,0.016047,249.8092,OUT049,1999,Tier 1,Supermarket Type1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,8,1,0.019278,48.2692,OUT018,2009,Tier 3,Supermarket Type2,...,0,0,0,0,0,0,0,0,1,0
2,1,0,662,0,0.01676,141.618,OUT049,1999,Tier 1,Supermarket Type1,...,0,0,0,0,1,0,0,0,0,0
3,0,0,1121,1,0.0,182.095,OUT010,1998,Tier 3,Grocery Store,...,1,0,0,0,0,0,0,0,0,0
4,0,0,1297,0,0.0,53.8614,OUT013,1987,Tier 3,Supermarket Type1,...,0,0,0,1,0,0,0,0,0,0


In [61]:
Out_df = pd.get_dummies(train_df['Outlet_Identifier'], drop_first=True, prefix = 'Outlet_Identifier')
Out_df

Unnamed: 0,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
8518,1,0,0,0,0,0,0,0,0
8519,0,0,0,0,0,0,1,0,0
8520,0,0,0,0,0,1,0,0,0
8521,0,0,1,0,0,0,0,0,0


In [62]:
train_df.drop(['Outlet_Identifier'], axis = 1, inplace=True)

In [63]:
train_df

Unnamed: 0,Outlet_size_Medium,Outlet_size_Small,Item_Identifier,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,...,Item_type_Fruits and Vegetables,Item_type_Hard Drinks,Item_type_Health and Hygiene,Item_type_Household,Item_type_Meat,Item_type_Others,Item_type_Seafood,Item_type_Snack Foods,Item_type_Soft Drinks,Item_type_Starchy Foods
0,1,0,156,0,0.016047,249.8092,1999,Tier 1,Supermarket Type1,3735.138,...,0,0,0,0,0,0,0,0,0,0
1,1,0,8,1,0.019278,48.2692,2009,Tier 3,Supermarket Type2,443.4228,...,0,0,0,0,0,0,0,0,1,0
2,1,0,662,0,0.01676,141.618,1999,Tier 1,Supermarket Type1,2097.27,...,0,0,0,0,1,0,0,0,0,0
3,0,0,1121,1,0.0,182.095,1998,Tier 3,Grocery Store,732.38,...,1,0,0,0,0,0,0,0,0,0
4,0,0,1297,0,0.0,53.8614,1987,Tier 3,Supermarket Type1,994.7052,...,0,0,0,1,0,0,0,0,0,0


In [64]:
train_df = pd.concat([train_df,Out_df], axis = 1)
train_df.head()

Unnamed: 0,Outlet_size_Medium,Outlet_size_Small,Item_Identifier,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,...,Item_type_Starchy Foods,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,1,0,156,0,0.016047,249.8092,1999,Tier 1,Supermarket Type1,3735.138,...,0,0,0,0,0,0,0,0,0,1
1,1,0,8,1,0.019278,48.2692,2009,Tier 3,Supermarket Type2,443.4228,...,0,0,0,1,0,0,0,0,0,0
2,1,0,662,0,0.01676,141.618,1999,Tier 1,Supermarket Type1,2097.27,...,0,0,0,0,0,0,0,0,0,1
3,0,0,1121,1,0.0,182.095,1998,Tier 3,Grocery Store,732.38,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1297,0,0.0,53.8614,1987,Tier 3,Supermarket Type1,994.7052,...,0,1,0,0,0,0,0,0,0,0


In [66]:
train_df['Outlet_Location_Type'].replace({'Tier 1':0, 'Tier 2':1, 'Tier 3':2}, inplace = True)

In [67]:
train_df.head()

Unnamed: 0,Outlet_size_Medium,Outlet_size_Small,Item_Identifier,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,...,Item_type_Starchy Foods,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,1,0,156,0,0.016047,249.8092,1999,0,Supermarket Type1,3735.138,...,0,0,0,0,0,0,0,0,0,1
1,1,0,8,1,0.019278,48.2692,2009,2,Supermarket Type2,443.4228,...,0,0,0,1,0,0,0,0,0,0
2,1,0,662,0,0.01676,141.618,1999,0,Supermarket Type1,2097.27,...,0,0,0,0,0,0,0,0,0,1
3,0,0,1121,1,0.0,182.095,1998,2,Grocery Store,732.38,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1297,0,0.0,53.8614,1987,2,Supermarket Type1,994.7052,...,0,1,0,0,0,0,0,0,0,0


In [69]:
train_df['Outlet_Type'] = label_enco.fit_transform(train_df['Outlet_Type'])

In [71]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 34 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Outlet_size_Medium               8523 non-null   uint8  
 1   Outlet_size_Small                8523 non-null   uint8  
 2   Item_Identifier                  8523 non-null   int32  
 3   Item_Fat_Content                 8523 non-null   int64  
 4   Item_Visibility                  8523 non-null   float64
 5   Item_MRP                         8523 non-null   float64
 6   Outlet_Establishment_Year        8523 non-null   int64  
 7   Outlet_Location_Type             8523 non-null   int64  
 8   Outlet_Type                      8523 non-null   int32  
 9   Item_Outlet_Sales                8523 non-null   float64
 10  Item_type_Breads                 8523 non-null   uint8  
 11  Item_type_Breakfast              8523 non-null   uint8  
 12  Item_type_Canned    

In [74]:
x = train_df.drop(['Item_Outlet_Sales'], axis = 1)
y = train_df['Item_Outlet_Sales']

## Scalling

### Min max scaller

In [73]:
train_df.columns

Index(['Outlet_size_Medium', 'Outlet_size_Small', 'Item_Identifier',
       'Item_Fat_Content', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Outlet_Location_Type', 'Outlet_Type',
       'Item_Outlet_Sales', 'Item_type_Breads', 'Item_type_Breakfast',
       'Item_type_Canned', 'Item_type_Dairy', 'Item_type_Frozen Foods',
       'Item_type_Fruits and Vegetables', 'Item_type_Hard Drinks',
       'Item_type_Health and Hygiene', 'Item_type_Household', 'Item_type_Meat',
       'Item_type_Others', 'Item_type_Seafood', 'Item_type_Snack Foods',
       'Item_type_Soft Drinks', 'Item_type_Starchy Foods',
       'Outlet_Identifier_OUT013', 'Outlet_Identifier_OUT017',
       'Outlet_Identifier_OUT018', 'Outlet_Identifier_OUT019',
       'Outlet_Identifier_OUT027', 'Outlet_Identifier_OUT035',
       'Outlet_Identifier_OUT045', 'Outlet_Identifier_OUT046',
       'Outlet_Identifier_OUT049'],
      dtype='object')

In [77]:
normal_scaller = MinMaxScaler()
x_nor_df = normal_scaller.fit_transform(x)
x_nor_df

x_df = pd.DataFrame(x_nor_df, columns = x.columns)
x_df.head()

Unnamed: 0,Outlet_size_Medium,Outlet_size_Small,Item_Identifier,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Item_type_Breads,...,Item_type_Starchy Foods,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,1.0,0.0,0.100128,0.0,0.048866,0.927507,0.583333,0.0,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.005135,1.0,0.058705,0.072068,1.0,1.0,0.666667,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.424904,0.0,0.051037,0.468288,0.583333,0.0,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.719512,1.0,0.0,0.640093,0.541667,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.832478,0.0,0.0,0.095805,0.083333,1.0,0.333333,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Standard scaller

In [79]:
standard_scaller = StandardScaler()
x_stand_scaller = standard_scaller.fit_transform(x)
x_stand_scaller

x_stand_df = pd.DataFrame(x_stand_scaller, columns = x.columns)
x_stand_df

Unnamed: 0,Outlet_size_Medium,Outlet_size_Small,Item_Identifier,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Item_type_Breads,...,Item_type_Starchy Foods,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,1.432326,-0.623893,-1.388514,-0.738147,-0.970732,1.747454,0.139541,-1.369334,-0.252658,-0.174193,...,-0.132935,-0.350395,-0.349128,-0.349551,-0.256985,-0.351028,-0.349973,-0.349762,-0.349973,2.857362
1,1.432326,-0.623893,-1.717991,1.354743,-0.908111,-1.489023,1.334103,1.091569,1.002972,-0.174193,...,-0.132935,-0.350395,-0.349128,2.860816,-0.256985,-0.351028,-0.349973,-0.349762,-0.349973,-0.349973
2,1.432326,-0.623893,-0.262057,-0.738147,-0.956917,0.010040,0.139541,-1.369334,-0.252658,-0.174193,...,-0.132935,-0.350395,-0.349128,-0.349551,-0.256985,-0.351028,-0.349973,-0.349762,-0.349973,2.857362
3,-0.698165,-0.623893,0.759769,1.354743,-1.281758,0.660050,0.020085,1.091569,-1.508289,-0.174193,...,-0.132935,-0.350395,-0.349128,-0.349551,-0.256985,-0.351028,-0.349973,-0.349762,-0.349973,-0.349973
4,-0.698165,-0.623893,1.151580,-0.738147,-1.281758,-1.399220,-1.293934,1.091569,-0.252658,-0.174193,...,-0.132935,2.853918,-0.349128,-0.349551,-0.256985,-0.351028,-0.349973,-0.349762,-0.349973,-0.349973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,-0.698165,-0.623893,-0.912107,-0.738147,-0.181193,1.180783,-1.293934,1.091569,-0.252658,-0.174193,...,-0.132935,2.853918,-0.349128,-0.349551,-0.256985,-0.351028,-0.349973,-0.349762,-0.349973,-0.349973
8519,-0.698165,-0.623893,0.261100,1.354743,-0.371154,-0.527301,0.497909,-0.138882,-0.252658,-0.174193,...,-0.132935,-0.350395,-0.349128,-0.349551,-0.256985,-0.351028,-0.349973,2.859087,-0.349973,-0.349973
8520,-0.698165,1.602840,1.285152,-0.738147,-0.599784,-0.897208,0.736822,-0.138882,-0.252658,-0.174193,...,-0.132935,-0.350395,-0.349128,-0.349551,-0.256985,-0.351028,2.857362,-0.349762,-0.349973,-0.349973
8521,1.432326,-0.623893,-0.219759,1.354743,1.532880,-0.607977,1.334103,1.091569,1.002972,-0.174193,...,-0.132935,-0.350395,-0.349128,2.860816,-0.256985,-0.351028,-0.349973,-0.349762,-0.349973,-0.349973


## Train test split

In [82]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size= 0.2, random_state=1)

## Hyperparameter Tunning

### Grid search cv

In [80]:
param_grid = {'n_neighbors': np.arange(2,20), 'p':[1,2]}
param_grid

{'n_neighbors': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19]),
 'p': [1, 2]}

In [81]:
knn_clf = KNeighborsRegressor()

In [83]:
gscv_model = GridSearchCV(knn_clf,param_grid, cv = 10)
gscv_model.fit(x_train,y_train)

GridSearchCV(cv=10, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19]),
                         'p': [1, 2]})

In [84]:
gscv_model.best_params_

{'n_neighbors': 19, 'p': 1}

### Randomized search cv

In [85]:
rscv_model = RandomizedSearchCV(knn_clf, param_grid, cv=10)
rscv_model.fit(x_train,y_train)

RandomizedSearchCV(cv=10, estimator=KNeighborsRegressor(),
                   param_distributions={'n_neighbors': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19]),
                                        'p': [1, 2]})

In [86]:
gscv_model.best_params_

{'n_neighbors': 19, 'p': 1}

## Model Evaluation

In [87]:
x_train,x_test,y_train,y_test = train_test_split(x_df,y, test_size= 0.2, random_state=1)

### As per MinMax scaller  and GSCV

In [103]:
knn_clf = KNeighborsRegressor(n_neighbors = 19, p = 1)
knn_clf.fit(x_train,y_train)

KNeighborsRegressor(n_neighbors=19, p=1)

In [104]:
y_pred = knn_clf.predict(x_test)
y_pred

array([1441.10657895, 1311.626     , 3275.03515789, ..., 2790.89343158,
       3210.97818947, 3091.72990526])

In [105]:
r2_score(y_test,y_pred)

0.4439556382455455

### As per standard scaller and GSCV

In [107]:
x_train,x_test,y_train,y_test = train_test_split(x_stand_df,y, test_size= 0.2, random_state=1)

In [108]:
knn_clf = KNeighborsRegressor(n_neighbors = 19, p = 1)
knn_clf.fit(x_train,y_train)

KNeighborsRegressor(n_neighbors=19, p=1)

In [109]:
y_pred = knn_clf.predict(x_test)
y_pred

array([1441.10657895, 1285.30937895, 3486.40913684, ..., 2876.67650526,
       2918.90224211, 2821.23989474])

In [110]:
r2_score(y_test,y_pred)

0.4520605783482309

## Trial and error on values of K and P

In [111]:
x_train,x_test,y_train,y_test = train_test_split(x_stand_df,y, test_size= 0.2, random_state=1)

In [112]:
knn_clf = KNeighborsRegressor(n_neighbors = 9, p = 2)
knn_clf.fit(x_train,y_train)

KNeighborsRegressor(n_neighbors=9)

In [113]:
y_pred = knn_clf.predict(x_test)
y_pred

array([ 992.41188889, 1128.531     , 3289.86575556, ..., 2874.40655556,
       2254.25084444, 2956.22597778])

In [114]:
r2_score(y_test,y_pred)

0.49054843348280297