Read Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('weight-height.csv')

In [3]:
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


Info of data

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8555 entries, 0 to 8554
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Gender  8555 non-null   object 
 1   Height  8555 non-null   float64
 2   Weight  8555 non-null   float64
dtypes: float64(2), object(1)
memory usage: 200.6+ KB


Statistical Info of data

In [5]:
df.describe()

Unnamed: 0,Height,Weight
count,8555.0,8555.0
mean,66.809925,165.632735
std,3.851454,32.043922
min,54.616858,65.78
25%,63.957684,139.876803
50%,66.985923,168.521567
75%,69.604427,190.666305
max,80.45,269.989698


Converting Numerical data to categorical data

In [6]:
from pandas.core.dtypes.common import is_numeric_dtype

for col in df.columns:
    if is_numeric_dtype(df[col]):
        continue
    else:
        one = pd.get_dummies(df[col],prefix=col, drop_first=True)
        df = pd.concat([df,one],axis=1).drop(col,axis=1)
        

In [7]:
df.head()

Unnamed: 0,Height,Weight,Gender_Male
0,73.847017,241.893563,1
1,68.781904,162.310473,1
2,74.110105,212.740856,1
3,71.730978,220.04247,1
4,69.881796,206.349801,1


Separating Features and Target

In [8]:
x = df.drop('Weight', axis=1)
y = df['Weight']

In [9]:
x.head()

Unnamed: 0,Height,Gender_Male
0,73.847017,1
1,68.781904,1
2,74.110105,1
3,71.730978,1
4,69.881796,1


Splitting the data as 70:30 ratio

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [12]:
x_train.head()

Unnamed: 0,Height,Gender_Male
553,67.594031,1
1397,71.601697,1
7934,62.625985,0
8367,66.136131,0
3320,71.843308,1


In [13]:
y_train.head()

553     186.751417
1397    211.031652
7934    143.768451
8367    151.814648
3320    196.505814
Name: Weight, dtype: float64

# LinearRegression

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
lr = LinearRegression()

In [16]:
lr.fit(x_train, y_train) # fiiting the model

LinearRegression()

# Prediction

In [17]:
pred_lr = lr.predict(x_test)
pred_lr

array([142.44868114, 186.0517281 , 198.42283706, ..., 102.94080643,
       190.83713617, 143.34786753])

# Predicted weight using LinearRegression

In [18]:
df2 = pd.DataFrame(pred_lr, columns=['new_weight_lr'])
df2.head()

Unnamed: 0,new_weight_lr
0,142.448681
1,186.051728
2,198.422837
3,129.884274
4,190.171436


# Accuracy

In [19]:
lr_score = lr.score(x_test, y_test)
lr_score

0.905911242442266

# Mean Squared Error

In [20]:
from sklearn.metrics import mean_squared_error
lr_mse = mean_squared_error(y_test,pred_lr) #testing error in terms of mse
lr_mse

96.83734437830606

# Mean Absolute Error

In [21]:
from sklearn.metrics import mean_absolute_error
lr_mae = mean_absolute_error(y_test,pred_lr) #testing error in terms of mae
lr_mae

7.870097130738319

# KNN Regressor

In [22]:
from sklearn.neighbors import KNeighborsRegressor

In [23]:
knn_reg = KNeighborsRegressor()

In [24]:
# training the model
knn_reg.fit(x_train, y_train)

KNeighborsRegressor()

In [25]:
pred_knn_reg = knn_reg.predict(x_test)
pred_knn_reg

array([142.1445657 , 181.38276924, 187.07075266, ..., 101.79654426,
       192.43702356, 145.3632227 ])

# Predicted weight using KNNRegressor

In [26]:
df2['new_weight_knn'] = pd.DataFrame(pred_knn_reg)
df2.head()

Unnamed: 0,new_weight_lr,new_weight_knn
0,142.448681,142.144566
1,186.051728,181.382769
2,198.422837,187.070753
3,129.884274,113.005495
4,190.171436,186.658958


# Accuracy

In [27]:
knn_reg_score = knn_reg.score(x_test, y_test)
knn_reg_score 

0.8821086423439015

# Mean Squared Error¶

In [28]:
from sklearn.metrics import mean_squared_error
knn_reg_mse = mean_squared_error(y_test,pred_knn_reg) #testing error in terms of mse
knn_reg_mse

121.33528273624482

# Tuned Using Randomized SearchCV

In [29]:
no_neighbors = np.sqrt(x_train.shape[0])
no_neighbors

77.38216848861241

In [30]:
x_train.shape[0]

5988

In [31]:
neighbors = list(range(1, 200))

In [32]:
neighbors

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [33]:
ids = {
   'n_neighbors': neighbors 
}

In [34]:
from sklearn.model_selection import RandomizedSearchCV

In [35]:
rcv = RandomizedSearchCV(knn_reg, ids, n_iter=200, cv=3, random_state=42)

In [36]:
rcv.fit(x_train, y_train)



RandomizedSearchCV(cv=3, estimator=KNeighborsRegressor(), n_iter=200,
                   param_distributions={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8,
                                                        9, 10, 11, 12, 13, 14,
                                                        15, 16, 17, 18, 19, 20,
                                                        21, 22, 23, 24, 25, 26,
                                                        27, 28, 29, 30, ...]},
                   random_state=42)

In [37]:
rcv.best_score_

0.893638314865469

# Accuracy

In [38]:
score_knn_reg_tuned = rcv.score(x_test, y_test)
score_knn_reg_tuned 

0.9033586009403429

In [39]:
cv_results_knn_reg_tuned = pd.DataFrame(rcv.cv_results_)

In [40]:
cv_results_knn_reg_tuned

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009659,5.897643e-03,0.006996,2.155492e-03,1,{'n_neighbors': 1},0.786849,0.801918,0.791626,0.793465,0.006288,199
1,0.005000,2.402658e-06,0.004999,2.368238e-06,2,{'n_neighbors': 2},0.834092,0.847906,0.850782,0.844260,0.007285,198
2,0.004334,4.732811e-04,0.005656,4.649685e-04,3,{'n_neighbors': 3},0.847549,0.867771,0.870413,0.861911,0.010212,197
3,0.004331,4.720449e-04,0.006001,8.485379e-07,4,{'n_neighbors': 4},0.857413,0.874657,0.879118,0.870396,0.009359,196
4,0.003998,2.247832e-07,0.006334,4.724943e-04,5,{'n_neighbors': 5},0.863524,0.880387,0.883870,0.875927,0.008885,195
...,...,...,...,...,...,...,...,...,...,...,...,...
194,0.005211,7.369405e-03,0.053274,9.066518e-03,195,{'n_neighbors': 195},0.876589,0.886902,0.891152,0.884881,0.006115,187
195,0.000000,0.000000e+00,0.062517,1.243490e-05,196,{'n_neighbors': 196},0.876518,0.886760,0.891069,0.884783,0.006103,188
196,0.002322,3.284195e-03,0.061156,6.315755e-03,197,{'n_neighbors': 197},0.876406,0.886688,0.890947,0.884680,0.006104,189
197,0.000000,0.000000e+00,0.062505,2.794079e-05,198,{'n_neighbors': 198},0.876316,0.886582,0.890897,0.884598,0.006116,190


# Mean Squared Error

In [41]:
pred_knn_reg_tuned = rcv.predict(x_test)
mse_knn_reg_tuned = mean_squared_error(y_test,pred_knn_reg_tuned) #testing error in terms of mse
mse_knn_reg_tuned

99.46455543531674

In [42]:
df2['new_weight_knn_tuned'] = pd.DataFrame(pred_knn_reg_tuned)
df2.head()

Unnamed: 0,new_weight_lr,new_weight_knn,new_weight_knn_tuned
0,142.448681,142.144566,144.296043
1,186.051728,181.382769,186.51196
2,198.422837,187.070753,196.180703
3,129.884274,113.005495,125.585819
4,190.171436,186.658958,189.271876


In [43]:
compar_mse ={
    'lr':lr_mse,
    'knn_reg':knn_reg_mse,
    'knn_reg_tuned': mse_knn_reg_tuned
}

In [44]:
compar_mse

{'lr': 96.83734437830606,
 'knn_reg': 121.33528273624482,
 'knn_reg_tuned': 99.46455543531674}

In [45]:
compar_score ={
    'lr':lr_score,
    'knn_reg':knn_reg_score,
    'knn_reg_tuned': score_knn_reg_tuned
}

In [46]:
compar_score

{'lr': 0.905911242442266,
 'knn_reg': 0.8821086423439015,
 'knn_reg_tuned': 0.9033586009403429}