### 1. Importing Libraries and Dataset

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('weight-height.csv')
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [4]:
df = pd.get_dummies(df, columns=['Gender'], drop_first=True)
df.head()

Unnamed: 0,Height,Weight,Gender_Male
0,73.847017,241.893563,True
1,68.781904,162.310473,True
2,74.110105,212.740856,True
3,71.730978,220.04247,True
4,69.881796,206.349801,True


### 2. Separating X (Gender, Height) and Y (Weight)

In [5]:
x = df.drop('Weight', axis = 1)
x.head()

Unnamed: 0,Height,Gender_Male
0,73.847017,True
1,68.781904,True
2,74.110105,True
3,71.730978,True
4,69.881796,True


In [6]:
y = df[['Weight']]
y.head()

Unnamed: 0,Weight
0,241.893563
1,162.310473
2,212.740856
3,220.04247
4,206.349801


### 3. Splitting the dataset into the Training set and Test set
Train = 70%, Test = 30%

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.30)

In [9]:
df.shape

(8555, 3)

In [10]:
xtrain.shape #70% data

(5988, 2)

In [11]:
ytest

Unnamed: 0,Weight
1591,189.392841
2430,173.208632
4757,190.047880
2048,178.929043
2814,201.795504
...,...
6436,143.363631
8187,156.232125
3055,166.669570
1995,182.746514


### 4. Applying Linear Regression Model

In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
reg = LinearRegression()

In [14]:
reg.fit(xtrain, ytrain) # training ml model with train data

In [15]:
reg.predict(xtest)

array([[202.91228057],
       [173.54569363],
       [194.77860654],
       ...,
       [159.28683805],
       [172.70452064],
       [184.05625015]])

In [16]:
ytest['Predicted_Weight'] = reg.predict(xtest)
ytest

Unnamed: 0,Weight,Predicted_Weight
1591,189.392841,202.912281
2430,173.208632,173.545694
4757,190.047880,194.778607
2048,178.929043,184.352833
2814,201.795504,185.994289
...,...,...
6436,143.363631,135.017469
8187,156.232125,151.913757
3055,166.669570,159.286838
1995,182.746514,172.704521


In [17]:
ytest

Unnamed: 0,Weight,Predicted_Weight
1591,189.392841,202.912281
2430,173.208632,173.545694
4757,190.047880,194.778607
2048,178.929043,184.352833
2814,201.795504,185.994289
...,...,...
6436,143.363631,135.017469
8187,156.232125,151.913757
3055,166.669570,159.286838
1995,182.746514,172.704521


In [18]:
ytest.drop('Predicted_Weight', axis=1, inplace=True)

### 5. Evaluating the Model (Testing and training Accuracy, MSE for testing)

In [19]:
#training accuracy
reg.score(xtrain, ytrain)

0.8982995394241642

In [20]:
#testing accuracy
reg.score(xtest, ytest)

0.9034163644418937

In [21]:
#MSE
from sklearn.metrics import mean_squared_error
mse_lr = mean_squared_error(ytest, reg.predict(xtest))
mse_lr

102.75904346160375

### 6. Applying KNN Regressor

In [22]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=3)

knn.fit(xtrain, ytrain)

In [23]:
ytest['Predicted_Weight'] = knn.predict(xtest)

In [24]:
ytest

Unnamed: 0,Weight,Predicted_Weight
1591,189.392841,203.145921
2430,173.208632,173.134786
4757,190.047880,186.783032
2048,178.929043,185.036105
2814,201.795504,183.989905
...,...,...
6436,143.363631,130.776609
8187,156.232125,160.162256
3055,166.669570,161.649891
1995,182.746514,170.444331


In [25]:
ytest.drop('Predicted_Weight', axis=1, inplace=True)

### 7. Evaluating the Model (Testing and training Accuracy, MSE for testing)

In [26]:
#training accuracy
knn.score(xtrain, ytrain)

0.9317464917554705

In [27]:
#testing accuracy
knn.score(xtest, ytest)

0.8738588657623685

In [28]:
#mse
from sklearn.metrics import mean_squared_error
mse_knn = mean_squared_error(ytest, knn.predict(xtest))
mse_knn

134.20640277744076

### 8. Comparing KNN & Linear Regression Model

In [29]:
compare_df = ytest.copy()

In [30]:
compare_df['knn_weight'] = knn.predict(xtest)
compare_df['lr_weight'] = reg.predict(xtest)


In [31]:
compare_df.head()

Unnamed: 0,Weight,knn_weight,lr_weight
1591,189.392841,203.145921,202.912281
2430,173.208632,173.134786,173.545694
4757,190.04788,186.783032,194.778607
2048,178.929043,185.036105,184.352833
2814,201.795504,183.989905,185.994289


In [32]:
compare_df['Difference'] = compare_df['lr_weight'] - compare_df['knn_weight']

compare_df.head()

Unnamed: 0,Weight,knn_weight,lr_weight,Difference
1591,189.392841,203.145921,202.912281,-0.23364
2430,173.208632,173.134786,173.545694,0.410907
4757,190.04788,186.783032,194.778607,7.995574
2048,178.929043,185.036105,184.352833,-0.683271
2814,201.795504,183.989905,185.994289,2.004384


In [33]:
print('The Difference between the MSE of KNN and Linear Regression is: ', mse_lr - mse_knn)

The Difference between the MSE of KNN and Linear Regression is:  -31.447359315837005
