In [34]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

## Import data set

In [35]:
# Read the comma-separated height/weight dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer='asset/weight-height.csv', sep=',')
df.head(n=5)

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


## Separate X (Gender, Height) and Y (y=Weight)

In [36]:
# Feature matrix: every column of df except the regression target "Weight".
x = df.drop(columns=["Weight"])
x.head(n=5)

Unnamed: 0,Gender,Height
0,Male,73.847017
1,Male,68.781904
2,Male,74.110105
3,Male,71.730978
4,Male,69.881796


## Encode
#### Gender-> Female=1, Male=0

In [37]:
# Encode Gender numerically (Male -> 0, Female -> 1).
# Only the Gender column is mapped, instead of running a frame-wide replace:
# replacing strings with ints across the whole frame relies on pandas'
# deprecated silent object -> int downcasting (FutureWarning in pandas >= 2.2)
# and would also rewrite matching strings in any other column.
# The codes are derived from the raw df["Gender"] (aligned on the index), so
# re-running this cell always yields the same result.
gender_codes = {'Male': 0, 'Female': 1}
x = x.assign(Gender=df['Gender'].map(gender_codes))
x.head()

Unnamed: 0,Gender,Height
0,0,73.847017
1,0,68.781904
2,0,74.110105
3,0,71.730978
4,0,69.881796


In [38]:
# Regression target: the Weight column of df.
y = df.loc[:, 'Weight']
y.head(n=5)

0    241.893563
1    162.310473
2    212.740856
3    220.042470
4    206.349801
Name: Weight, dtype: float64

## Train = 70%, Test = 30%

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

In [40]:
# Comparison table for the test rows. Take an explicit copy: pd.DataFrame(xtest)
# does not copy the underlying data by default, so prediction columns added to
# df_new later could alias xtest or trigger SettingWithCopyWarning.
df_new = xtest.copy()
df_new.head()

Unnamed: 0,Gender,Height
6006,1,64.846644
1197,0,68.886367
2862,0,70.963369
6497,1,62.737189
2860,0,69.57803


In [41]:
# Preview the first rows of the held-out feature set.
xtest.head(n=5)

Unnamed: 0,Gender,Height
6006,1,64.846644
1197,0,68.886367
2862,0,70.963369
6497,1,62.737189
2860,0,69.57803


## Apply Linear Regression

In [42]:
from sklearn.linear_model import LinearRegression

# Fit the linear regression model on the training split.
reg = LinearRegression()
reg.fit(xtrain, ytrain)

# Predict once and reuse the array for the comparison table instead of
# running the model a second time on the same test features.
y_predict = reg.predict(xtest)
df_new['y_predict'] = y_predict
y_predict

array([142.44868114, 186.0517281 , 198.42283706, ..., 102.94080643,
       190.83713617, 143.34786753])

In [43]:
# Preview the test rows together with the linear-regression predictions.
df_new.head(n=5)

Unnamed: 0,Gender,Height,y_predict
6006,1,64.846644,142.448681
1197,0,68.886367,186.051728
2862,0,70.963369,198.422837
6497,1,62.737189,129.884274
2860,0,69.57803,190.171436


## Evaluate the Model (Testing and training Accuracy, MSE for testing)

In [44]:
# Per-row residual: actual test weight minus the linear model's prediction.
residue = ytest.sub(y_predict)
residue

6006     7.219687
1197    11.590516
2862   -19.871646
6497    11.458821
2860     5.151239
          ...    
184     -2.168954
2988   -12.531456
6048    -4.071553
420      0.096264
5669   -19.479438
Name: Weight, Length: 2567, dtype: float64

#### Training accuracy (R² score)

In [45]:
# Score of the linear model on the data it was trained on.
reg.score(X=xtrain, y=ytrain)

0.8973793060969246

#### Testing accuracy (R² score)

In [46]:
# Score of the linear model on the held-out test split.
reg.score(X=xtest, y=ytest)

0.9059112424422658

In [47]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the linear model's predictions on the test split.
mse = mean_squared_error(y_true=ytest, y_pred=y_predict)
mse

96.83734437830613

## Applying KNN Regressor

In [48]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a 3-nearest-neighbours regressor on the training split.
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(xtrain, ytrain)

# Predict once and reuse the array for the comparison table instead of
# querying the neighbours a second time for the same test features.
predict_by_KNN = knn.predict(xtest)
df_new['predict_by_KNN'] = predict_by_KNN
predict_by_KNN

array([142.5130416 , 173.90630823, 184.6686197 , ...,  95.44597987,
       182.99507227, 147.37662317])

In [49]:
# Display the test-set comparison table (features plus both models' predictions).
df_new

Unnamed: 0,Gender,Height,y_predict,predict_by_KNN
6006,1,64.846644,142.448681,142.513042
1197,0,68.886367,186.051728,173.906308
2862,0,70.963369,198.422837,184.668620
6497,1,62.737189,129.884274,115.376399
2860,0,69.578030,190.171436,185.437171
...,...,...,...,...
184,0,71.935887,204.215380,210.588864
2988,0,67.616383,178.487408,182.694234
6048,1,58.213614,102.940806,95.445980
420,0,69.689795,190.837136,182.995072


## Testing and training Accuracy, MSE for testing
#### Training accuracy (R² score)

In [50]:
# Score of the KNN regressor on the data it was trained on.
knn.score(X=xtrain, y=ytrain)

0.9304521916751347

#### Testing accuracy (R² score)

In [51]:
# Score of the KNN regressor on the held-out test split.
knn.score(X=xtest, y=ytest)

0.8679879688589832

In [52]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the KNN regressor's predictions on the test split.
# Note: this rebinds `mse`, replacing the value computed for the linear model.
mse = mean_squared_error(y_true=ytest, y_pred=predict_by_KNN)
mse

135.8684592453893

## Compare the predictions of the KNN model and the Linear Regression model


In [66]:
# Side-by-side preview of both models' predictions for the first test rows.
df_new.head(n=5)

Unnamed: 0,Gender,Height,y_predict,predict_by_KNN
6006,1,64.846644,142.448681,142.513042
1197,0,68.886367,186.051728,173.906308
2862,0,70.963369,198.422837,184.66862
6497,1,62.737189,129.884274,115.376399
2860,0,69.57803,190.171436,185.437171


## Assignment 2
####  Import data set
#### Separate X and Y. (y=Gender)

In [55]:
# Classification features: every column of df except the label "Gender".
X = df.drop(columns=["Gender"])
X.head(n=5)

Unnamed: 0,Height,Weight
0,73.847017,241.893563
1,68.781904,162.310473
2,74.110105,212.740856
3,71.730978,220.04247
4,69.881796,206.349801


In [56]:
# Classification label: the Gender column of df.
Y = df.loc[:, 'Gender']
Y.head(n=5)

0    Male
1    Male
2    Male
3    Male
4    Male
Name: Gender, dtype: object

#### Split the data into training and test sets

In [58]:

# Hold out 30% of the rows for testing. The seed is fixed (same value as the
# regression split) so the split, and every score computed from it, is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

#### Scale the features using StandardScaler

In [59]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Fitting and Evaluating the Model

In [61]:
from sklearn.neighbors import KNeighborsClassifier

# Train a 3-nearest-neighbours classifier on the scaled training features.
knC = KNeighborsClassifier(n_neighbors=3)
knC.fit(X=X_train, y=y_train)


In [63]:
# Predicted gender labels for the scaled test features.
y_pred = knC.predict(X=X_test)
y_pred

array(['Female', 'Male', 'Female', ..., 'Male', 'Female', 'Female'],
      dtype=object)

## Testing and training Accuracy, MSE for testing
#### training Accuracy

In [65]:
# Classifier score on the data it was trained on.
knC.score(X=X_train, y=y_train)

0.9395457581830328

#### Testing accuracy

In [67]:
# Classifier score on the held-out test split.
knC.score(X=X_test, y=y_test)

0.901441371250487