In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from seaborn import regplot
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
df = pd.read_csv("./Weight-S2017-18.csv")
print(df.shape)
df

(100, 3)


Unnamed: 0,height,sex,weight
0,1.69,M,67.89
1,1.32,F,32.21
2,1.77,M,82.71
3,1.53,F,51.50
4,1.80,M,85.40
5,1.63,M,71.18
6,1.47,F,40.88
7,1.59,F,48.45
8,1.77,M,78.99
9,1.54,F,45.71


In [3]:
features = df.drop('weight',axis=1)
features = pd.get_dummies(features)
features.head(3)

Unnamed: 0,height,sex_F,sex_M
0,1.69,0,1
1,1.32,1,0
2,1.77,0,1


In [4]:
target = df.weight
target.head(3)

0    67.89
1    32.21
2    82.71
Name: weight, dtype: float64

In [5]:
features = (features - features.mean())/features.std()
lr = LinearRegression()
lr.fit(features,target)
R2 = lr.score(features,target)
print('R-squared =',round(R2,3))

coef = pd.DataFrame()
coef['feature'] = features.columns
coef['coef'] = lr.coef_.round(2)
coef.sort_values('coef',ascending=False)

R-squared = 0.955


Unnamed: 0,feature,coef
0,height,11.82
2,sex_M,2.6
1,sex_F,-2.6


## (i) *Justifications provided for method used*

#### I knew right off the bat that I wanted to use a Regression Model, since the output of my prediction is a number, not a model (model = use a classifier). From the plot we are given, there appears to be a rough, linear relationship between height and weight. From this, I decided a Linear Regression algorithm would be best. This would keep things simple, yet accurate.

#### I one-hot encode the qualitative feature in the data set: 'sex'. This is I can fit the quantitative feature 'height' with the qualitative feature of 'sex'.

#### There is a need to standardize the variables in this case. There is collinearity between the independent variables. The independent variables are 'height' and 'sex'. One could use sex to predict height; changing sex will have an effect on height, as women are generally shorter than men.

## (ii) *Simplicity and interpretability of method used and interpretations provided for results*

#### As one can see, a person's height has a positive coefficient, meaning height has a positive correlation with weight. Same with being a male, but to a lesser degree. We can see that women have negative correlation with weight, meaning, keeping height constant, a woman is more likely to weigh less than a man.

#### Let's get specific. See that height's coefficient is 11.82. This means, give a one unit change in height increases weight by 11.82 units; a one meter increase in height results in a 11.82 kg weight gain, on average. If a person is male, their weight usually increases by 2.6 kg, and if they're a women, their weight usually decreases by 2.6 kg

## (iii) *Accuracy of predictions for weight*

In [6]:
df = pd.read_csv("./Weight_predict-S2017-18.csv")
df.head(3)

Unnamed: 0,height,sex,weight
0,1.48,F,0
1,1.8,M,0
2,1.75,M,0


In [7]:
features = df.drop('weight',axis=1)
features = pd.get_dummies(features)
features.head(3)
features = (features - features.mean())/features.std()

In [8]:
predictions = lr.predict(features)

In [9]:
df['weight'] = predictions
df

Unnamed: 0,height,sex,weight
0,1.48,F,46.907505
1,1.80,M,81.726261
2,1.75,M,77.906290
3,1.70,M,74.086319
4,1.64,M,69.502354
5,1.54,F,51.491471
6,1.72,M,75.614308
7,1.48,F,46.907505
8,1.81,M,82.490255
9,1.45,F,44.615523


In [10]:
df.to_csv("Weight_rileyma.csv",index=False)
df.head(3)

Unnamed: 0,height,sex,weight
0,1.48,F,46.907505
1,1.8,M,81.726261
2,1.75,M,77.90629
