# Weight-Height Prediction using Linear Regression

A simple linear regression model that predicts a person's height from their weight.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import r2_score,mean_squared_error

In [None]:
# Read the weight/height CSV into a DataFrame and preview its first rows.
weight_height_dataset = pd.read_csv(
    filepath_or_buffer='../input/weight-height/weight-height.csv',
)
weight_height_dataset.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
weight_height_dataset.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns.
weight_height_dataset.describe()

In [None]:
# Flag rows that repeat an earlier row, then count how many were flagged.
duplicate_flags = weight_height_dataset.duplicated()
duplicate_flags.sum()

In [None]:
# Number of missing values per column (isna is the same check as isnull).
weight_height_dataset.isna().sum()

# Univariate analysis

In [None]:
# Box plot of Weight to inspect its spread and spot outliers.
# The column is passed by keyword: depending on the seaborn release, a lone
# positional argument is interpreted either as `x` or as `data`, so `x=` keeps
# the call unambiguous and the plot orientation stable across versions.
sns.boxplot(x=weight_height_dataset.Weight)
plt.show()

In [None]:
# Box plot of Height to inspect its spread and spot outliers.
# The column is passed by keyword: depending on the seaborn release, a lone
# positional argument is interpreted either as `x` or as `data`, so `x=` keeps
# the call unambiguous and the plot orientation stable across versions.
sns.boxplot(x=weight_height_dataset.Height)
plt.show()

## IQR method to remove outliers

Outliers can distort the fitted regression line, so they are removed before modelling.

In [None]:
# Keep only the rows whose Weight lies inside the 1.5 * IQR fences
# (both fences inclusive).
weight_col = weight_height_dataset['Weight']
q1, q3 = weight_col.quantile(0.25), weight_col.quantile(0.75)
iqr = q3 - q1
ll, ul = q1 - 1.5*iqr, q3 + 1.5*iqr  # lower / upper fence
weight_height_dataset = weight_height_dataset[weight_col.between(ll, ul)]

In [None]:
# Keep only the rows whose Height lies inside the 1.5 * IQR fences
# (both fences inclusive).
q1 = weight_height_dataset['Height'].quantile(0.25)
q3 = weight_height_dataset['Height'].quantile(0.75)
iqr = q3 - q1
ul = q3 + 1.5*iqr  # upper fence
ll = q1 - 1.5*iqr  # lower fence
# .copy() makes the filtered result an independent DataFrame. This frame is
# modified in place further down (new columns are assigned and a column is
# dropped); without the copy pandas may emit SettingWithCopyWarning there.
weight_height_dataset = weight_height_dataset[
    (weight_height_dataset.Height >= ll) & (weight_height_dataset.Height <= ul)
].copy()

# Bivariate analysis

In [None]:
# Scatter plot of Height against Weight to eyeball the relationship.
# x and y are given by keyword: recent seaborn releases accept them only as
# keyword arguments (the first positional slot is `data`), so the positional
# form fails there.
sns.scatterplot(data=weight_height_dataset, x='Weight', y='Height', color='g')
plt.show()

### Split the dataset into train and test
70:30 ratio

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Single-column frames: Weight is the feature (x), Height is the target (y).
x = weight_height_dataset['Weight'].to_frame()
y = weight_height_dataset['Height'].to_frame()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.30, random_state=123)
xtrain, xtest, ytrain, ytest = split_parts
# Shapes are printed in the order: xtrain, ytrain, xtest, ytest.
print(*(part.shape for part in (xtrain, ytrain, xtest, ytest)))

# Apply Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit ordinary least squares on the training split (fit() returns the fitted
# estimator itself), then predict the target for the held-out rows.
lr = LinearRegression().fit(xtrain, ytrain)
yPredict = lr.predict(xtest)

In [None]:
# Show the fitted slope(s) first, then the intercept.
for fitted_param in (lr.coef_, lr.intercept_):
    print(fitted_param)

### Equation of line : y = 0.11x + 48.5

## Check R-squared and RMSE to evaluate the fit

In [None]:
# Coefficient of determination on the held-out split.
r2_score(y_true=ytest, y_pred=yPredict)

In [None]:
# Root-mean-squared error on the held-out split (same units as the target).
test_mse = mean_squared_error(ytest, yPredict)
np.sqrt(test_mse)

# Plotting the Regression Line

In [None]:
# Training points with the fitted regression line drawn over them.
# x and y are given by keyword: recent seaborn releases accept them only as
# keyword arguments, so the positional form fails there.
sns.scatterplot(x=xtrain.Weight, y=ytrain.Height)
plt.plot(xtrain.Weight, lr.predict(xtrain), c='r')
plt.show()

In [None]:
# Held-out points with the model's predictions for them drawn as a line.
# x and y are given by keyword: recent seaborn releases accept them only as
# keyword arguments, so the positional form fails there.
sns.scatterplot(x=xtest.Weight, y=ytest.Height, color='r')
plt.plot(xtest.Weight, yPredict, c='b')
plt.show()

# Linear Regression Assumptions

In [None]:
# Test-set residuals: observed target minus the model's prediction.
# ytest is a single-column DataFrame (column 'Height'), so the result keeps
# that column name and the test rows' index.
residual = ytest - yPredict

### 1. No pattern in residual

In [None]:
# Residuals against predicted values; a visible trend or funnel shape would
# indicate a violated assumption.
# Both inputs are flattened to 1-D arrays and passed by keyword: recent seaborn
# releases accept x/y only as keyword arguments, and the plot expects vectors
# rather than a 2-D prediction array and a DataFrame.
predicted_values = np.asarray(yPredict).ravel()
residual_values = residual['Height'].to_numpy()
sns.residplot(x=predicted_values, y=residual_values)
# Plain arrays carry no names, so label the axes explicitly.
plt.xlabel('Predicted Height')
plt.ylabel('Residual')
plt.show()

### 2. Normal Distribution

In [None]:
import pylab
import scipy.stats as stats

In [None]:
# Normal Q-Q plot of the test residuals; points close to the reference line
# suggest approximately normal residuals.
stats.probplot(residual['Height'], dist='norm', plot=pylab)
plt.show()

In [None]:
# Shapiro-Wilk normality test on the residuals; only the p-value is reported.
shapiro_result = stats.shapiro(residual)
test, pvalue = shapiro_result
print(pvalue)

### 3. Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor for each numeric column (everything except Gender).
numeric_values = weight_height_dataset.drop('Gender', axis=1).values
# variance_inflation_factor regresses one column on the remaining ones and does
# not add an intercept itself, so a column of ones is prepended; without it the
# auxiliary regressions are forced through the origin and the VIFs are inflated.
vif_design = np.column_stack([np.ones(len(numeric_values)), numeric_values])
# Column 0 is the constant; it is skipped so `vif` still holds exactly one value
# per numeric column, in the original column order.
# NOTE(review): Height is the regression target here; with a single predictor
# (Weight) multicollinearity cannot really occur - confirm this check is wanted.
vif = [variance_inflation_factor(vif_design, i) for i in range(1, vif_design.shape[1])]

In [None]:
# One-row table: row label 'vif', one column per variable.
pd.DataFrame([vif], index=['vif'], columns=['Weight', 'Height'])

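### 4. Heteroscedasticity

If the residuals are heteroscedastic (their variance is not constant), the standard errors and significance tests of the linear regression are unreliable.

h0: residuals are homoscedastic (not heteroscedastic)

h1: residuals are heteroscedastic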

In [None]:
from statsmodels.stats.api import het_goldfeldquandt

In [None]:
# Single-column frame holding the Height values of the current dataset.
df = weight_height_dataset['Height'].to_frame()

In [None]:
# Residuals over the whole (outlier-filtered) dataset: observed Height (`df`)
# minus the Height predicted from Weight.
# The model was fitted with Weight as its only feature, so Weight - not the
# Height frame itself - has to be passed to predict().
residual2 = df - lr.predict(weight_height_dataset[['Weight']])

In [None]:
# Goldfeld-Quandt test for heteroscedasticity.
# The test fits its own regressions on two sub-samples, so it takes the response
# (Height) and the regressor matrix (constant + Weight) - not pre-computed
# residuals, and not a regressor matrix that contains the response itself.
gq_endog = weight_height_dataset['Height']
gq_exog = np.column_stack(
    [np.ones(len(weight_height_dataset)), weight_height_dataset['Weight']]
)
# NOTE(review): rows are used in their current order; consider passing idx=1 to
# order the split by Weight - confirm against the statsmodels documentation.
ftest, pvalue, result = het_goldfeldquandt(gq_endog, gq_exog)
print(pvalue)

### 5. Auto-correlation

The errors should not be autocorrelated, because autocorrelated errors violate an assumption of the linear regression model.

- Durbin Watson Test

The statistic ranges from 0 to 4:

[0-2) - positive autocorrelation

=2 - no autocorrelation

(2-4] - negative autocorrelation

In [None]:
from statsmodels.stats.stattools import durbin_watson

In [None]:
# Durbin-Watson statistic of the test residuals.
dw_statistic = durbin_watson(residual)
print(dw_statistic)

### 6. Linearity

- Rainbow Test

h0: linear in nature

h1: not linear in nature

In [None]:
import statsmodels.api as sms

In [None]:
# Fit Height (y) on Weight (x) with statsmodels OLS so the rainbow linearity
# test can be run on the result.
# OLS does not add an intercept on its own; without the constant column the line
# is forced through the origin, which does not match the sklearn model above and
# distorts R-squared and any diagnostic computed from this fit.
model = sms.OLS(y, sms.add_constant(x)).fit()
model.summary()

In [None]:
# Rainbow test for linearity on the fitted OLS model; display its p-value.
rainbow_result = sms.stats.diagnostic.linear_rainbow(model)
test, pvalue = rainbow_result
pvalue

# Using One hot Encoding & Scaling to improve accuracy

In [None]:
# One-hot encode Gender and store the indicator columns as 'Female' and 'Male'.
# NOTE(review): assumes Gender has exactly two categories and that get_dummies
# returns them in the order Female, Male - confirm against the data.
gender_dummies = pd.get_dummies(weight_height_dataset['Gender'])
weight_height_dataset[['Female','Male']] = gender_dummies
weight_height_dataset.head()

In [None]:
# Remove the Gender column in place.
weight_height_dataset.drop(columns='Gender', inplace=True)

In [None]:
# Preview the first rows of the frame in its current state.
weight_height_dataset.head()

In [None]:
# Standardise every column and rebuild a DataFrame with the original column
# names (the result gets a fresh default index).
# NOTE(review): the scaler is fitted on all rows before the train/test split, and
# the target (Height) is scaled too, so errors computed from `temp` are in
# standardised units rather than the original ones - confirm this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(weight_height_dataset)
temp = pd.DataFrame(scaled_values, columns=weight_height_dataset.columns)
temp.head()

In [None]:
# Target is the scaled Height column; features are all remaining columns.
y = temp['Height']
x = temp.drop(columns='Height')

In [None]:
# Same 70:30 split and seed as before, applied to the scaled data.
split_parts = train_test_split(x, y, test_size=0.30, random_state=123)
xtrain, xtest, ytrain, ytest = split_parts
# Shapes are printed in the order: xtrain, ytrain, xtest, ytest.
print(*(part.shape for part in (xtrain, ytrain, xtest, ytest)))

In [None]:
# Refit the linear model on the current training split (fit() returns the
# fitted estimator) and predict the held-out rows.
lr = LinearRegression().fit(xtrain, ytrain)
yPredict = lr.predict(xtest)

In [None]:
# Coefficient of determination of the refitted model on the held-out split.
r2_score(y_true=ytest, y_pred=yPredict)

In [None]:
# Root-mean-squared error of the refitted model on the held-out split.
test_mse = mean_squared_error(ytest, yPredict)
np.sqrt(test_mse)