In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor

from sklearn import metrics

In [2]:
df = pd.read_csv('../data/survey.csv')
print(df.shape)
df.head(15)

(2111, 17)


Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II
5,Male,29.0,1.62,53.0,no,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Automobile,Normal_Weight
6,Female,23.0,1.5,55.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,0.0,Sometimes,Motorbike,Normal_Weight
7,Male,22.0,1.64,53.0,no,no,2.0,3.0,Sometimes,no,2.0,no,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
8,Male,24.0,1.78,64.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Frequently,Public_Transportation,Normal_Weight
9,Male,22.0,1.72,68.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight


In [3]:
df['family_history_with_overweight'] = df['family_history_with_overweight'].map({'yes':1, 'no':0})
df['FAVC'] = df['FAVC'].map({'yes':1, 'no':0})
df['SMOKE'] = df['SMOKE'].map({'yes':1, 'no':0})
df['SCC'] = df['SCC'].map({'yes':1, 'no':0})

In [4]:
df = pd.get_dummies(data = df, columns = ['Gender', 'CAEC', 'CALC', 'MTRANS'], drop_first = True)
df.shape

(2111, 24)

In [5]:
# Train test split
X = df.drop(columns = ['NObeyesdad', 'Weight'])
y = df['Weight']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [6]:
# Pipelines
def pipe_model(model):
    pipe = Pipeline([
        ('ss', StandardScaler()),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)
    print(f'{model}')
    print(f'Training r2 score: {pipe.score(X_train, y_train)}')
    print(f'Testing r2 score: {pipe.score(X_test, y_test)}')
    print(f'Testing MSE: {metrics.mean_squared_error(y_test, pipe.predict(X_test))}')
    return print('='*40)

In [7]:
pipe_model(LinearRegression())
pipe_model(KNeighborsRegressor())
pipe_model(DecisionTreeRegressor())
pipe_model(BaggingRegressor())
pipe_model(RandomForestRegressor())
pipe_model(AdaBoostRegressor())

LinearRegression()
Training r2 score: 0.583058887606605
Testing r2 score: 0.5776341443290094
Testing MSE: 298.905998367195
KNeighborsRegressor()
Training r2 score: 0.9112589623119534
Testing r2 score: 0.8092018277048589
Testing MSE: 135.02681954703345
DecisionTreeRegressor()
Training r2 score: 1.0
Testing r2 score: 0.7534289595412301
Testing MSE: 174.49697229829536
BaggingRegressor()
Training r2 score: 0.9806996030920353
Testing r2 score: 0.852494707999796
Testing MSE: 104.38868572773663
RandomForestRegressor()
Training r2 score: 0.9867834733670161
Testing r2 score: 0.8589041528442395
Testing MSE: 99.8527567825233
AdaBoostRegressor()
Training r2 score: 0.7294128514634586
Testing r2 score: 0.6814217289210884
Testing MSE: 225.4560942755612


In [8]:
# Read Data
df = pd.read_csv('../data/survey.csv')

# Binarize columns
df['family_history_with_overweight'] = df['family_history_with_overweight'].map({'yes':1, 'no':0})
df['FAVC'] = df['FAVC'].map({'yes':1, 'no':0})
df['SMOKE'] = df['SMOKE'].map({'yes':1, 'no':0})
df['SCC'] = df['SCC'].map({'yes':1, 'no':0})

# Get dummies
df = pd.get_dummies(data = df, columns = ['Gender', 'CAEC', 'CALC', 'MTRANS'], drop_first = True)


In [9]:
# Create polynomial features
poly = PolynomialFeatures(2, interaction_only = True, include_bias = False)
array_poly = poly.fit_transform(df[['Height', 'FAVC', 'FCVC', 'NCP', 'CH2O']])

# Polynomial Feature Dataframe
df_poly = pd.DataFrame(array_poly, columns = poly.get_feature_names_out(['Height', 'FAVC', 'FCVC', 'NCP', 'CH2O']))

# Merge Polynomial Features df
df = df.merge(df_poly.iloc[:,4:])

# Create height^2 column
df['h2'] = df['Height'] ** 2

In [10]:
X = df.drop(columns = ['NObeyesdad', 'Weight'])
y = df['Weight']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [11]:
pipe = Pipeline([
        ('ss', StandardScaler()),
        ('rf', RandomForestRegressor())
    ])
pipe.fit(X_train, y_train)
print(f'Training r2 score: {pipe.score(X_train, y_train)}')
print(f'Testing r2 score: {pipe.score(X_test, y_test)}')
print(f'Testing MSE: {metrics.mean_squared_error(y_test, pipe.predict(X_test))}')

Training r2 score: 0.9998656151911552
Testing r2 score: 0.99879391357343
Testing MSE: 0.40872904199109983
