In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, FunctionTransformer, PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

%matplotlib inline

## Training Data Preparation

In [2]:
# import training data set
pp_train = pd.read_csv("../data/progresspics_2018_training_data.cvs")

In [3]:
pp_train.head()

Unnamed: 0,sex,age,height,start_weight,end_weight,score,num_comments,num_posts,num_posts_cat,post_order,...,gym,lifting,working,diet,muscle,nsfw,weight_diff,gain_or_lose,fill_period_weeks,change_rate
0,1.0,34.0,68.0,189.0,175.0,1338,42,3,"(2.0, 4.0]",3.0,...,0,0,0,0,0,0,14.0,0,312.0,0.044872
1,1.0,35.0,76.0,316.0,268.0,1,1,0,"(-inf, 0.0]",0.0,...,0,1,0,0,0,0,48.0,0,16.0,3.0
2,1.0,33.0,74.0,260.0,220.0,1,0,8,"(4.0, 8.0]",3.0,...,0,0,0,0,0,0,40.0,0,10.0,4.0
3,1.0,27.0,74.0,235.0,170.0,44,2,3,"(2.0, 4.0]",3.0,...,0,0,0,0,0,0,65.0,0,59.888934,1.085342
4,1.0,17.0,73.0,165.3465,211.64352,1,1,1,"(0.0, 1.0]",1.0,...,0,0,0,0,0,0,-46.29702,1,156.0,-0.296776


In [4]:
pp_train.shape

(17187, 29)

In [5]:
pp_train.columns

Index(['sex', 'age', 'height', 'start_weight', 'end_weight', 'score',
       'num_comments', 'num_posts', 'num_posts_cat', 'post_order', 'month',
       'dayofweek', 'progress', 'face', 'goal', 'finally', 'cico', 'keto',
       'gains', 'gym', 'lifting', 'working', 'diet', 'muscle', 'nsfw',
       'weight_diff', 'gain_or_lose', 'fill_period_weeks', 'change_rate'],
      dtype='object')

In [6]:
# create the feature set by dropping the target, "end_weight", "score" which is highly correlated 
# with "num_comments", "weigh_diff" which can be used with "start_weight" to exactly predict "end_weight",
# and "change_rate" which can be used with "fill_period_weeks" and "start_weight" to exactly predict "end_weight". 

# also drop "num_post_cat" which was found to be less useful than "num_posts" in model testing

# set the target, "end_weight", equal to y

pp_train_features = pp_train.drop(["end_weight", "score", "weight_diff", "change_rate", "num_posts_cat"], axis=1)
y = pp_train.loc[:, 'end_weight']

In [7]:
# Are there any Nans?
pp_train_features.isnull().sum()

sex                  0
age                  0
height               0
start_weight         0
num_comments         0
num_posts            0
post_order           0
month                0
dayofweek            0
progress             0
face                 0
goal                 0
finally              0
cico                 0
keto                 0
gains                0
gym                  0
lifting              0
working              0
diet                 0
muscle               0
nsfw                 0
gain_or_lose         0
fill_period_weeks    0
dtype: int64

In [8]:
# Standard scale numerical columns and one-hot encode categorical columns that include categories other than 0 and 1. 
num_columns = ['age', 'height', 'start_weight', 'fill_period_weeks', 'num_comments']
cat_columns = ['month', 'dayofweek', 'num_posts', 'post_order']
    
col_preprocessing = ColumnTransformer([
    ('numeric_col_preprocessing', StandardScaler(), num_columns),
    ('cat_col_preprocessing', OneHotEncoder(sparse=False, categories='auto'), cat_columns)], 
    remainder='passthrough', verbose=True)

data_prep_pipeline = Pipeline([
    ('col_preprocessing', col_preprocessing)],
    verbose=True)

X = data_prep_pipeline.fit_transform(pp_train_features.copy())

[ColumnTransformer]  (1 of 3) Processing numeric_col_preprocessing, total=   0.0s
[ColumnTransformer]  (2 of 3) Processing cat_col_preprocessing, total=   0.0s
[ColumnTransformer] ..... (3 of 3) Processing remainder, total=   0.0s
[Pipeline] . (step 1 of 1) Processing col_preprocessing, total=   0.0s


In [9]:
X.shape

(17187, 79)

In [10]:
X[0, :]

array([ 1.29083369,  0.015753  , -0.50173119,  3.84479427,  0.657463  ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

## Training the selected model - Random Forest

In [11]:
# train Random Forest regression model using the normal dataset
final_RF_reg = RandomForestRegressor(n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_features='auto',
                                     max_depth=None, bootstrap=True)
final_RF_reg.fit(X, y)

RandomForestRegressor(n_estimators=1400)

## Testing Data Preparation

In [12]:
pp_test = pd.read_csv("../data/progresspics_2018_testing_data.cvs")

In [13]:
pp_test.shape

(4297, 29)

In [14]:
pp_test.head()

Unnamed: 0,sex,age,height,start_weight,end_weight,score,num_comments,num_posts,num_posts_cat,post_order,...,gym,lifting,working,diet,muscle,nsfw,weight_diff,gain_or_lose,fill_period_weeks,change_rate
0,1.0,23.0,72.0,147.0,156.0,90,18,3,"(2.0, 4.0]",3.0,...,0,0,0,0,0,0,-9.0,1,32.0,-0.28125
1,1.0,24.0,70.0,275.0,170.0,158,12,1,"(0.0, 1.0]",1.0,...,0,0,0,1,0,0,105.0,0,63.34077,1.6577
2,0.0,52.0,65.0,191.0,172.0,727,21,1,"(0.0, 1.0]",1.0,...,0,0,0,0,0,0,19.0,0,54.132889,0.350988
3,1.0,43.0,71.0,205.0,175.0,79,12,0,"(-inf, 0.0]",0.0,...,0,0,0,0,0,0,30.0,0,72.0,0.416667
4,1.0,21.0,69.0,138.0,148.0,46,5,1,"(0.0, 1.0]",1.0,...,0,0,0,0,0,0,-10.0,1,156.0,-0.064103


In [15]:
pp_test_features = pp_test.drop(["end_weight", "score", "weight_diff", "change_rate", "num_posts_cat"], axis=1)
y_test = pp_test.loc[:, 'end_weight']

In [16]:
pp_test_features.isnull().sum()

sex                  0
age                  0
height               0
start_weight         0
num_comments         0
num_posts            0
post_order           0
month                0
dayofweek            0
progress             0
face                 0
goal                 0
finally              0
cico                 0
keto                 0
gains                0
gym                  0
lifting              0
working              0
diet                 0
muscle               0
nsfw                 0
gain_or_lose         0
fill_period_weeks    0
dtype: int64

In [17]:
X_test = data_prep_pipeline.transform(pp_test_features.copy())

In [18]:
X_test.shape

(4297, 79)

## Use trained models to predict outcomes based on the prepared test features

In [19]:
final_RF_reg_predictions = final_RF_reg.predict(X_test)

## Scores 

In [20]:
final_RF_mse = mean_squared_error(y_test, final_RF_reg_predictions)
final_RF_rmse = np.sqrt(final_RF_mse)
final_RF_r2 = r2_score(y_test, final_RF_reg_predictions)
print("Random Forest")
print("RMSE: ", final_RF_rmse)
print("R2: ", final_RF_r2)

Random Forest
RMSE:  20.261883787160286
R2:  0.797661433361521
