### Model Training
#### Import data and required packages
Importing Pandas, Numpy, Seaborn, Maplotlib and Warnings library

In [126]:
# Basic import
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib .pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Modeling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

#### Import the CSV Data as Pandas DataFrame

In [127]:
df = pd.read_csv(r"C:\Users\ysanc\OneDrive\Desktop\Callorie_Intake_calculator\data\balanc_diet.csv")

#### Show top 5 records

In [128]:
df.head()

Unnamed: 0,ID,Age,Gender,Working_Type,Sleep_Hours,Height_m,Required_Daily_Calories
0,5398,68.0,Female,Unemployed,6.627585,1.861374,1898.520157
1,3078,19.0,Female,Desk Job,5.671469,1.732762,2439.173224
2,6051,50.0,Female,Freelancer,7.060029,1.509387,1746.165518
3,6503,54.0,Male,Healthcare,4.154597,1.832796,2461.167138
4,3602,27.0,Female,Desk Job,3.84702,1.62467,2105.472452


In [129]:
df.shape

(10200, 7)

#### Droping Unnecessary columns

In [130]:
df = df.drop('ID', axis = 1)

In [131]:
df.head()

Unnamed: 0,Age,Gender,Working_Type,Sleep_Hours,Height_m,Required_Daily_Calories
0,68.0,Female,Unemployed,6.627585,1.861374,1898.520157
1,19.0,Female,Desk Job,5.671469,1.732762,2439.173224
2,50.0,Female,Freelancer,7.060029,1.509387,1746.165518
3,54.0,Male,Healthcare,4.154597,1.832796,2461.167138
4,27.0,Female,Desk Job,3.84702,1.62467,2105.472452


In [132]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10200 entries, 0 to 10199
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      10171 non-null  float64
 1   Gender                   10200 non-null  object 
 2   Working_Type             10171 non-null  object 
 3   Sleep_Hours              10171 non-null  float64
 4   Height_m                 10171 non-null  float64
 5   Required_Daily_Calories  10171 non-null  float64
dtypes: float64(4), object(2)
memory usage: 478.3+ KB


In [133]:
# Checking the missing values
df.isnull().sum()

Age                        29
Gender                      0
Working_Type               29
Sleep_Hours                29
Height_m                   29
Required_Daily_Calories    29
dtype: int64

#### Filling missing values

In [134]:
df['Age'] = df.groupby('Gender')['Age'].transform(lambda x: x.fillna(x.median()))
df['Sleep_Hours'] = df['Sleep_Hours'].interpolate()
df['Working_Type'] = df['Working_Type'].fillna(df['Working_Type'].mode()[0])
df['Height_m'] = df['Height_m'].fillna(df['Height_m'].median())
df['Required_Daily_Calories'] = df['Required_Daily_Calories'].fillna(df['Required_Daily_Calories'].median())

In [135]:
df.isnull().sum()

Age                        0
Gender                     0
Working_Type               0
Sleep_Hours                0
Height_m                   0
Required_Daily_Calories    0
dtype: int64

In [136]:
# Setting the Required daily calories at two decimal Points
df['Required_Daily_Calories'] = df['Required_Daily_Calories'].round(2)  

#### Preparing X and Y variables

In [137]:
x = df.drop(columns=['Required_Daily_Calories'],axis=1)
y = df['Required_Daily_Calories']

In [138]:
x.head()

Unnamed: 0,Age,Gender,Working_Type,Sleep_Hours,Height_m
0,68.0,Female,Unemployed,6.627585,1.861374
1,19.0,Female,Desk Job,5.671469,1.732762
2,50.0,Female,Freelancer,7.060029,1.509387
3,54.0,Male,Healthcare,4.154597,1.832796
4,27.0,Female,Desk Job,3.84702,1.62467


In [139]:
y.head()

0    1898.52
1    2439.17
2    1746.17
3    2461.17
4    2105.47
Name: Required_Daily_Calories, dtype: float64

In [140]:
# Create Column Transformer
num_features = x.select_dtypes(exclude="object").columns
cat_features = x.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [141]:
x = preprocessor.fit_transform(x)

In [142]:
x.shape

(10200, 13)

In [143]:
# Separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((8160, 13), (2040, 13))

In [144]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [145]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 291.8057
- Mean Absolute Error: 227.3660
- R2 Score: 0.5475
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 294.7980
- Mean Absolute Error: 230.2723
- R2 Score: 0.5549


Lasso
Model performance for Training set
- Root Mean Squared Error: 291.9244
- Mean Absolute Error: 227.5598
- R2 Score: 0.5471
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 295.0424
- Mean Absolute Error: 230.6751
- R2 Score: 0.5541


Ridge
Model performance for Training set
- Root Mean Squared Error: 291.8058
- Mean Absolute Error: 227.3717
- R2 Score: 0.5475
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 294.8094
- Mean Absolute Error: 230.2865
- R2 Score: 0.5548


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 217.0690
- Mean Absolute Error: 160.7566
- R2 Score: 0.

In [146]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
5,Random Forest Regressor,0.624411
3,K-Neighbors Regressor,0.611103
7,CatBoosting Regressor,0.592123
6,XGBRegressor,0.572891
0,Linear Regression,0.554855
2,Ridge,0.554821
1,Lasso,0.554117
8,AdaBoost Regressor,0.458924
4,Decision Tree,0.323842


In [147]:
Random_model = RandomForestRegressor()
Random_model = Random_model.fit(X_train, y_train)
y_pred = Random_model.predict(X_test)
score = r2_score(y_test, y_pred)*100
print(" Accuracy of the model is %.2f" %score)

 Accuracy of the model is 62.13


In [148]:
import joblib
joblib.dump(models['Random Forest Regressor'],'R_F_Regressor_heart.pkl')
joblib.dump(preprocessor,'preprocessor.pkl')
joblib.dump(df.columns.tolist(),'columns.pkl')

['columns.pkl']

In [149]:
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
10078,2004.82,2235.3003,-230.4803
3742,2190.43,2433.3792,-242.9492
10068,2396.55,2040.5399,356.0101
5723,2329.08,2651.9382,-322.8582
8987,2503.04,2444.6493,58.3907
...,...,...,...
5768,2632.39,2554.0849,78.3051
4906,1870.00,1906.1422,-36.1422
6746,1500.47,1965.6265,-465.1565
2820,3404.47,3321.7428,82.7272
