In [32]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [33]:
df = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [34]:
file_path = "ObesityDataSet_raw_and_data_sinthetic.csv"

In [42]:
class DataHandler:
    def __init__(self, file_path, target_column):
        self.file_path = file_path
        self.target_column = target_column
        self.data = None
        self.input_df = None
        self.output_df = None

    def load_data(self):
        self.data = pd.read_csv(self.file_path)
        self.create_input_output()

    def create_input_output(self):
        self.output_df = self.data[self.target_column]
        self.input_df = self.data.drop(self.target_column, axis=1)

    def preprocess(self):
        imputer = SimpleImputer(strategy='most_frequent')
        self.data = pd.DataFrame(imputer.fit_transform(self.data), columns=self.data.columns)

        binary_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
        binary_map = {'Male': 1, 'Female': 0, 'yes': 1, 'no': 0}
        for col in binary_cols:
            self.data[col] = self.data[col].map(binary_map)

        label_cols = ['CAEC', 'CALC']
        encoder = LabelEncoder()
        for col in label_cols:
            self.data[col] = encoder.fit_transform(self.data[col])

        mtrans_map = {'Public_Transportation': 0, 'Automobile': 1, 'Walking': 2, 'Motorbike': 3, 'Bike': 4}
        self.data['MTRANS'] = self.data['MTRANS'].map(mtrans_map).astype(int)

        numerical_columns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
        z_scores = np.abs((self.data[numerical_columns] - self.data[numerical_columns].mean()) / self.data[numerical_columns].std())
        outliers = z_scores > 3
        for col in numerical_columns:
            median_value = self.data[col].median()
            self.data.loc[outliers[col], col] = median_value

        scaler = StandardScaler()
        self.data[numerical_columns] = scaler.fit_transform(self.data[numerical_columns])

        return self.data

In [36]:
class ModelHandler:
    def __init__(self, input_data, output_data):
        self.input_data = input_data
        self.output_data = output_data
        self.model = None
        self.x_train, self.x_test, self.y_train, self.y_test, self.y_predict = [None] * 5

    def create_model(self, criteria='gini', max_depth=6):
        self.model = RandomForestClassifier(criterion=criteria, max_depth=max_depth)

    def split_data(self, test_size=0.2, random_state=42):
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.input_data, self.output_data, test_size=test_size, random_state=random_state)

    def train_model(self):
        self.model.fit(self.x_train, self.y_train)

    def make_prediction(self):
        self.y_predict = self.model.predict(self.x_test)

    def evaluate_model(self):
        return accuracy_score(self.y_test, self.y_predict)

    def tuning_parameter(self):
        parameters = {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': [2, 4, 6, 8]
        }
        RFClass = RandomForestClassifier()
        RFClass = GridSearchCV(RFClass,
                               scoring='accuracy',
                               param_grid=parameters,
                               cv=5)

        RFClass.fit(self.x_train, self.y_train)
        print("Tuned Hyperparameters ", RFClass.best_params_)
        print("Accuracy ", RFClass.best_score_)

        self.create_model(criteria=RFClass.best_params_['criterion'], max_depth=RFClass.best_params_['max_depth'])

    def create_report(self):
        print("\nClassification Report\n")
        target_names = self.y_test.unique().astype(str)
        print(classification_report(self.y_test, self.y_predict, target_names=target_names))


    def save_model_to_file(self, filename):
        with open(filename, 'wb') as file:
            pickle.dump(self.model, file)

In [43]:
target_column = 'NObeyesdad'

data_handler = DataHandler(file_path, target_column)
data_handler.load_data()
data_handler.preprocess()
data_handler.create_input_output()

input_df = data_handler.input_df
output_df = data_handler.output_df

In [44]:
model_handler = ModelHandler(input_df, output_df)
model_handler.split_data()

In [45]:
print("Before Tuning Parameter")
model_handler.create_model()
model_handler.train_model()
model_handler.make_prediction()
print("Model Accuracy: ", model_handler.evaluate_model())
model_handler.create_report()

Before Tuning Parameter
Model Accuracy:  0.900709219858156

Classification Report

                     precision    recall  f1-score   support

Insufficient_Weight       0.93      0.96      0.95        56
   Obesity_Type_III       0.83      0.84      0.83        62
      Normal_Weight       0.94      0.87      0.91        78
    Obesity_Type_II       0.92      0.98      0.95        58
 Overweight_Level_I       1.00      1.00      1.00        63
     Obesity_Type_I       0.86      0.79      0.82        56
Overweight_Level_II       0.80      0.86      0.83        50

           accuracy                           0.90       423
          macro avg       0.90      0.90      0.90       423
       weighted avg       0.90      0.90      0.90       423



In [46]:
print("After Tuning Parameter")
model_handler.tuning_parameter()
model_handler.train_model()
model_handler.make_prediction()

After Tuning Parameter
Tuned Hyperparameters  {'criterion': 'entropy', 'max_depth': 8}
Accuracy  0.9401699647077415


In [47]:
print("Model Accuracy: ", model_handler.evaluate_model())
model_handler.create_report()

Model Accuracy:  0.9385342789598109

Classification Report

                     precision    recall  f1-score   support

Insufficient_Weight       0.96      0.96      0.96        56
   Obesity_Type_III       0.87      0.87      0.87        62
      Normal_Weight       0.96      0.96      0.96        78
    Obesity_Type_II       0.97      0.98      0.97        58
 Overweight_Level_I       1.00      1.00      1.00        63
     Obesity_Type_I       0.87      0.84      0.85        56
Overweight_Level_II       0.92      0.94      0.93        50

           accuracy                           0.94       423
          macro avg       0.94      0.94      0.94       423
       weighted avg       0.94      0.94      0.94       423



In [48]:
model_handler.save_model_to_file('trained_model.pkl')