In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV

In [94]:
import os

In [31]:
# Read the obesity survey CSV from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='obesity_data.csv')
df.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [51]:
# Print the frequency table of every column, each followed by a blank separator line.
for col, column_values in df.items():
    frequency_table = column_values.value_counts()
    print(frequency_table, '\n')

Gender
Male      1068
Female    1043
Name: count, dtype: int64 

Age
21    236
23    218
26    213
18    212
19    169
22    163
20    150
24     95
25     82
17     69
31     62
30     53
27     43
33     37
38     34
29     33
34     29
32     26
39     25
37     24
41     20
28     20
16     20
35     19
40     19
44      6
36      6
42      6
55      5
45      3
43      3
46      2
51      2
48      1
61      1
14      1
56      1
15      1
52      1
47      1
Name: count, dtype: int64 

Height
1.70    125
1.75    122
1.62     96
1.76     96
1.65     88
1.60     77
1.72     76
1.63     75
1.77     71
1.71     68
1.74     66
1.67     66
1.64     66
1.79     65
1.78     64
1.61     62
1.68     61
1.80     59
1.66     58
1.69     54
1.82     50
1.73     43
1.84     40
1.56     39
1.81     39
1.85     39
1.83     35
1.55     32
1.57     30
1.59     29
1.58     27
1.53     27
1.87     22
1.86     21
1.54     20
1.52     19
1.50     17
1.91     12
1.51     11
1.88     10
1.90      7
1.89

In [74]:
class DataHandler:
    """Load a CSV file and prepare its feature / target frames for modelling.

    Attributes:
        file_path: Path handed to ``pd.read_csv`` by ``load_data``.
        data: Full DataFrame read from ``file_path`` (``None`` until ``load_data`` runs).
        input_df: Feature columns (``None`` until ``create_input_output`` runs).
        output_df: Target column (``None`` until ``create_input_output`` runs).
    """

    def __init__(self, file_path):
        """Remember ``file_path``; nothing is read until ``load_data`` is called."""
        self.file_path = file_path
        self.data = None
        self.input_df = None
        self.output_df = None

    def load_data(self):
        """Read the CSV at ``self.file_path`` into ``self.data``."""
        self.data = pd.read_csv(self.file_path)

    def create_input_output(self, target_column):
        """Split ``self.data`` into features and target.

        Args:
            target_column: Column label stored in ``self.output_df``; every other
                column goes to ``self.input_df``.
        """
        self.output_df = self.data[target_column]
        # DataFrame.drop returns a new frame, so later encoding does not touch self.data.
        self.input_df = self.data.drop(target_column, axis=1)

    def encode_ordinal(self, ordinal_columns, categories):
        """Replace ordered categorical columns of ``self.input_df`` with ordinal codes.

        Args:
            ordinal_columns: List of column labels to encode.
            categories: One ordered list of category values per entry in
                ``ordinal_columns``; passed to ``OrdinalEncoder(categories=...)``.

        Returns:
            The fitted ``OrdinalEncoder``.
        """
        print("\nBefore Ordinal Encoding:\n", self.input_df[ordinal_columns].head())
        ordinal_encoder = OrdinalEncoder(categories=categories)
        self.input_df[ordinal_columns] = ordinal_encoder.fit_transform(self.input_df[ordinal_columns])
        print("After Ordinal Encoding:\n", self.input_df[ordinal_columns].head())
        return ordinal_encoder

    def encode_label(self):
        """Label-encode every object-dtype column still present in ``self.input_df``.

        Returns:
            dict mapping each encoded column label to its fitted ``LabelEncoder``.
        """
        categorical_columns = self.input_df.select_dtypes(include=['object']).columns
        print("\nBefore Label Encoding:\n", self.input_df[categorical_columns].head())
        label_encoders = {}
        for column in categorical_columns:
            label_encoders[column] = LabelEncoder()
            self.input_df[column] = label_encoders[column].fit_transform(self.input_df[column])
        print("After Label Encoding:\n", self.input_df[categorical_columns].head())
        return label_encoders

    def normalize_data(self):
        """Replace ``self.input_df`` with a ``StandardScaler``-transformed copy.

        The column labels and the row index of the original frame are kept, so the
        scaled features stay aligned with ``self.output_df``.

        Returns:
            The fitted ``StandardScaler``.
        """
        print("\nBefore Normalization:\n", self.input_df.head())
        scaler = StandardScaler()
        self.input_df = pd.DataFrame(
            scaler.fit_transform(self.input_df),
            columns=self.input_df.columns,
            # Without index=, the result would get a fresh RangeIndex and lose row alignment.
            index=self.input_df.index,
        )
        print("After Normalization:\n", self.input_df.head())
        return scaler

In [88]:
class ModelHandler:
    """Train, tune, evaluate and persist a ``RandomForestClassifier``.

    Attributes:
        input_data / output_data: Features and target given to the constructor.
        model: Current (possibly unfitted) ``RandomForestClassifier``.
        x_train, x_test, y_train, y_test: Populated by ``split_data``.
        y_predict: Populated by ``makePrediction``.
    """

    # Directory used by save_model_to_file when the caller does not pass save_dir.
    DEFAULT_SAVE_DIR = r"C:\Primary\University\Semester 4\Model Deployment\MODELS"

    def __init__(self, input_data, output_data):
        """Store the data and build a default model; the split fields start as ``None``."""
        self.input_data = input_data
        self.output_data = output_data
        self.createModel()
        self.x_train, self.x_test, self.y_train, self.y_test, self.y_predict = [None] * 5

    @staticmethod
    def _transform_columns(frame, columns, transform):
        """Return a copy of ``frame`` with ``transform`` applied to each named column.

        ``columns`` may be a single label or a list / Index / array of labels.
        Working on a copy avoids chained assignment on a frame that is itself a
        slice of the full dataset.
        """
        if isinstance(columns, (list, pd.Index, np.ndarray)):
            labels = list(columns)
        else:
            labels = [columns]
        result = frame.copy()
        for label in labels:
            result[label] = transform(result[label])
        return result

    def checkAgeOutlierWithBox(self, kolom):
        """Show a box plot of column ``kolom`` of the training features."""
        self.x_train.boxplot(column=[kolom])
        plt.show()

    def createMeanFromColumn(self, kolom):
        """Return the mean of column ``kolom`` of the training features."""
        return np.mean(self.x_train[kolom])

    def createModel(self, criteria='gini', maxdepth=6, random_state=None):
        """Replace ``self.model`` with a new, unfitted random forest.

        Args:
            criteria: Passed as ``criterion``.
            maxdepth: Passed as ``max_depth``.
            random_state: Optional seed for reproducible forests; ``None`` (the
                default) keeps the estimator's unseeded behaviour.
        """
        self.model = RandomForestClassifier(
            criterion=criteria, max_depth=maxdepth, random_state=random_state
        )

    def dataConvertToNumeric(self, columns):
        """Coerce ``columns`` of the train and test features to numeric dtype.

        Values that cannot be parsed become NaN (``errors='coerce'``).
        ``columns`` may be one label or a list of labels.
        """
        def coerce(series):
            return pd.to_numeric(series, errors='coerce')

        self.x_train = self._transform_columns(self.x_train, columns, coerce)
        self.x_test = self._transform_columns(self.x_test, columns, coerce)

    def fillingNAWithNumbers(self, columns, number):
        """Fill missing values in ``columns`` of the train and test features with ``number``.

        ``columns`` may be one label or a list of labels. The result is assigned
        back to the frame; an in-place ``fillna`` on ``frame[columns]`` would only
        modify a temporary object.
        """
        def fill(series):
            return series.fillna(number)

        self.x_train = self._transform_columns(self.x_train, columns, fill)
        self.x_test = self._transform_columns(self.x_test, columns, fill)

    def makePrediction(self):
        """Predict the test features and store the result in ``self.y_predict``."""
        self.y_predict = self.model.predict(self.x_test)

    def createReport(self):
        """Print the classification report for ``self.y_predict`` against ``self.y_test``."""
        print('\nClassification Report\n')
        print(classification_report(self.y_test, self.y_predict))

    def split_data(self, test_size=0.2, random_state=42):
        """Split the stored data into train and test parts with ``train_test_split``."""
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.input_data, self.output_data, test_size=test_size, random_state=random_state
        )

    def train_model(self):
        """Fit ``self.model`` on the training split."""
        self.model.fit(self.x_train, self.y_train)

    def evaluate_model(self):
        """Return the accuracy of ``self.model`` on the test split."""
        predictions = self.model.predict(self.x_test)
        return accuracy_score(self.y_test, predictions)

    def tuningParameter(self):
        """Grid-search ``criterion`` and ``max_depth`` with 5-fold CV on the training split.

        Prints the best parameters and their CV accuracy, then replaces
        ``self.model`` with a new *unfitted* forest using those parameters;
        call ``train_model`` afterwards.
        """
        parameters = {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': [2, 4, 6, 8]
        }

        # Carry over the seed of the current model (None unless createModel was given one).
        seed = getattr(self.model, 'random_state', None)
        search = GridSearchCV(RandomForestClassifier(random_state=seed),
                              scoring='accuracy',
                              param_grid=parameters,
                              cv=5)

        search.fit(self.x_train, self.y_train)
        print("Tuned Hyperparameters :", search.best_params_)
        print("Accuracy :", search.best_score_)
        self.createModel(criteria=search.best_params_['criterion'],
                         maxdepth=search.best_params_['max_depth'],
                         random_state=seed)

    def save_model_to_file(self, filename, save_dir=None):
        """Pickle ``self.model`` to ``filename`` inside ``save_dir``.

        Args:
            filename: File name of the pickle.
            save_dir: Target directory; defaults to ``DEFAULT_SAVE_DIR``. The
                directory must already exist.

        Returns:
            The full path that was written.
        """
        target_dir = self.DEFAULT_SAVE_DIR if save_dir is None else save_dir
        full_path = os.path.join(target_dir, filename)
        with open(full_path, 'wb') as file:
            pickle.dump(self.model, file)
        return full_path

In [76]:
file_path = 'obesity_data.csv'
data_handler = DataHandler(file_path)
data_handler.load_data()

# The CSV stores its saved row index as an extra 'Unnamed: 0' column (it shows up in
# the df.head() preview of this file). Drop it before splitting so the row number is
# not fed to the model as a feature; errors='ignore' keeps this a no-op if it is absent.
data_handler.data = data_handler.data.drop(columns=['Unnamed: 0'], errors='ignore')

data_handler.create_input_output('NObeyesdad')

# CALC has a natural order, so it is ordinal-encoded; the remaining object columns
# are label-encoded. The fitted encoders are kept for reuse at inference time.
ord_enc = data_handler.encode_ordinal(['CALC'], [['no', 'Sometimes', 'Frequently', 'Always']])
lab_enc = data_handler.encode_label()

input_df = data_handler.input_df
output_df = data_handler.output_df


Before Ordinal Encoding:
          CALC
0          no
1   Sometimes
2  Frequently
3  Frequently
4   Sometimes
After Ordinal Encoding:
    CALC
0   0.0
1   1.0
2   2.0
3   2.0
4   1.0

Before Label Encoding:
    Gender family_history_with_overweight FAVC       CAEC SMOKE  SCC  \
0  Female                            yes   no  Sometimes    no   no   
1  Female                            yes   no  Sometimes   yes  yes   
2    Male                            yes   no  Sometimes    no   no   
3    Male                             no   no  Sometimes    no   no   
4    Male                             no   no  Sometimes    no   no   

                  MTRANS  
0  Public_Transportation  
1  Public_Transportation  
2  Public_Transportation  
3                Walking  
4  Public_Transportation  
After Label Encoding:
    Gender  family_history_with_overweight  FAVC  CAEC  SMOKE  SCC  MTRANS
0       0                               1     0     2      0    0       3
1       0                      

In [80]:
# Wrap the encoded features/target and hold out 20% of the rows as a test set (seed 42).
model_handler = ModelHandler(input_data=input_df, output_data=output_df)
model_handler.split_data(test_size=0.2, random_state=42)

In [90]:
def _fit_and_report(handler):
    """Train the handler's current model, then print its accuracy and classification report."""
    handler.train_model()
    print("\n\nModel Accuracy:", handler.evaluate_model())
    handler.makePrediction()
    handler.createReport()


# Baseline run with the default hyperparameters.
print("Pre-Tuning")
_fit_and_report(model_handler)
print("---------------------------------------------------")

# Grid-search the hyperparameters, refit with the best ones, then persist the model.
print("Post-Tuning")
model_handler.tuningParameter()
_fit_and_report(model_handler)
model_handler.save_model_to_file('MDBonus1.pkl')

Pre-Tuning


Model Accuracy: 0.9456264775413712

Classification Report

                     precision    recall  f1-score   support

Insufficient_Weight       1.00      0.96      0.98        56
      Normal_Weight       0.88      0.90      0.89        62
     Obesity_Type_I       0.97      0.96      0.97        78
    Obesity_Type_II       0.97      0.98      0.97        58
   Obesity_Type_III       1.00      1.00      1.00        63
 Overweight_Level_I       0.84      0.88      0.86        56
Overweight_Level_II       0.96      0.92      0.94        50

           accuracy                           0.95       423
          macro avg       0.95      0.94      0.94       423
       weighted avg       0.95      0.95      0.95       423

---------------------------------------------------
Post-Tuning
Tuned Hyperparameters : {'criterion': 'log_loss', 'max_depth': 8}
Accuracy : 0.9431320562569135


Model Accuracy: 0.9456264775413712

Classification Report

                     precision   

In [102]:
# Check that the directory the model is pickled into is present on this machine.
models_dir_found = os.path.exists(r"C:\Primary\University\Semester 4\Model Deployment\MODELS")
print("File exists:", models_dir_found)

File exists: True
