# Model Training

## 1.1 Import Data and Required Packages

In [118]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier


from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
import warnings

#### imporing csv data as dataframe

In [119]:
df =  pd.read_csv("D:\Sanjay\github\Assestments\ML-assig-2\Dataset\ObesityDataSet_raw_and_data_sinthetic.csv")

In [120]:
X= df.drop(columns="NObeyesdad", axis=1)
X.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation


In [121]:
y = df['NObeyesdad']
y

0             Normal_Weight
1             Normal_Weight
2             Normal_Weight
3        Overweight_Level_I
4       Overweight_Level_II
               ...         
2106       Obesity_Type_III
2107       Obesity_Type_III
2108       Obesity_Type_III
2109       Obesity_Type_III
2110       Obesity_Type_III
Name: NObeyesdad, Length: 2111, dtype: object

In [122]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()


preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [123]:
X = preprocessor.fit_transform(X)

In [111]:
X.shape

(2111, 31)

In [124]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Encode the target variable
y_encoded = label_encoder.fit_transform(y)

In [125]:
# Encode the target variable
y_encoded = label_encoder.fit_transform(y)

In [126]:
y_encoded

array([1, 1, 1, ..., 4, 4, 4])

In [127]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_encoded,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((1688, 31), (423, 31))

## Create an Evaluate Function to give all metrics after model Training

In [128]:
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    
    conf_mat = confusion_matrix(true, predicted)
    true_positive = conf_mat[0][0]
    false_positive = conf_mat[0][1]
    false_negative  = conf_mat[1][0]
    true_negative  = conf_mat[1][1]
    
    precision = (true_positive)/(true_positive +  false_positive)
    recall = true_positive/(true_positive + false_negative)
    f1_score = 2*(recall*precision)/(recall+precision)
    
    return accuracy, precision, recall, f1_score

In [129]:
models = {
    "Logistic Regression": LogisticRegression(),
    "SVC": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random ForestClassifierr": RandomForestClassifier(),
    "Ada Boost Classifier": AdaBoostClassifier(), 
    "Bagging Classifier": BaggingClassifier()
}

model_list = []
model_accuracy =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_accuracy, model_train_precision, model_train_recall, model_train_f1_score = evaluate_model(y_train, y_train_pred)

    model_test_accuracy, model_test_precision, model_test_recall, model_test_f1_score = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print("- Precision: {:.4f}".format(model_train_precision))
    print("- Recall: {:.4f}".format(model_train_recall))
    print("- F1 Score: {:.4f}".format(model_train_f1_score))

    print('----------------------------------')
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_test_accuracy))
    print("- Precision: {:.4f}".format(model_test_precision))
    print("- Recall: {:.4f}".format(model_test_recall))
    print("- F1 Score: {:.4f}".format(model_test_f1_score))
    model_accuracy.append(model_test_accuracy)
    print('='*35)
    print('\n')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression
Model performance for Training set
- Accuracy: 0.8975
- Precision: 0.9861
- Recall: 0.9025
- F1 Score: 0.9425
----------------------------------
Model performance for Training set
- Accuracy: 0.8723
- Precision: 1.0000
- Recall: 0.8485
- F1 Score: 0.9180


SVC
Model performance for Training set
- Accuracy: 0.9621
- Precision: 0.9769
- Recall: 0.9860
- F1 Score: 0.9814
----------------------------------
Model performance for Training set
- Accuracy: 0.9338
- Precision: 0.9643
- Recall: 0.9643
- F1 Score: 0.9643


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
----------------------------------
Model performance for Training set
- Accuracy: 0.9362
- Precision: 0.9643
- Recall: 0.9153
- F1 Score: 0.9391


Random ForestClassifierr
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
----------------------------------
Model performance for

## Results

In [130]:
model_list

['Logistic Regression',
 'SVC',
 'Decision Tree',
 'Random ForestClassifierr',
 'Ada Boost Classifier',
 'Bagging Classifier']

In [131]:
pd.DataFrame(list(zip(model_list, model_accuracy)), columns=['Model Name', 'Model Accuracy']).sort_values(by=["Model Accuracy"],ascending=False)


Unnamed: 0,Model Name,Model Accuracy
3,Random ForestClassifierr,0.945626
5,Bagging Classifier,0.945626
2,Decision Tree,0.93617
1,SVC,0.933806
0,Logistic Regression,0.87234
4,Ada Boost Classifier,0.269504


## Bagging Classifier

In [132]:
Bag_model = BaggingClassifier()
Bag_model = Bag_model.fit(X_train, y_train)
y_pred = Bag_model.predict(X_test)
score = accuracy_score(y_test, y_pred)*100
print(" Accuracy of the model is %.2f" %score)

 Accuracy of the model is 94.80


In [133]:
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
0,0,0,0
1,4,4,0
2,0,0,0
3,0,0,0
4,1,1,0
...,...,...,...
418,5,5,0
419,3,3,0
420,1,1,0
421,3,3,0


In [134]:
y_decoded = label_encoder.inverse_transform(y_pred)

In [135]:
y_decoded

array(['Insufficient_Weight', 'Obesity_Type_III', 'Insufficient_Weight',
       'Insufficient_Weight', 'Normal_Weight', 'Insufficient_Weight',
       'Obesity_Type_II', 'Overweight_Level_I', 'Obesity_Type_I',
       'Insufficient_Weight', 'Obesity_Type_I', 'Obesity_Type_III',
       'Obesity_Type_II', 'Obesity_Type_II', 'Obesity_Type_III',
       'Overweight_Level_II', 'Obesity_Type_III', 'Obesity_Type_II',
       'Obesity_Type_I', 'Overweight_Level_II', 'Obesity_Type_I',
       'Overweight_Level_I', 'Obesity_Type_III', 'Obesity_Type_I',
       'Normal_Weight', 'Overweight_Level_II', 'Overweight_Level_II',
       'Obesity_Type_III', 'Obesity_Type_III', 'Insufficient_Weight',
       'Obesity_Type_III', 'Overweight_Level_I', 'Overweight_Level_II',
       'Normal_Weight', 'Insufficient_Weight', 'Obesity_Type_III',
       'Obesity_Type_I', 'Normal_Weight', 'Normal_Weight',
       'Normal_Weight', 'Normal_Weight', 'Obesity_Type_II',
       'Normal_Weight', 'Normal_Weight', 'Overweight_Level

In [136]:
df

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


In [161]:
df.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [183]:
row_list = df.loc[4].tolist()
print(row_list.pop(-1))
row_list
column = [feature for feature in df.columns if feature != 'NObeyesdad']

Overweight_Level_II


In [189]:
column

['Gender',
 'Age',
 'Height',
 'Weight',
 'family_history_with_overweight',
 'FAVC',
 'FCVC',
 'NCP',
 'CAEC',
 'SMOKE',
 'CH2O',
 'SCC',
 'FAF',
 'TUE',
 'CALC',
 'MTRANS']

In [184]:
row_list = ['Male',27.0,1.78,89.8,'no','no',2.0,1.0,'Sometimes','no',2.0,'no',0.0,0.0,'Sometimes','Public_Transportation']

['Male',
 22.0,
 1.78,
 89.8,
 'no',
 'no',
 2.0,
 1.0,
 'Sometimes',
 'no',
 2.0,
 'no',
 0.0,
 0.0,
 'Sometimes',
 'Public_Transportation']

In [185]:
input_data=pd.DataFrame([row_list],columns=column)
input_data

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation


In [186]:
transformed_input=preprocessor.transform(input_data)

In [187]:
predc=Bag_model.predict(transformed_input)

In [192]:
predc

array([6])

In [191]:
type(predc)

numpy.ndarray

In [190]:
label_encoder.inverse_transform(predc)

array(['Overweight_Level_II'], dtype=object)