### Placement Assignment Machine Learning

#### Q-2. Imagine you have a dataset where you have different features like Age, Gender , Height , Weight , BMI , and Blood Pressure and you have to classify the people into different classes like Normal , Overweight , Obesity , Underweight , and Extreme Obesity by using any 4 different classification algorithms. Now you have to build a model which can classify people into different classes.

    Gender
    Age
    Height
    Weight
    family_history_with_overweight
    FAVC (Frecuent consumption of high caloric foods)
    FCVC (Frequency of consumption of vegetables)
    NCP (Number of main meals)
    CAEC (Consumption of food between meals)
    SMOKE (Smoking habit)
    CH2O (Consumption of water daily)
    SCC (Calories consumption monitoring)
    FAF (Physical activity frequency)
    TUE (Time using technology devices)
    CALC (Consumption of alcohol)
    MTRANS (Transportation used)
    NObeyesdad (Number of Obesity Diseases)

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_score
import re
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

In [3]:
data= pd.read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")
data.head(2)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight


In [4]:
df = data.copy()

In [5]:
df['NObeyesdad'].value_counts()

Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: NObeyesdad, dtype: int64

In [6]:
dic_to_replace = {"NObeyesdad": {"Insufficient_Weight": 0, 
                          "Normal_Weight": 1,
                         'Overweight_Level_I': 2,
                          'Overweight_Level_II': 3,
                          'Obesity_Type_I': 4,
                          'Obesity_Type_II': 5,
                          'Obesity_Type_III': 6,}
                         }
df.replace(dic_to_replace, inplace=True)

In [7]:
df = df.rename(columns={'family_history_with_overweight': 'FHWO'})

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      2111 non-null   object 
 1   Age         2111 non-null   float64
 2   Height      2111 non-null   float64
 3   Weight      2111 non-null   float64
 4   FHWO        2111 non-null   object 
 5   FAVC        2111 non-null   object 
 6   FCVC        2111 non-null   float64
 7   NCP         2111 non-null   float64
 8   CAEC        2111 non-null   object 
 9   SMOKE       2111 non-null   object 
 10  CH2O        2111 non-null   float64
 11  SCC         2111 non-null   object 
 12  FAF         2111 non-null   float64
 13  TUE         2111 non-null   float64
 14  CALC        2111 non-null   object 
 15  MTRANS      2111 non-null   object 
 16  NObeyesdad  2111 non-null   int64  
dtypes: float64(8), int64(1), object(8)
memory usage: 280.5+ KB


In [9]:
df.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,NObeyesdad
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866,3.112269
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927,1.985062
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0,1.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535,3.0
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0,5.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0,6.0


In [10]:
df.isnull().sum()

Gender        0
Age           0
Height        0
Weight        0
FHWO          0
FAVC          0
FCVC          0
NCP           0
CAEC          0
SMOKE         0
CH2O          0
SCC           0
FAF           0
TUE           0
CALC          0
MTRANS        0
NObeyesdad    0
dtype: int64

In [11]:
y = df[['NObeyesdad']]
X = df.drop(["NObeyesdad"], axis= 1)

In [12]:
numerical_features = X.select_dtypes(include="number")
categorical_features = X.select_dtypes(include="object")

In [13]:
categorical_features = X.select_dtypes(include="object").columns
numerical_features = X.select_dtypes(exclude="object").columns

In [14]:
Gender_categories = ['Female', 'Male']
FHWO_categories = ['yes', 'no']
FAVC_categories = ['no', 'yes']
CAEC_categories = ['Sometimes', 'Frequently', 'Always', 'no']
SMOKE_categories = ['no', 'yes']
SCC_categories = ['no', 'yes']
CALC_categories = ['no', 'Sometimes', 'Frequently', 'Always']
MTRANS_categories = ['Public_Transportation', 'Walking', 'Automobile', 'Motorbike','Bike']

In [15]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
    ]
)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[Gender_categories,FHWO_categories,FAVC_categories,CAEC_categories,SMOKE_categories,SCC_categories,
                                                 CALC_categories,MTRANS_categories])),
    ('scaler',StandardScaler())
    ]
)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_features),
('cat_pipeline',cat_pipeline,categorical_features)
])

In [16]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)


In [17]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [18]:
def evaluate_model(true, predicted):
    accuracy_score1 = accuracy_score(true, predicted)
    return accuracy_score1

In [19]:
models={
    'LogisticRegression':LogisticRegression(max_iter=1000),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
    'RandomForestClassifier':RandomForestClassifier(),
    'XGBClassifier':XGBClassifier()
}

trained_model_list=[]
model_list=[]
accuracy_score_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)
    #print(y_pred.shape)
    #print(y_test.shape)
    accuracy_score1=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])


    print("Accuracy score",accuracy_score1)

    accuracy_score_list.append(accuracy_score1)
    
    print('='*35)
    print('\n')

LogisticRegression
Accuracy score 0.8943217665615142


DecisionTreeClassifier
Accuracy score 0.9290220820189274


RandomForestClassifier
Accuracy score 0.9574132492113565


XGBClassifier
Accuracy score 0.9668769716088328


