In [6]:
# Importing the libraries
import pandas as pd #using for data modeling
import numpy as np #using for performing data analysis and manipulation
import os
import matplotlib.pyplot as plt #using for data plotting & visualiztion
import seaborn as sns #using for data visualization
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix, recall_score

In [7]:
#defining the filepath
data_path = os.path.join('C:/Users/shaik/Downloads/ObesityDataset.csv')

#using for reading the file into dataframe
data = pd.read_csv(data_path)

In [8]:
#desplaying the first 5 rows of dataframe
data.head() 

Unnamed: 0,ID,Age,Gender,Height,Weight,BMI,Label
0,1,25,Male,175.0,80,25.3,Normal Weight
1,2,30,Female,160.0,60,22.5,Normal Weight
2,3,35,Male,180.0,90,27.3,Overweight
3,4,40,Female,150.0,50,20.0,Underweight
4,5,45,Male,190.0,100,31.2,Obese


In [9]:
#using print function of python & data.shape to show the rows and columns of the dataset
print(
    f"Rows:{data.shape[0]} & Columns: {data.shape[1]}"
)
   

Rows:108 & Columns: 7


In [10]:
data['Label'].value_counts()

Label
Underweight      47
Normal Weight    29
Overweight       20
Obese            12
Name: count, dtype: int64

In [11]:
data.select_dtypes(include=['object']).columns.tolist()

['Gender', 'Label']

In [12]:
# encoding the object type data

obj_cols = data.select_dtypes(include=['object']).columns.tolist()

from sklearn.preprocessing import LabelEncoder
# Create a label encoder object
le = LabelEncoder()

for obj in obj_cols:
    le.fit(data[obj])
    data[obj] = le.transform(data[obj])

In [13]:
#I am using data.corr() here to identify the correlation between the data of the following dataset.
#This will output a table showing the correlation coefficients between each pair of columns in the DataFrame, 
#providing insights into how the variables relate to each other.
data.corr()

Unnamed: 0,ID,Age,Gender,Height,Weight,BMI,Label
ID,1.0,-0.24713,-0.025706,-0.016544,-0.572625,-0.361848,0.347199
Age,-0.24713,1.0,-0.108841,-0.115135,0.438498,0.216308,-0.17213
Gender,-0.025706,-0.108841,1.0,0.872972,0.435501,0.328245,-0.283188
Height,-0.016544,-0.115135,0.872972,1.0,0.425557,0.272056,-0.231031
Weight,-0.572625,0.438498,0.435501,0.425557,1.0,0.594829,-0.565555
BMI,-0.361848,0.216308,0.328245,0.272056,0.594829,1.0,-0.341594
Label,0.347199,-0.17213,-0.283188,-0.231031,-0.565555,-0.341594,1.0


In [14]:
#Preparing data  for training and testing the model.
X = data.drop('Label',axis=1)
y = data['Label']

# Importing the train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [15]:
# Before smote
X_train.shape

(86, 6)

In [16]:
y_train.shape

(86,)

In [17]:
# Class distribution before resampling
y_train.value_counts()

Label
3    40
0    20
2    17
1     9
Name: count, dtype: int64

In [18]:
#I am using SMOTE here to oversample dataset in case if there remains any missing value on the dataset
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Step 1: Handle missing values in X_train
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

# Step 2: Apply SMOTE
sm = SMOTE(random_state=5)
X_train_res, y_train_res = sm.fit_resample(X_train_imputed, y_train)


In [19]:
y_train_res.value_counts()

Label
2    40
1    40
3    40
0    40
Name: count, dtype: int64

In [20]:
y_train_res.shape , X_train_res.shape

((160,), (160, 6))

I am using SVC model which stands for Support Vector Classification used for model training. It can be applied on labeled data which comes under supervised learning algorithm. It works by finding the hyperplane that best seperates the classes in the feature space

In [21]:
from sklearn.svm import SVC
svc = SVC()

In [22]:
#by calling vc.fit(X_train_res, y_train_res) we can essentially train the svc model to learn the patterns in the resampled
#training data.
svc.fit(X_train_res,y_train_res)

In [23]:
#predicting the data
svc_pred = svc.predict(X_test)
svc_pred



array([3, 0, 2, 0, 0, 1, 2, 1, 1, 2, 3, 0, 2, 1, 0, 0, 3, 2, 0, 3, 0, 0])

In [24]:
print(f"Accuracy with support vector classification : {accuracy_score(y_test,svc_pred)}")

Accuracy with support vector classification : 0.6818181818181818


I am using Decision tree classifier algorithm for cllassifiying the data which comes under supervised learning. This algorithm works recursively by partitioning the feature space into smaller regions ultimately creating a tree like structure where each internal node repressnts a feature test, each branch  represents thee outcome of the test and each leaaf node represents a class label.
It evaluates  each feature  based on  certain criteria which is best and then spits the dataset. It keeps repeating until  a stopping criteria is met.
It then creates leaf node which contains  majority class of the samples of the feature space.
Afterwards it makes the prediction  by traversing the tree.

In [25]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()

In [26]:
dtc.fit(X_train_res,y_train_res)

In [27]:
dtc_pred = dtc.predict(X_test)



In [28]:
print(f"Accuracy score for Decission tree Classifier {accuracy_score(y_test,dtc_pred)}")

Accuracy score for Decission tree Classifier 0.9545454545454546


I am using Randomn forest classifier which is ensamble learning and a corrected version of the decision tree classifier. It creates bootstrap dataset from original data by randomly choosing data by creating multiple random decision trees. The decision will be based on the majority calculation of the results.

In [29]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

rfc.fit(X_train_res,y_train_res)

In [30]:
rfc_pred = rfc.predict(X_test)
rfc_pred



array([3, 0, 2, 3, 0, 2, 2, 1, 1, 2, 3, 0, 0, 1, 0, 3, 3, 0, 0, 3, 0, 3])

In [31]:
print(f"Accuracy score for Random Forest Classifier {accuracy_score(y_test,rfc_pred)}")

Accuracy score for Random Forest Classifier 0.9545454545454546


Here I have used xgboost. XGBoost is the improved version of Gradient boosing algorithm. It is so powerful library which delivers great performance with high speed. 

In [32]:
import xgboost as xgb

In [33]:
xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=5)
xgb_model.fit(X_train_res, y_train_res)

xgb_pred = xgb_model.predict(X_test)
xgb_pred

array([3, 0, 2, 3, 0, 2, 2, 1, 1, 2, 3, 0, 0, 1, 0, 3, 3, 0, 0, 3, 0, 3],
      dtype=int64)

In [34]:
print(f"Accuracy score for Extreme Gradient Boosting Classifier {accuracy_score(y_test,xgb_pred)}")

Accuracy score for Extreme Gradient Boosting Classifier 0.9545454545454546


Accuracy with support vector classification : 0.6818181818181818
Accuracy score for Decission tree Classifier 0.9545454545454546
Accuracy score for Random Forest Classifier 0.9545454545454546
Accuracy score for Extreme Gradient Boosting Classifier 0.9545454545454546