In [1]:
import mlflow

In [16]:
mlflow.set_tracking_uri("file:///C:/Users/91637/obesity predicting/mlruns")

In [17]:
mlflow.set_experiment("logistic")

2024/10/29 16:22:06 INFO mlflow.tracking.fluent: Experiment with name 'logistic' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/91637/obesity predicting/mlruns/192032176498569888', creation_time=1730199126829, experiment_id='192032176498569888', last_update_time=1730199126829, lifecycle_stage='active', name='logistic', tags={}>

In [33]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_class_weight

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from pytorch_tabnet.tab_model import TabNetClassifier

import warnings
warnings.filterwarnings('ignore')

In [21]:
df = pd.read_excel(r"E:\infosys internship\obesity_data_with_status_50000.xlsx")
df.head()

Unnamed: 0,Age,Gender,Height_cm,Weight_kg,BMI,Physical_Activity_Level,Diet_Type,Smoking_Habits,Alcohol_Consumption,Family_History_Obesity,Blood_Pressure,Cholesterol_Levels,Education_Level,Income_Level,Geographical_Region,Obesity_Status
0,56,0,172.005936,68.686179,23.215738,4,1,2,0,1,Hypertension Stage 2,2,1,3,2,Normal weight
1,69,1,161.678497,79.937723,30.580688,2,0,0,0,0,Elevated,1,2,1,3,Obese
2,46,1,172.078597,99.707918,33.672545,4,1,1,1,0,Normal,3,2,3,2,Obese
3,32,1,176.845709,76.314302,24.40152,1,2,0,1,0,Normal,3,1,2,1,Normal weight
4,60,1,170.875161,84.39786,28.905039,2,1,0,2,1,Elevated,2,3,3,2,Overweight


In [22]:
df.shape

(50000, 16)

In [23]:
label_encoder = LabelEncoder()
df['Obesity_Status'] = label_encoder.fit_transform(df['Obesity_Status'])
df['Blood_Pressure'] = label_encoder.fit_transform(df['Blood_Pressure'])

In [24]:
def remove_outliers_zscore(df, column_names, threshold=3):
    z_scores = np.abs((df[column_names] - df[column_names].mean()) / df[column_names].std())
    filtered_df = df[(z_scores < threshold).all(axis=1)]
    return filtered_df

continuous_columns = ['Height_cm', 'Weight_kg']
df_cleaned = remove_outliers_zscore(df, continuous_columns)

print(f"Data shape after outlier removal: {df_cleaned.shape}")

Data shape after outlier removal: (49744, 16)


In [25]:
scaler = StandardScaler()
df_cleaned[['Height_cm', 'Weight_kg']] = scaler.fit_transform(df_cleaned[['Height_cm', 'Weight_kg']])

In [26]:
class_counts = df_cleaned['Obesity_Status'].value_counts()

scale_factor = class_counts.max() / class_counts

def augment_data(df, target_col, scale_factor):
    frames = []
    for label, factor in scale_factor.items():
        df_class = df[df[target_col] == label]
        df_class_replicated = pd.concat([df_class] * int(np.round(factor)), ignore_index=True)
        frames.append(df_class_replicated)

    df_augmented = pd.concat(frames).sample(frac=1).reset_index(drop=True)  # Shuffle the data
    return df_augmented

df_augmented = augment_data(df_cleaned, 'Obesity_Status', scale_factor)
print(f"New class distribution after augmentation:\n{df_augmented['Obesity_Status'].value_counts()}")

New class distribution after augmentation:
Obesity_Status
3    18308
0    17442
2    15094
1    12631
Name: count, dtype: int64


In [27]:
Q1 = df_augmented['Obesity_Status'].quantile(0.25)
Q3 = df_augmented['Obesity_Status'].quantile(0.75)
IQR = Q3 - Q1


lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df_augmented[(df_augmented['Obesity_Status'] < lower_bound) | (df_augmented['Obesity_Status'] > upper_bound)]

print("Outliers in Obesity_Status column:", len(outliers))

Outliers in Obesity_Status column: 0


In [28]:
X = df_augmented.drop('Obesity_Status', axis=1)
y = df_augmented['Obesity_Status']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)  # 70% train
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 15% validation, 15% test

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Test set shape: {X_test.shape}")

Training set shape: (44432, 15)
Validation set shape: (9521, 15)
Test set shape: (9522, 15)


In [46]:
with mlflow.start_run(run_name="Run6"):
    
    
    log_reg = LogisticRegression(max_iter=200)  
    log_reg.fit(X_train, y_train)
    
    
    y_pred_log_reg = log_reg.predict(X_val)
    
    
    accuracy = accuracy_score(y_val, y_pred_log_reg)
    print(f"Logistic Regression Accuracy: {accuracy}")

    
    mlflow.log_param("model_type", "Logistic Regression")
    mlflow.log_param("max_iter", log_reg.max_iter)
    
    
    mlflow.log_metric("accuracy", accuracy)

    mlflow.sklearn.log_model(log_reg, "logistic_regression_model")

    
    cm = confusion_matrix(y_val, y_pred_log_reg)

    plt.figure(figsize=(8, 6))  
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    cm_path = "obesity predicting/confusion_matrix.png"
    plt.savefig(cm_path) 
    plt.close()

    plt.show()

    mlflow.log_artifact(cm_path)
    mlflow.log_artifact(r"C:\Users\91637\obesity predicting\obesity predicting\logistic.py")


Logistic Regression Accuracy: 0.9782585862829535




In [47]:
mlflow.set_experiment("SVM")

2024/10/29 17:10:57 INFO mlflow.tracking.fluent: Experiment with name 'SVM' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/91637/obesity predicting/mlruns/902357956030472999', creation_time=1730202057950, experiment_id='902357956030472999', last_update_time=1730202057950, lifecycle_stage='active', name='SVM', tags={}>