In [1]:
import mlflow
import mlflow.sklearn

In [2]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('project-experiment')

2024/07/01 16:57:19 INFO mlflow.tracking.fluent: Experiment with name 'project-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/workspaces/project/section-2/mlruns/1', creation_time=1719853039311, experiment_id='1', last_update_time=1719853039311, lifecycle_stage='active', name='project-experiment', tags={}>

In [24]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [5]:
data = pd.read_csv('healthcare-dataset.csv')
data.head(2)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1


In [6]:
data.shape

(5110, 12)

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [8]:
#1st impute/fill the missing values
#only BMI has missing values 
imputer = SimpleImputer(strategy = 'mean')
data['bmi']=imputer.fit_transform(data[['bmi']])
encoded_data= data.copy()

#2nd Scale down the numerical features
features_to_scale=['age','bmi']
scaler = MinMaxScaler()
encoded_data[features_to_scale]=scaler.fit_transform(encoded_data[features_to_scale])

In [9]:
#as the 'avg glucose level dosent have a normal distribution hence i am usign QuantileTransformer here 
from sklearn.preprocessing import QuantileTransformer

# Initialize QuantileTransformer
scaler = QuantileTransformer(output_distribution='uniform')

# Apply quantile transformation to avg_glucose_level
encoded_data['avg_glucose_level'] = scaler.fit_transform(encoded_data[['avg_glucose_level']])

In [10]:
df = encoded_data.copy()
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [11]:
# List of columns to one-hot encode
columns_to_encode = ['Residence_type', 'work_type', 'smoking_status','ever_married','gender']

# Iterate through each column and apply pd.get_dummies
for column in columns_to_encode:
    encoded_column = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, encoded_column], axis=1)
    df = df.drop(columns=[column],axis=1)

# Convert boolean to integers
df = df.astype(int)

In [12]:
df.drop('id',axis=1,inplace=True)

In [13]:
X = df.drop('stroke',axis=1)
y = df['stroke']
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size = 0.2,random_state=42) 

In [16]:
import pickle

In [19]:
with open('models/log_reg.bin', 'wb') as f_out:
    pickle.dump((logreg), f_out)

In [33]:
with mlflow.start_run():

    mlflow.set_tag("developer", "Bhavana")
    mlflow.set_tag("algorithm", "logistic regression")
    mlflow.log_param("dataset", ".healthcare-dataset.csv")

    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_valid)

    ## Performance metrics
    accuracy = accuracy_score(y_valid, y_pred)

    ## Log the metrics to mlflow ui
    mlflow.log_metric("accuracy", accuracy)

    mlflow.log_artifact(local_path="models/log_reg.bin", artifact_path="models_pickle")

In [34]:
print(f'Accuracy: {accuracy}')

Accuracy: 0.9393346379647749


In [35]:
import xgboost as xgb

with mlflow.start_run():

    mlflow.set_tag("developer", "Bhavana")
    mlflow.set_tag("algorithm", "xgboost")
    mlflow.log_param("dataset", ".healthcare-dataset.csv")

    xgb_clf = xgb.XGBClassifier(objective='binary:logistic', max_depth=3, learning_rate=0.1, n_estimators=100)

    xgb_clf.fit(X_train, y_train)

    with open('models/xgb.bin', 'wb') as f_out:
        pickle.dump((xgb_clf), f_out)

    xgb_y_pred = xgb_clf.predict(X_valid)

    ## Performance metrics
    xgb_accuracy = accuracy_score(y_valid, xgb_y_pred)

    ## Log the metrics to mlflow ui
    mlflow.log_metric("accuracy", accuracy)

    mlflow.log_artifact(local_path="models/xgb.bin", artifact_path="models_pickle")

In [36]:
print(f'Accuracy: {xgb_accuracy}')

Accuracy: 0.9403131115459883
