In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [13]:
# Show all columns
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 50

In [39]:
# Reading the data
df = pd.read_csv("../data/parkinsons_disease_data.csv")
df

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,FamilyHistoryParkinsons,TraumaticBrainInjury,Hypertension,Diabetes,Depression,Stroke,SystolicBP,DiastolicBP,CholesterolTotal,CholesterolLDL,CholesterolHDL,CholesterolTriglycerides,UPDRS,MoCA,FunctionalAssessment,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis,DoctorInCharge
0,3058,85,0,3,1,19.619878,0,5.108241,1.380660,3.893969,9.283194,0,0,0,0,0,0,129,60,222.842278,148.125623,37.867776,337.307114,6.458713,29.181289,1.572427,1,0,0,0,0,0,0,0,DrXXXConfid
1,3059,75,0,0,2,16.247339,1,6.027648,8.409804,8.513428,5.602470,0,0,0,0,0,0,163,76,210.501133,153.756463,77.228117,264.635521,37.306703,12.332639,4.787551,0,1,0,1,0,1,0,1,DrXXXConfid
2,3060,70,1,0,0,15.368239,0,2.242135,0.213275,6.498805,9.929824,0,0,0,1,0,0,113,93,287.387996,118.702600,85.588304,395.662649,67.838170,29.927783,2.130686,1,0,0,0,1,0,1,1,DrXXXConfid
3,3061,52,0,0,0,15.454557,0,5.997788,1.375045,6.715033,4.196189,0,0,0,0,0,0,146,78,280.339546,136.299186,51.869626,362.189688,52.964696,21.304268,3.391288,1,1,1,0,0,0,1,1,DrXXXConfid
4,3062,87,0,0,1,18.616042,0,9.775243,1.188607,4.657572,9.363925,0,0,0,0,0,0,115,94,284.014151,108.449452,25.069418,149.956586,21.804880,8.336364,3.200969,0,0,0,1,0,1,0,0,DrXXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2100,5158,87,1,0,2,38.483841,0,12.674393,5.325900,5.947278,6.296231,0,0,1,0,0,0,92,98,191.372707,98.730980,93.269555,161.291730,187.887925,28.634010,8.076254,1,0,0,0,1,1,0,0,DrXXXConfid
2101,5159,67,0,0,1,33.694396,1,0.977018,0.108599,4.825187,6.342325,1,0,0,0,0,0,103,96,150.188647,112.063230,47.524626,240.086616,172.325253,13.106537,1.665751,0,0,0,1,0,1,1,1,DrXXXConfid
2102,5160,65,0,0,2,22.829631,0,6.152286,5.775103,0.334244,9.740019,1,0,0,1,0,1,168,79,159.351155,61.543591,72.765698,222.231014,111.851581,26.527734,4.785614,0,0,0,0,1,1,0,1,DrXXXConfid
2103,5161,61,1,0,0,16.871030,1,0.292094,2.280475,9.598513,8.289390,0,0,0,0,0,0,129,97,232.240473,53.029891,99.945560,167.980928,144.474453,19.873404,8.378979,1,0,1,0,0,1,1,1,DrXXXConfid


In [40]:
df.shape

(2105, 35)

In [41]:
# Checking the null values
df.isna().sum()

PatientID                   0
Age                         0
Gender                      0
Ethnicity                   0
EducationLevel              0
BMI                         0
Smoking                     0
AlcoholConsumption          0
PhysicalActivity            0
DietQuality                 0
SleepQuality                0
FamilyHistoryParkinsons     0
TraumaticBrainInjury        0
Hypertension                0
Diabetes                    0
Depression                  0
Stroke                      0
SystolicBP                  0
DiastolicBP                 0
CholesterolTotal            0
CholesterolLDL              0
CholesterolHDL              0
CholesterolTriglycerides    0
UPDRS                       0
MoCA                        0
FunctionalAssessment        0
Tremor                      0
Rigidity                    0
Bradykinesia                0
PosturalInstability         0
SpeechProblems              0
SleepDisorders              0
Constipation                0
Diagnosis 

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2105 entries, 0 to 2104
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   PatientID                 2105 non-null   int64  
 1   Age                       2105 non-null   int64  
 2   Gender                    2105 non-null   int64  
 3   Ethnicity                 2105 non-null   int64  
 4   EducationLevel            2105 non-null   int64  
 5   BMI                       2105 non-null   float64
 6   Smoking                   2105 non-null   int64  
 7   AlcoholConsumption        2105 non-null   float64
 8   PhysicalActivity          2105 non-null   float64
 9   DietQuality               2105 non-null   float64
 10  SleepQuality              2105 non-null   float64
 11  FamilyHistoryParkinsons   2105 non-null   int64  
 12  TraumaticBrainInjury      2105 non-null   int64  
 13  Hypertension              2105 non-null   int64  
 14  Diabetes

In [43]:
# Check for dublicate patient ID
df['PatientID'].duplicated().sum()

0

In [44]:
df.columns

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
       'SleepQuality', 'FamilyHistoryParkinsons', 'TraumaticBrainInjury',
       'Hypertension', 'Diabetes', 'Depression', 'Stroke', 'SystolicBP',
       'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'UPDRS', 'MoCA', 'FunctionalAssessment',
       'Tremor', 'Rigidity', 'Bradykinesia', 'PosturalInstability',
       'SpeechProblems', 'SleepDisorders', 'Constipation', 'Diagnosis',
       'DoctorInCharge'],
      dtype='object')

In [45]:
# Describe the dataset
df.describe()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,FamilyHistoryParkinsons,TraumaticBrainInjury,Hypertension,Diabetes,Depression,Stroke,SystolicBP,DiastolicBP,CholesterolTotal,CholesterolLDL,CholesterolHDL,CholesterolTriglycerides,UPDRS,MoCA,FunctionalAssessment,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis
count,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0
mean,4110.0,69.6019,0.492637,0.692637,1.337292,27.209493,0.296437,10.040413,5.016674,4.912901,6.996639,0.145843,0.106413,0.145843,0.148219,0.205226,0.048931,133.719715,90.249881,226.86084,126.147858,59.670352,222.9405,101.415318,15.094314,4.989694,0.431829,0.252732,0.207601,0.138717,0.295012,0.245131,0.296912,0.619477
std,607.805479,11.594511,0.500065,1.003827,0.89584,7.208099,0.456795,5.687014,2.890919,2.872115,1.753065,0.353033,0.308439,0.353033,0.355401,0.403962,0.215775,26.502355,17.061488,43.589406,43.407036,23.37092,101.895822,56.591448,8.643014,2.933877,0.495449,0.434682,0.405686,0.345733,0.456156,0.430267,0.457006,0.485631
min,3058.0,50.0,0.0,0.0,0.0,15.008333,0.0,0.002228,0.004157,1.1e-05,4.000497,0.0,0.0,0.0,0.0,0.0,0.0,90.0,60.0,150.062698,50.022828,20.027981,50.113604,0.028441,0.021191,0.001505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3584.0,60.0,0.0,0.0,1.0,20.782176,0.0,5.150278,2.455703,2.478503,5.488864,0.0,0.0,0.0,0.0,0.0,0.0,110.0,75.0,189.385178,88.84196,39.538643,132.520174,53.048148,7.51716,2.41589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4110.0,70.0,0.0,0.0,1.0,27.184571,0.0,10.070337,5.03155,4.825187,6.929819,0.0,0.0,0.0,0.0,0.0,0.0,133.0,91.0,228.528256,126.88457,59.343357,222.802452,102.561023,14.963574,4.983227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,4636.0,80.0,1.0,1.0,2.0,33.462452,1.0,14.829565,7.512795,7.381487,8.558719,0.0,0.0,0.0,0.0,0.0,0.0,157.0,105.0,264.6081,163.912782,79.366628,311.699109,149.831682,22.608362,7.48422,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
max,5162.0,89.0,1.0,3.0,3.0,39.999887,1.0,19.988866,9.995255,9.995864,9.999821,1.0,1.0,1.0,1.0,1.0,1.0,179.0,119.0,299.963074,199.985981,99.982265,399.975022,198.953604,29.970107,9.992697,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
# Droping column PatientID and DoctorInCharge from the data
df = df.drop(['PatientID', 'DoctorInCharge'], axis=1)
df.describe()


Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,FamilyHistoryParkinsons,TraumaticBrainInjury,Hypertension,Diabetes,Depression,Stroke,SystolicBP,DiastolicBP,CholesterolTotal,CholesterolLDL,CholesterolHDL,CholesterolTriglycerides,UPDRS,MoCA,FunctionalAssessment,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis
count,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0,2105.0
mean,69.6019,0.492637,0.692637,1.337292,27.209493,0.296437,10.040413,5.016674,4.912901,6.996639,0.145843,0.106413,0.145843,0.148219,0.205226,0.048931,133.719715,90.249881,226.86084,126.147858,59.670352,222.9405,101.415318,15.094314,4.989694,0.431829,0.252732,0.207601,0.138717,0.295012,0.245131,0.296912,0.619477
std,11.594511,0.500065,1.003827,0.89584,7.208099,0.456795,5.687014,2.890919,2.872115,1.753065,0.353033,0.308439,0.353033,0.355401,0.403962,0.215775,26.502355,17.061488,43.589406,43.407036,23.37092,101.895822,56.591448,8.643014,2.933877,0.495449,0.434682,0.405686,0.345733,0.456156,0.430267,0.457006,0.485631
min,50.0,0.0,0.0,0.0,15.008333,0.0,0.002228,0.004157,1.1e-05,4.000497,0.0,0.0,0.0,0.0,0.0,0.0,90.0,60.0,150.062698,50.022828,20.027981,50.113604,0.028441,0.021191,0.001505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,60.0,0.0,0.0,1.0,20.782176,0.0,5.150278,2.455703,2.478503,5.488864,0.0,0.0,0.0,0.0,0.0,0.0,110.0,75.0,189.385178,88.84196,39.538643,132.520174,53.048148,7.51716,2.41589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,70.0,0.0,0.0,1.0,27.184571,0.0,10.070337,5.03155,4.825187,6.929819,0.0,0.0,0.0,0.0,0.0,0.0,133.0,91.0,228.528256,126.88457,59.343357,222.802452,102.561023,14.963574,4.983227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,80.0,1.0,1.0,2.0,33.462452,1.0,14.829565,7.512795,7.381487,8.558719,0.0,0.0,0.0,0.0,0.0,0.0,157.0,105.0,264.6081,163.912782,79.366628,311.699109,149.831682,22.608362,7.48422,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
max,89.0,1.0,3.0,3.0,39.999887,1.0,19.988866,9.995255,9.995864,9.999821,1.0,1.0,1.0,1.0,1.0,1.0,179.0,119.0,299.963074,199.985981,99.982265,399.975022,198.953604,29.970107,9.992697,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [47]:
numerical_columns = [
    'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality', 
    'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides', 
    'UPDRS', 'MoCA', 'FunctionalAssessment'
]

categorical_columns = [
    'Gender', 'Ethnicity', 'EducationLevel', 'Smoking',  'FamilyHistoryParkinsons',
    'TraumaticBrainInjury', 'Hypertension', 'Diabetes', 'Depression', 'Stroke',
    'Tremor', 'Rigidity', 'Bradykinesia', 'PosturalInstability', 'SpeechProblems',
    'SleepDisorders', 'Constipation'
]

df[categorical_columns + numerical_columns]

Unnamed: 0,Gender,Ethnicity,EducationLevel,Smoking,FamilyHistoryParkinsons,TraumaticBrainInjury,Hypertension,Diabetes,Depression,Stroke,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Age,BMI,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,SystolicBP,DiastolicBP,CholesterolTotal,CholesterolLDL,CholesterolHDL,CholesterolTriglycerides,UPDRS,MoCA,FunctionalAssessment
0,0,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,85,19.619878,5.108241,1.380660,3.893969,9.283194,129,60,222.842278,148.125623,37.867776,337.307114,6.458713,29.181289,1.572427
1,0,0,2,1,0,0,0,0,0,0,0,1,0,1,0,1,0,75,16.247339,6.027648,8.409804,8.513428,5.602470,163,76,210.501133,153.756463,77.228117,264.635521,37.306703,12.332639,4.787551
2,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,70,15.368239,2.242135,0.213275,6.498805,9.929824,113,93,287.387996,118.702600,85.588304,395.662649,67.838170,29.927783,2.130686
3,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,52,15.454557,5.997788,1.375045,6.715033,4.196189,146,78,280.339546,136.299186,51.869626,362.189688,52.964696,21.304268,3.391288
4,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,87,18.616042,9.775243,1.188607,4.657572,9.363925,115,94,284.014151,108.449452,25.069418,149.956586,21.804880,8.336364,3.200969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2100,1,0,2,0,0,0,1,0,0,0,1,0,0,0,1,1,0,87,38.483841,12.674393,5.325900,5.947278,6.296231,92,98,191.372707,98.730980,93.269555,161.291730,187.887925,28.634010,8.076254
2101,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,1,1,67,33.694396,0.977018,0.108599,4.825187,6.342325,103,96,150.188647,112.063230,47.524626,240.086616,172.325253,13.106537,1.665751
2102,0,0,2,0,1,0,0,1,0,1,0,0,0,0,1,1,0,65,22.829631,6.152286,5.775103,0.334244,9.740019,168,79,159.351155,61.543591,72.765698,222.231014,111.851581,26.527734,4.785614
2103,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,1,61,16.871030,0.292094,2.280475,9.598513,8.289390,129,97,232.240473,53.029891,99.945560,167.980928,144.474453,19.873404,8.378979


In [48]:
df[categorical_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2105 entries, 0 to 2104
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Gender                   2105 non-null   int64
 1   Ethnicity                2105 non-null   int64
 2   EducationLevel           2105 non-null   int64
 3   Smoking                  2105 non-null   int64
 4   FamilyHistoryParkinsons  2105 non-null   int64
 5   TraumaticBrainInjury     2105 non-null   int64
 6   Hypertension             2105 non-null   int64
 7   Diabetes                 2105 non-null   int64
 8   Depression               2105 non-null   int64
 9   Stroke                   2105 non-null   int64
 10  Tremor                   2105 non-null   int64
 11  Rigidity                 2105 non-null   int64
 12  Bradykinesia             2105 non-null   int64
 13  PosturalInstability      2105 non-null   int64
 14  SpeechProblems           2105 non-null   int64
 15  Slee

In [49]:
# Changing the categorical column values dtype from int64 to str (object)
df[categorical_columns] = df[categorical_columns].astype(str)
df[categorical_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2105 entries, 0 to 2104
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Gender                   2105 non-null   object
 1   Ethnicity                2105 non-null   object
 2   EducationLevel           2105 non-null   object
 3   Smoking                  2105 non-null   object
 4   FamilyHistoryParkinsons  2105 non-null   object
 5   TraumaticBrainInjury     2105 non-null   object
 6   Hypertension             2105 non-null   object
 7   Diabetes                 2105 non-null   object
 8   Depression               2105 non-null   object
 9   Stroke                   2105 non-null   object
 10  Tremor                   2105 non-null   object
 11  Rigidity                 2105 non-null   object
 12  Bradykinesia             2105 non-null   object
 13  PosturalInstability      2105 non-null   object
 14  SpeechProblems           2105 non-null  

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2105 entries, 0 to 2104
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       2105 non-null   int64  
 1   Gender                    2105 non-null   object 
 2   Ethnicity                 2105 non-null   object 
 3   EducationLevel            2105 non-null   object 
 4   BMI                       2105 non-null   float64
 5   Smoking                   2105 non-null   object 
 6   AlcoholConsumption        2105 non-null   float64
 7   PhysicalActivity          2105 non-null   float64
 8   DietQuality               2105 non-null   float64
 9   SleepQuality              2105 non-null   float64
 10  FamilyHistoryParkinsons   2105 non-null   object 
 11  TraumaticBrainInjury      2105 non-null   object 
 12  Hypertension              2105 non-null   object 
 13  Diabetes                  2105 non-null   object 
 14  Depressi

In [51]:
X = df.drop(['Diagnosis'], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2105 entries, 0 to 2104
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       2105 non-null   int64  
 1   Gender                    2105 non-null   object 
 2   Ethnicity                 2105 non-null   object 
 3   EducationLevel            2105 non-null   object 
 4   BMI                       2105 non-null   float64
 5   Smoking                   2105 non-null   object 
 6   AlcoholConsumption        2105 non-null   float64
 7   PhysicalActivity          2105 non-null   float64
 8   DietQuality               2105 non-null   float64
 9   SleepQuality              2105 non-null   float64
 10  FamilyHistoryParkinsons   2105 non-null   object 
 11  TraumaticBrainInjury      2105 non-null   object 
 12  Hypertension              2105 non-null   object 
 13  Diabetes                  2105 non-null   object 
 14  Depressi

In [52]:
# Target variable
target = 'Diagnosis'
y = df[target].values

In [100]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [101]:
X_train.shape, X_val.shape

((1684, 32), (421, 32))

In [102]:
dv = DictVectorizer()

train_dicts = X_train[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = X_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [103]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_continuous = lr.predict(X_val)
# Threshold 0f 0.5, if probablity is greater than 0.5 --> 1 else 0
y_pred = [1 if value > 0.5 else 0 for value in y_pred_continuous]

accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
rmse = root_mean_squared_error(y_val, y_pred)

accuracy, precision, recall, f1, rmse

(0.8028503562945368,
 0.8296296296296296,
 0.8582375478927203,
 0.8436911487758946,
 0.4440153642673451)

In [116]:
log_reg = LogisticRegression(max_iter=5000)

# Train and evaluate Logistic Regression
log_reg.fit(X_train, y_train)
y_pred_log_reg_continuous = log_reg.predict(X_val)

# Threshold 0f 0.5, if probablity is greater than 0.5 --> 1 else 0
y_pred_log_reg = [1 if value > 0.5 else 0 for value in y_pred_log_reg_continuous]

accuracy_log_reg = accuracy_score(y_val, y_pred_log_reg)
precision_log_reg = precision_score(y_val, y_pred_log_reg)
recall_log_reg = recall_score(y_val, y_pred_log_reg)
f1_log_reg = f1_score(y_val, y_pred_log_reg)
rmse = root_mean_squared_error(y_val, y_pred_log_reg)

accuracy, precision, recall, f1, rmse

(0.8028503562945368,
 0.8296296296296296,
 0.8582375478927203,
 0.8436911487758946,
 0.4440153642673451)

# Cleaning and MLFlow Integration for experimental tracking

In [22]:
def load_data(filename):
    df = pd.read_csv(filename)
    return df

def prepare_data(df):

    numerical_columns = [
    'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality', 
    'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides', 
    'UPDRS', 'MoCA', 'FunctionalAssessment'
    ]

    categorical_columns = [
        'Gender', 'Ethnicity', 'EducationLevel', 'Smoking',  'FamilyHistoryParkinsons',
        'TraumaticBrainInjury', 'Hypertension', 'Diabetes', 'Depression', 'Stroke',
        'Tremor', 'Rigidity', 'Bradykinesia', 'PosturalInstability', 'SpeechProblems',
        'SleepDisorders', 'Constipation'
    ]

    # Changing the categorical column values dtype from int64 to str (object)
    df[categorical_columns] = df[categorical_columns].astype(str)

    X = df[numerical_columns + categorical_columns]
    
    # Target variable -> y
    target = 'Diagnosis'
    y = df[target].values
    
    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    return X_train, X_val, y_train, y_val

def train_and_predict_linear_regression_model(X_train, y_train, X_val):
    
    # Dicts
    train_dicts = X_train.to_dict(orient='records')
    val_dicts = X_val.to_dict(orient='records')

    # DictVextorizer
    dv = DictVectorizer()
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    # Loading the model
    model = LinearRegression()

    # fit and train
    model.fit(X_train, y_train)

    # Predict
    y_pred_continuous = model.predict(X_val)
    # Threshold 0f 0.5, if probablity is greater than 0.5 --> 1 else 0
    y_pred = [1 if value > 0.5 else 0 for value in y_pred_continuous]

    return y_pred

def get_metrics(y_val, y_pred):

    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    rmse = root_mean_squared_error(y_val, y_pred)

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'rmse': rmse}



### Example

In [23]:
data_path = "../data/parkinsons_disease_data.csv"

df = load_data(data_path)
X_train, X_val, y_train, y_val = prepare_data(df)
y_pred = train_and_predict_linear_regression_model(X_train, y_train, X_val)
metrics = get_metrics(y_val, y_pred)

metrics

{'accuracy': 0.8028503562945368,
 'precision': 0.8296296296296296,
 'recall': 0.8582375478927203,
 'f1': 0.8436911487758946,
 'rmse': 0.4440153642673451}

# Experiemnt Tracking with MLFLOW

In [2]:
import os
import mlflow

In [160]:
EXPERIMENT_NAME = os.getenv("EXPERIMENT_NAME", "parkinson-disease-prediction")

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment(EXPERIMENT_NAME)

2024/07/09 11:23:10 INFO mlflow.tracking.fluent: Experiment with name 'parkinson-disease-prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1720516990405, experiment_id='1', last_update_time=1720516990405, lifecycle_stage='active', name='parkinson-disease-prediction', tags={}>

In [161]:
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://localhost:5000'


In [162]:
mlflow.search_experiments()

[<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1720516990405, experiment_id='1', last_update_time=1720516990405, lifecycle_stage='active', name='parkinson-disease-prediction', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1720516968628, experiment_id='0', last_update_time=1720516968628, lifecycle_stage='active', name='Default', tags={}>]

In [163]:
with mlflow.start_run():

    mlflow.sklearn.autolog()
    
    X_train, X_val, y_train, y_val = prepare_data(df)

    # Dicts
    train_dicts = X_train.to_dict(orient='records')
    val_dicts = X_val.to_dict(orient='records')

    # DictVextorizer
    dv = DictVectorizer()
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    # Loading the model
    model = LinearRegression()

    # fit and train
    model.fit(X_train, y_train)

    # Predict
    y_pred_continuous = model.predict(X_val)
    # Threshold 0f 0.5, if probablity is greater than 0.5 --> 1 else 0
    y_pred = [1 if value > 0.5 else 0 for value in y_pred_continuous]
    
    metrics = get_metrics(y_val, y_pred)

    mlflow.log_metric("accuracy", metrics["accuracy"])
    mlflow.log_metric("precision", metrics["precision"])
    mlflow.log_metric("recall", metrics["recall"])
    mlflow.log_metric("f1", metrics["f1"])
    mlflow.log_metric("rmse", metrics["rmse"])

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [134]:
with mlflow.start_run():

    mlflow.sklearn.autolog()
    
    X_train, X_val, y_train, y_val = prepare_data(df)

    # Dicts
    train_dicts = X_train.to_dict(orient='records')
    val_dicts = X_val.to_dict(orient='records')

    # DictVextorizer
    dv = DictVectorizer()
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    # Loading the model
    logistic_regression_model = LogisticRegression(max_iter=5000)

    # fit and train
    logistic_regression_model.fit(X_train, y_train)

    # Predict
    y_pred_continuous = logistic_regression_model.predict(X_val)
    # Threshold 0f 0.5, if probablity is greater than 0.5 --> 1 else 0
    y_pred = [1 if value > 0.5 else 0 for value in y_pred_continuous]
    
    metrics = get_metrics(y_val, y_pred)

    mlflow.log_metric("accuracy", metrics["accuracy"])
    mlflow.log_metric("precision", metrics["precision"])
    mlflow.log_metric("recall", metrics["recall"])
    mlflow.log_metric("f1", metrics["f1"])
    mlflow.log_metric("rmse", metrics["rmse"])

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [135]:
!pip install xgboost



In [136]:
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

with mlflow.start_run():

    mlflow.sklearn.autolog()
    
    data_path = "../data/parkinsons_disease_data.csv"

    df = load_data(data_path)
    X_train, X_val, y_train, y_val = prepare_data(df)

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Train and evaluate XGBoost
    mlflow.set_tag("model", "XGBoostClassifier")
    xgboost = XGBClassifier()
    xgboost.fit(X_train, y_train)
    y_pred_xgboost = xgboost.predict(X_val)

    metrics = get_metrics(y_val, y_pred_xgboost)

    mlflow.log_metric("accuracy", metrics["accuracy"])
    mlflow.log_metric("precision", metrics["precision"])
    mlflow.log_metric("recall", metrics["recall"])
    mlflow.log_metric("f1", metrics["f1"])
    mlflow.log_metric("rmse", metrics["rmse"])

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [137]:
data_path = "../data/parkinsons_disease_data.csv"
df = load_data(data_path)
X_train, X_val, y_train, y_val = prepare_data(df)

In [138]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

def train_model_xgboost_search(X_train, X_val, y_train, y_val):

    mlflow.xgboost.autolog()

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    
    # Define the hyperparameter space
    search_space = {
        'max_depth': scope.int(hp.uniform('max_depth', 1, 20)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'objective': 'reg:squarederror',
        'seed': 42
    }

    def objective(params):
        with mlflow.start_run():

            mlflow.set_tag("model", "XGBoostClassifier")
            mlflow.log_params(params)

            # Train and evaluate XGBoost
            mlflow.set_tag("model", "XGBoostClassifier")
            xgb_model = xgb.XGBClassifier(**params)
            xgb_model.fit(X_train, y_train)
            y_pred_xgboost = xgb_model.predict(X_val)

            metrics = get_metrics(y_val, y_pred_xgboost)
            
            mlflow.log_metric("accuracy", metrics["accuracy"])
            mlflow.log_metric("precision", metrics["precision"])
            mlflow.log_metric("recall", metrics["recall"])
            mlflow.log_metric("f1", metrics["f1"])
            mlflow.log_metric("rmse", metrics["rmse"])

        return {'loss': accuracy, 'status': STATUS_OK}

    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=32,
        trials=Trials()
    )

    return

train_model_xgboost_search(X_train, X_val, y_train, y_val)

  0%|          | 0/32 [00:00<?, ?trial/s, best loss=?]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



  3%|▎         | 1/32 [00:01<00:57,  1.86s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



  6%|▋         | 2/32 [00:03<00:57,  1.93s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



  9%|▉         | 3/32 [00:05<00:56,  1.96s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 12%|█▎        | 4/32 [00:07<00:53,  1.93s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 16%|█▌        | 5/32 [00:09<00:51,  1.90s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 19%|█▉        | 6/32 [00:11<00:48,  1.86s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 22%|██▏       | 7/32 [00:13<00:47,  1.89s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 25%|██▌       | 8/32 [00:15<00:45,  1.91s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 28%|██▊       | 9/32 [00:17<00:42,  1.86s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 31%|███▏      | 10/32 [00:18<00:41,  1.87s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 34%|███▍      | 11/32 [00:20<00:38,  1.84s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 38%|███▊      | 12/32 [00:22<00:36,  1.83s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 41%|████      | 13/32 [00:24<00:33,  1.77s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 44%|████▍     | 14/32 [00:25<00:31,  1.75s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 47%|████▋     | 15/32 [00:27<00:30,  1.77s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 50%|█████     | 16/32 [00:29<00:27,  1.74s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 53%|█████▎    | 17/32 [00:31<00:26,  1.78s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 56%|█████▋    | 18/32 [00:32<00:24,  1.74s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 59%|█████▉    | 19/32 [00:34<00:23,  1.79s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 62%|██████▎   | 20/32 [00:36<00:20,  1.74s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 66%|██████▌   | 21/32 [00:38<00:19,  1.73s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 69%|██████▉   | 22/32 [00:39<00:17,  1.78s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 72%|███████▏  | 23/32 [00:41<00:15,  1.77s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 75%|███████▌  | 24/32 [00:43<00:14,  1.82s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 78%|███████▊  | 25/32 [00:45<00:13,  1.86s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 81%|████████▏ | 26/32 [00:47<00:10,  1.79s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 84%|████████▍ | 27/32 [00:48<00:08,  1.78s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 88%|████████▊ | 28/32 [00:50<00:07,  1.85s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 91%|█████████ | 29/32 [00:52<00:05,  1.85s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 94%|█████████▍| 30/32 [00:54<00:03,  1.84s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 97%|█████████▋| 31/32 [00:56<00:01,  1.84s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



100%|██████████| 32/32 [00:58<00:00,  1.82s/trial, best loss: 0.8028503562945368]


In [139]:
from sklearn.ensemble import RandomForestClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

def train_model_rfc(X_train, X_val, y_train, y_val):

    mlflow.sklearn.autolog()

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    
    # Define the hyperparameter space
    search_space = {
                    'max_depth': hp.choice('max_depth', [80, 90, 100, 110]),
                    'max_features': hp.choice('max_features', [2, 3]),
                    'min_samples_leaf': hp.choice('min_samples_leaf', [3, 4, 5]),
                    'min_samples_split': hp.choice('min_samples_split', [8, 10, 12]),
                    'n_estimators': hp.choice('n_estimators', [100, 200, 300, 1000])
                }

    def objective(params):
        with mlflow.start_run():

            mlflow.log_params(params)

            # Train and evaluate XGBoost
            mlflow.set_tag("model", "RandomForestClassifier")
            random_forest = RandomForestClassifier(**params)
            random_forest.fit(X_train, y_train)
            y_pred_random_forest = random_forest.predict(X_val)

            metrics = get_metrics(y_val, y_pred_random_forest)
            
            mlflow.log_metric("accuracy", metrics["accuracy"])
            mlflow.log_metric("precision", metrics["precision"])
            mlflow.log_metric("recall", metrics["recall"])
            mlflow.log_metric("f1", metrics["f1"])
            mlflow.log_metric("rmse", metrics["rmse"])

        return {'loss': accuracy, 'status': STATUS_OK}

    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=32,
        trials=Trials()
    )

    return

train_model_rfc(X_train, X_val, y_train, y_val)

  0%|          | 0/32 [00:00<?, ?trial/s, best loss=?]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



  3%|▎         | 1/32 [00:03<01:33,  3.01s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



  6%|▋         | 2/32 [00:05<01:18,  2.63s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



  9%|▉         | 3/32 [00:07<01:12,  2.50s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 12%|█▎        | 4/32 [00:09<01:06,  2.36s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 16%|█▌        | 5/32 [00:12<01:01,  2.29s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 19%|█▉        | 6/32 [00:16<01:16,  2.93s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 22%|██▏       | 7/32 [00:18<01:10,  2.82s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 25%|██▌       | 8/32 [00:20<01:02,  2.61s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 28%|██▊       | 9/32 [00:23<00:58,  2.55s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 31%|███▏      | 10/32 [00:25<00:56,  2.55s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 34%|███▍      | 11/32 [00:30<01:05,  3.13s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 38%|███▊      | 12/32 [00:32<00:56,  2.84s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 41%|████      | 13/32 [00:34<00:50,  2.68s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 44%|████▍     | 14/32 [00:37<00:47,  2.66s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 47%|████▋     | 15/32 [00:39<00:42,  2.51s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 50%|█████     | 16/32 [00:41<00:39,  2.47s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 53%|█████▎    | 17/32 [00:46<00:44,  2.99s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 56%|█████▋    | 18/32 [00:48<00:39,  2.83s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 59%|█████▉    | 19/32 [00:53<00:43,  3.31s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 62%|██████▎   | 20/32 [00:57<00:43,  3.63s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 66%|██████▌   | 21/32 [01:00<00:36,  3.34s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 69%|██████▉   | 22/32 [01:02<00:31,  3.12s/trial, best loss: 0.8028503562945368]


 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 72%|███████▏  | 23/32 [01:07<00:31,  3.48s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 75%|███████▌  | 24/32 [01:09<00:25,  3.25s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 78%|███████▊  | 25/32 [01:12<00:21,  3.09s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 81%|████████▏ | 26/32 [01:16<00:21,  3.51s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 84%|████████▍ | 27/32 [01:19<00:16,  3.22s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 88%|████████▊ | 28/32 [01:22<00:12,  3.04s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 91%|█████████ | 29/32 [01:26<00:10,  3.42s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 94%|█████████▍| 30/32 [01:29<00:06,  3.19s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 97%|█████████▋| 31/32 [01:31<00:03,  3.04s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



100%|██████████| 32/32 [01:36<00:00,  3.01s/trial, best loss: 0.8028503562945368]


In [140]:
!pip install catboost



In [141]:
from catboost import CatBoostClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

def train_model_catboost(X_train, X_val, y_train, y_val):

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    
    # Define the hyperparameter space
    search_space = {
                    'iterations': hp.choice('max_depth', [500, 1000, 1500]),
                    'learning_rate': hp.choice('learning_rate', [0.01, 0.05, 0.1]),
                    'depth': hp.choice('depth', [4, 6, 8, 10]),
                    'l2_leaf_reg': hp.choice('l2_leaf_reg', [1, 3, 5, 7, 9]),
                    'border_count': hp.choice('border_count', [32, 50, 100]),
                    'bagging_temperature': hp.choice('bagging_temperature', [0.5, 1, 2, 5]),
                    'random_strength': hp.choice('random_strength', [0.5, 1, 2]),
                    'scale_pos_weight': hp.choice('scale_pos_weight', [1, 2, 3, 5]),
                }

    def objective(params):
        with mlflow.start_run():

            mlflow.log_params(params)

            # Train and evaluate XGBoost
            mlflow.set_tag("model", "CatBoostClassifier")
            catboost = CatBoostClassifier(verbose=0, **params)
            catboost.fit(X_train, y_train)
            y_pred_catboost = catboost.predict(X_val)

            metrics = get_metrics(y_val, y_pred_catboost)
            
            mlflow.log_metric("accuracy", metrics["accuracy"])
            mlflow.log_metric("precision", metrics["precision"])
            mlflow.log_metric("recall", metrics["recall"])
            mlflow.log_metric("f1", metrics["f1"])
            mlflow.log_metric("rmse", metrics["rmse"])

            # Log the model manually
            mlflow.catboost.log_model(  catboost, 
                                        artifact_path="models", 
                                )

        return {'loss': accuracy, 'status': STATUS_OK}

    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=32,
        trials=Trials()
    )

    return

train_model_catboost(X_train, X_val, y_train, y_val)

  0%|          | 0/32 [00:00<?, ?trial/s, best loss=?]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



  3%|▎         | 1/32 [00:05<03:05,  5.97s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



  6%|▋         | 2/32 [00:08<02:04,  4.16s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



  9%|▉         | 3/32 [00:11<01:34,  3.26s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 12%|█▎        | 4/32 [00:13<01:27,  3.11s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 16%|█▌        | 5/32 [00:16<01:19,  2.93s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 19%|█▉        | 6/32 [00:21<01:30,  3.47s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 22%|██▏       | 7/32 [00:25<01:35,  3.81s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 25%|██▌       | 8/32 [00:40<02:54,  7.25s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 28%|██▊       | 9/32 [00:42<02:10,  5.67s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 31%|███▏      | 10/32 [00:52<02:35,  7.06s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 34%|███▍      | 11/32 [01:02<02:47,  7.98s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 38%|███▊      | 12/32 [01:06<02:12,  6.62s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 41%|████      | 13/32 [01:19<02:42,  8.55s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 44%|████▍     | 14/32 [01:22<02:06,  7.04s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 47%|████▋     | 15/32 [01:25<01:38,  5.82s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 50%|█████     | 16/32 [01:35<01:53,  7.12s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 53%|█████▎    | 17/32 [01:39<01:30,  6.04s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 56%|█████▋    | 18/32 [01:45<01:24,  6.06s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 59%|█████▉    | 19/32 [01:47<01:05,  5.01s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 62%|██████▎   | 20/32 [01:51<00:55,  4.59s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 66%|██████▌   | 21/32 [01:57<00:55,  5.02s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 69%|██████▉   | 22/32 [02:03<00:52,  5.28s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 72%|███████▏  | 23/32 [02:07<00:43,  4.78s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 75%|███████▌  | 24/32 [02:09<00:32,  4.10s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 78%|███████▊  | 25/32 [02:15<00:32,  4.61s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 81%|████████▏ | 26/32 [02:19<00:26,  4.34s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 84%|████████▍ | 27/32 [02:21<00:19,  3.84s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 88%|████████▊ | 28/32 [02:27<00:17,  4.41s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 91%|█████████ | 29/32 [02:31<00:12,  4.18s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 94%|█████████▍| 30/32 [02:34<00:07,  3.83s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



 97%|█████████▋| 31/32 [02:40<00:04,  4.70s/trial, best loss: 0.8028503562945368]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.



100%|██████████| 32/32 [02:44<00:00,  5.15s/trial, best loss: 0.8028503562945368]


### Model Registry - Register the best model

In [3]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

def register_best_model():
    client = MlflowClient()
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    best_run = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.rmse asc"]
    )[0]
    
    # register the best model
    run_id = best_run.info.run_id
    model_uri = f"runs:/{run_id}/model"
    model_accuracy = round(best_run.data.metrics['accuracy']*100)
    model_details = mlflow.register_model(model_uri=model_uri, name=EXPERIMENT_NAME)
    client.update_registered_model(
      name=model_details.name,
      description=f"Current accuracy: {model_accuracy}%"
    )

In [143]:
register_best_model()

Successfully registered model 'parkinson-disease-prediction'.
2024/07/09 10:06:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: parkinson-disease-prediction, version 1
Created version '1' of model 'parkinson-disease-prediction'.


# Inference

### Getting the RUN_ID of the registered model

In [144]:
# Initialize MLflow client
client = MlflowClient()

# Specify the model name
model_name = "parkinson-disease-prediction"

# Get the latest versions of the registered model
latest_versions = client.get_latest_versions(model_name, stages=["None", "Staging", "Production"])

# Print run_ids of all the latest versions
for version in latest_versions:

    RUN_ID = version.run_id

    print(f"Model Name: {model_name}")
    print(f"Version: {version.version}")
    print(f"Run ID: {version.run_id}")

Model Name: parkinson-disease-prediction
Version: 1
Run ID: ed955bc25fa74d95bedbd1b5a72a8e73


  latest_versions = client.get_latest_versions(model_name, stages=["None", "Staging", "Production"])


### Load the registered model using RUN_ID

In [145]:
logged_model = f'runs:/{RUN_ID}/models'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

 - mlflow (current: 2.14.2, required: mlflow==2.13.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


mlflow.pyfunc.loaded_model:
  artifact_path: models
  flavor: mlflow.catboost
  run_id: ed955bc25fa74d95bedbd1b5a72a8e73

In [146]:
# Prepare the test data
test_data = X_val.iloc[10].to_dict()
test_data

{'Age': 67,
 'BMI': 24.77259403457728,
 'AlcoholConsumption': 13.941385740391004,
 'PhysicalActivity': 2.472534322357399,
 'DietQuality': 9.593309739128369,
 'SleepQuality': 6.060992120577725,
 'SystolicBP': 92,
 'DiastolicBP': 60,
 'CholesterolTotal': 193.1381868106296,
 'CholesterolLDL': 137.5175503762128,
 'CholesterolHDL': 25.482467035806813,
 'CholesterolTriglycerides': 313.66674876926953,
 'UPDRS': 58.99092096333794,
 'MoCA': 19.902233615718146,
 'FunctionalAssessment': 5.104914484020905,
 'Gender': '1',
 'Ethnicity': '2',
 'EducationLevel': '1',
 'Smoking': '0',
 'FamilyHistoryParkinsons': '0',
 'TraumaticBrainInjury': '0',
 'Hypertension': '1',
 'Diabetes': '0',
 'Depression': '0',
 'Stroke': '0',
 'Tremor': '0',
 'Rigidity': '1',
 'Bradykinesia': '1',
 'PosturalInstability': '0',
 'SpeechProblems': '0',
 'SleepDisorders': '1',
 'Constipation': '0'}

In [147]:
# Predict
y_pred = loaded_model.predict(pd.DataFrame(test_data, index=[0]))
y_pred

array([1])

In [148]:
# Ground Truth
y_val[10]

1

### Transit the registered model to the Stage

In [149]:
latest_versions = client.get_latest_versions(model_name, stages=["None"])

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 1, stage: None


  latest_versions = client.get_latest_versions(model_name, stages=["None"])


In [150]:
# Move the registered model to stage
model_version = 1
new_stage = "staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1720512398078, current_stage='Staging', description='', last_updated_timestamp=1720512476299, name='parkinson-disease-prediction', run_id='ed955bc25fa74d95bedbd1b5a72a8e73', run_link='', source='mlflow-artifacts:/1/ed955bc25fa74d95bedbd1b5a72a8e73/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [151]:
latest_versions = client.get_latest_versions(model_name, stages=["Staging"])

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 1, stage: Staging


  latest_versions = client.get_latest_versions(model_name, stages=["Staging"])


In [152]:
from datetime import datetime

date = datetime.today().date()
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} on {date}"
)

<ModelVersion: aliases=[], creation_timestamp=1720512398078, current_stage='Staging', description='The model version 1 was transitioned to staging on 2024-07-09', last_updated_timestamp=1720512483181, name='parkinson-disease-prediction', run_id='ed955bc25fa74d95bedbd1b5a72a8e73', run_link='', source='mlflow-artifacts:/1/ed955bc25fa74d95bedbd1b5a72a8e73/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

###  Save the Artifacts of the best model from the MLFlow registry to S3 bucket on AWS

In [3]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
import mlflow
import os

EXPERIMENT_NAME = os.getenv("EXPERIMENT_NAME", "parkinson-disease-prediction-experiment")

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment(EXPERIMENT_NAME)

# Initialize MLflow client
client = MlflowClient()

# Specify the model name
model_name = "parkinson-disease-prediction-experiment"

# Get the latest versions of the registered model
latest_versions = client.get_latest_versions(model_name, stages=["Production"])

# Print run_ids of all the latest versions
for version in latest_versions:

    RUN_ID = version.run_id

RUN_ID

  latest_versions = client.get_latest_versions(model_name, stages=["Production"])


'6f4c13e86ae94d7a958349c35af3fbb1'

In [4]:
import boto3
import os

# Define S3 parameters
bucket_name = "s3-parkinson-disease-prediction"
s3_key_prefix = f"{RUN_ID}/artifacts"

# Create an S3 client
s3_client = boto3.client('s3')

# Load the model from the MLflow model registry
model_uri = f'runs:/{RUN_ID}/models'
local_path = mlflow.artifacts.download_artifacts(model_uri)

# Walk through the local directory and upload to S3
for root, dirs, files in os.walk(local_path):
    for file in files:
        local_file_path = os.path.join(root, file)
        s3_key = os.path.join(s3_key_prefix, os.path.relpath(local_file_path, local_path))
        s3_client.upload_file(local_file_path, bucket_name, s3_key)
        print(f"Uploaded {local_file_path} to s3://{bucket_name}/{s3_key}")

print("Model artifacts uploaded to S3 successfully.")

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Uploaded /var/folders/b4/lg6cghr91lg7q_c13xh443mw0000gn/T/tmpuo7a0414/models/python_env.yaml to s3://s3-parkinson-disease-prediction/6f4c13e86ae94d7a958349c35af3fbb1/artifacts/python_env.yaml
Uploaded /var/folders/b4/lg6cghr91lg7q_c13xh443mw0000gn/T/tmpuo7a0414/models/requirements.txt to s3://s3-parkinson-disease-prediction/6f4c13e86ae94d7a958349c35af3fbb1/artifacts/requirements.txt
Uploaded /var/folders/b4/lg6cghr91lg7q_c13xh443mw0000gn/T/tmpuo7a0414/models/MLmodel to s3://s3-parkinson-disease-prediction/6f4c13e86ae94d7a958349c35af3fbb1/artifacts/MLmodel
Uploaded /var/folders/b4/lg6cghr91lg7q_c13xh443mw0000gn/T/tmpuo7a0414/models/model.cb to s3://s3-parkinson-disease-prediction/6f4c13e86ae94d7a958349c35af3fbb1/artifacts/model.cb
Uploaded /var/folders/b4/lg6cghr91lg7q_c13xh443mw0000gn/T/tmpuo7a0414/models/conda.yaml to s3://s3-parkinson-disease-prediction/6f4c13e86ae94d7a958349c35af3fbb1/artifacts/conda.yaml
Uploaded /var/folders/b4/lg6cghr91lg7q_c13xh443mw0000gn/T/tmpuo7a0414/models/m

### Load model from the S3 and predict

In [5]:
logged_model = f's3://s3-parkinson-disease-prediction/{RUN_ID}/artifacts'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

 - mlflow (current: 2.14.2, required: mlflow==2.12.1)
 - ipywidgets (current: 8.1.2, required: ipywidgets==8.1.3)
 - lz4 (current: uninstalled, required: lz4==4.3.3)
 - pandas (current: 2.2.2, required: pandas==1.5.3)
 - scikit-learn (current: 1.4.2, required: scikit-learn==1.5.0)
 - scipy (current: 1.13.0, required: scipy==1.13.1)
 - xgboost (current: 2.0.3, required: xgboost==2.1.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [6]:
import pandas as pd

test_data = {
            'Age': 67,
            'BMI': 24.77259403457728,
            'AlcoholConsumption': 13.941385740391004,
            'PhysicalActivity': 2.472534322357399,
            'DietQuality': 9.593309739128369,
            'SleepQuality': 6.060992120577725,
            'SystolicBP': 92,
            'DiastolicBP': 60,
            'CholesterolTotal': 193.1381868106296,
            'CholesterolLDL': 137.5175503762128,
            'CholesterolHDL': 25.482467035806813,
            'CholesterolTriglycerides': 313.66674876926953,
            'UPDRS': 58.99092096333794,
            'MoCA': 19.902233615718146,
            'FunctionalAssessment': 5.104914484020905,
            'Gender': '1',
            'Ethnicity': '2',
            'EducationLevel': '1',
            'Smoking': '0',
            'FamilyHistoryParkinsons': '0',
            'TraumaticBrainInjury': '0',
            'Hypertension': '1',
            'Diabetes': '0',
            'Depression': '0',
            'Stroke': '0',
            'Tremor': '0',
            'Rigidity': '1',
            'Bradykinesia': '1',
            'PosturalInstability': '0',
            'SpeechProblems': '0',
            'SleepDisorders': '1',
            'Constipation': '0'
        }

# Predict
y_pred = loaded_model.predict(pd.DataFrame(test_data, index=[0]))
y_pred

array([0])

END OF NOTEBOOK!