# Assignment 5

### 1. Load "ModifiedEdibleMushroom.csv" data from the link below (note: this data set has been preliminarily prepared.)

https://github.com/pvateekul/2110446_DSDE_2023s2/blob/main/code/Week03_ML/mushroom2020_dataset.csv

In [24]:
!pip install scikit-learn
!pip install mlflow



In [25]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline

In [26]:
df = pd.read_csv("https://github.com/pvateekul/2110446_DSDE_2023s2/raw/main/code/Week03_ML/mushroom2020_dataset.csv")

In [27]:
df.head()

Unnamed: 0,id,label,cap-shape,cap-surface,bruises,odor,gill-attachment,gill-spacing,gill-size,stalk-shape,...,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate,gill-color-rate,veil-color-rate,stalk-color-above-ring-rate,stalk-color-below-ring-rate
0,1,p,x,s,t,p,f,c,n,e,...,o,p,k,s,u,1.0,3.0,1.0,1.0,1.0
1,2,e,x,s,t,a,f,c,b,e,...,o,p,n,n,g,2.0,3.0,1.0,1.0,1.0
2,3,e,b,s,t,l,f,c,b,e,...,o,p,n,n,m,3.0,1.0,1.0,1.0,1.0
3,4,p,x,y,t,p,f,c,n,e,...,o,p,k,s,u,3.0,1.0,1.0,1.0,1.0
4,5,e,x,s,f,n,f,w,b,t,...,o,e,n,a,g,4.0,3.0,1.0,1.0,1.0


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5824 entries, 0 to 5823
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           5824 non-null   int64  
 1   label                        5764 non-null   object 
 2   cap-shape                    5824 non-null   object 
 3   cap-surface                  5797 non-null   object 
 4   bruises                      5725 non-null   object 
 5   odor                         5725 non-null   object 
 6   gill-attachment              5725 non-null   object 
 7   gill-spacing                 5694 non-null   object 
 8   gill-size                    5703 non-null   object 
 9   stalk-shape                  5703 non-null   object 
 10  stalk-root                   5793 non-null   object 
 11  stalk-surface-above-ring     5793 non-null   object 
 12  stalk-surface-below-ring     5793 non-null   object 
 13  veil-type         

In [29]:
print("grill size NA number:",df["gill-size"].isna().sum())

grill size NA number: 121


### 2. Drop rows where the target (label) variable is missing

In [30]:
df.dropna(subset=["label"], axis=0, inplace=True)
print("df shape:",df.shape)

df shape: (5764, 24)


### 3. Drop the following variables:

```'id','gill-attachment', 'gill-spacing', 'gill-size','gill-color-rate','stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring-rate','stalk-color-below-ring-rate','veil-color-rate','veil-type'```

In [31]:
# print("df shape:",df.shape)
df.drop(columns=['id','gill-attachment', 'gill-spacing', 'gill-size','gill-color-rate','stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring-rate','stalk-color-below-ring-rate','veil-color-rate','veil-type'], inplace=True)
print("df shape:",df.shape)

df shape: (5764, 12)


In [32]:
df.head()

Unnamed: 0,label,cap-shape,cap-surface,bruises,odor,stalk-shape,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate
0,p,x,s,t,p,e,o,p,k,s,u,1.0
1,e,x,s,t,a,e,o,p,n,n,g,2.0
2,e,b,s,t,l,e,o,p,n,n,m,3.0
3,p,x,y,t,p,e,o,p,k,s,u,3.0
4,e,x,s,f,n,t,o,e,n,a,g,4.0


In [33]:
rows,cols = df.shape
missing_values = df.isnull().sum().sum()

print("Number of rows:", rows)
print("Number of digits:", cols)
print("Missing values:", missing_values)


Number of rows: 5764
Number of digits: 12
Missing values: 640


### Use Pipeline



In [34]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

num_cols = ['cap-color-rate']
cat_cols = ['cap-shape', 'cap-surface', 'bruises', 'odor', 'stalk-shape', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']

# Initialize pipeline for numerical and categorical columns

num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())])

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot',OneHotEncoder(handle_unknown='ignore'))])

In [35]:
from sklearn.compose import ColumnTransformer

col_trans = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols)],
    n_jobs=-1)

In [36]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model_pipeline = Pipeline(steps=[
    ('col_trans', col_trans),
    ('model', model)
])

In [37]:
from sklearn import set_config

set_config(display='diagram')
display(model_pipeline)

In [38]:
from sklearn.model_selection import train_test_split

X = df[num_cols + cat_cols]
y = df['label']
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2020)

In [39]:
df.head()

Unnamed: 0,label,cap-shape,cap-surface,bruises,odor,stalk-shape,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate
0,p,x,s,t,p,e,o,p,k,s,u,1.0
1,e,x,s,t,a,e,o,p,n,n,g,2.0
2,e,b,s,t,l,e,o,p,n,n,m,3.0
3,p,x,y,t,p,e,o,p,k,s,u,3.0
4,e,x,s,f,n,t,o,e,n,a,g,4.0


### Use MLFlow

In [40]:
# Importing all Libraries
import mlflow
import mlflow.sklearn

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings("ignore")

In [41]:
local_registry = "sqlite:///mlruns.db"
mlflow.set_tracking_uri(local_registry)
experiment_id = mlflow.set_experiment('test_experiment')

def eval_metrics(actual, pred):
    accuracy = accuracy_score(actual, pred)
    return accuracy

def train_model(param_grid, n_splits):

    # Starting the Experiement
    with mlflow.start_run():

        # Model building
        model = GridSearchCV(model_pipeline, param_grid, cv=StratifiedKFold(n_splits), scoring='accuracy')
        model.fit(X_train, y_train)   # Model Training
        y_pred = model.predict(X_test)  # Model Prediction on Testing data
        (accuracy) = eval_metrics(y_test, y_pred)

        print('Accuracy: {:.4f}'.format(accuracy))

        # Logging Parameters
        mlflow.log_param("param_grid", param_grid)
        mlflow.log_param("n_splits", n_splits)

        # Logging Metrics
        mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))

        # Model Logging
        mlflow.sklearn.log_model(model, 'model')

        return model

In [42]:
param_grid = {
    'model__criterion':['entropy', 'gini'],
    'model__max_depth': [2, 3, 6],
    'model__min_samples_leaf':[2, 5, 10],
    'model__n_estimators': [100, 200],
    'model__random_state':[2020]
    }
n_splits = 5

train_model(param_grid, n_splits)

Accuracy: 0.9965


In [43]:
# Search run id
best_run_df = mlflow.search_runs(order_by=['metrics.accuracy DESC'], max_results=1)
best_run_df

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.accuracy,params.param_grid,params.n_splits,tags.mlflow.source.type,tags.mlflow.log-model.history,tags.mlflow.user,tags.mlflow.runName,tags.mlflow.source.name
0,dd4ae748407443dc82e3adf84ff3cafa,1,FINISHED,/Users/jackkahod/Desktop/File/3-2/Data-Sci/211...,2024-02-12 16:20:05.848000+00:00,2024-02-12 16:20:33.549000+00:00,0.996531,"{'model__criterion': ['entropy', 'gini'], 'mod...",5,LOCAL,"[{""run_id"": ""dd4ae748407443dc82e3adf84ff3cafa""...",jackkahod,sassy-chimp-269,/Users/jackkahod/anaconda3/envs/datasci/lib/py...


In [44]:
run_id = str(best_run_df.loc[0, 'run_id'])
print('run_id: ', run_id)
model_uri = f"runs:/{run_id}/model"
print('model_uri: ', model_uri)

run_id:  dd4ae748407443dc82e3adf84ff3cafa
model_uri:  runs:/dd4ae748407443dc82e3adf84ff3cafa/model


In [45]:
# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(model_uri=f"runs:/{run_id}/model")

# Predict on a Pandas DataFrame.
predicted = loaded_model.predict(pd.DataFrame(X_test))

print(classification_report(y_test, predicted, digits=4))

              precision    recall  f1-score   support

           e     0.9906    1.0000    0.9953       421
           p     1.0000    0.9945    0.9973       732

    accuracy                         0.9965      1153
   macro avg     0.9953    0.9973    0.9963      1153
weighted avg     0.9966    0.9965    0.9965      1153



In [46]:
mlflow.register_model(model_uri=model_uri, name="Murshroom")

model_name = 'Murshroom'
model_version = 1

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}")

# Predict on a Pandas DataFrame.
predicted = loaded_model.predict(pd.DataFrame(X_test))


Registered model 'Murshroom' already exists. Creating a new version of this model...
Created version '2' of model 'Murshroom'.


In [47]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test, predicted, digits=4))
print(confusion_matrix(y_test,predicted))

              precision    recall  f1-score   support

           e     0.9906    1.0000    0.9953       421
           p     1.0000    0.9945    0.9973       732

    accuracy                         0.9965      1153
   macro avg     0.9953    0.9973    0.9963      1153
weighted avg     0.9966    0.9965    0.9965      1153

[[421   0]
 [  4 728]]


### Initialize Database

In [48]:
!mlflow ui --backend-store-uri sqlite:///mlruns.db