In [13]:
import pandas as pd
import mlflow
import pickle
from sklearn.metrics import roc_auc_score

<h3> Read Pickled dataframees </h3>

In [2]:
df_full = pd.read_pickle("./df_full.pkl")
df_train = pd.read_pickle("./df_train.pkl")
df_val = pd.read_pickle("./df_val.pkl")
df_test = pd.read_pickle("./df_test.pkl")

y_train = pd.read_pickle("./y_train.pkl")
y_val = pd.read_pickle("./y_val.pkl")
y_test = pd.read_pickle("./y_test.pkl")


<h3> Set mlflow tracking uri and experiment </h3>

In [3]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("kkbox-churn-prediction3")

<Experiment: artifact_location=('/Users/sarveshthakur/Documents/MLOps '
 'Zoomcamp/mlops-zoomcamp-practice/capstone project/mlruns/3'), creation_time=1692204804852, experiment_id='3', last_update_time=1692204804852, lifecycle_stage='active', name='kkbox-churn-prediction3', tags={}>

<h3> Dict Vectorizer </h3>

In [4]:
from sklearn.feature_extraction import DictVectorizer

In [5]:
numerical = ['registration_init_time', 'num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq', 'total_secs']

In [6]:
categorical = ['gender', 'registered_via', 'city']

In [7]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
test_dicts = df_test[categorical + numerical].to_dict(orient='records')

In [8]:
dv = DictVectorizer(sparse=False)

In [9]:
X_train = dv.fit_transform(train_dicts)
X_val = dv.fit_transform(val_dicts)
X_test = dv.fit_transform(test_dicts)

In [10]:
dv.feature_names_

['city',
 'city=ns',
 'gender',
 'num_100',
 'num_25',
 'num_50',
 'num_75',
 'num_985',
 'num_unq',
 'registered_via',
 'registered_via=ns',
 'registration_init_time',
 'total_secs']

<h2> Training LightGBM </h2>

In [14]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

In [12]:
lgbm = LGBMClassifier()

<h3> Model evaluation on test data </h3>

In [25]:
mlflow.lightgbm.autolog()

run = mlflow.active_run()
if(run.info.status == 'RUNNING'):
   mlflow.end_run()

with mlflow.start_run():
    lgbm.fit(X_train, y_train.values.ravel(), eval_set=(X_val, y_val.values.ravel()))
    y_pred = lgbm.predict(X_test)

    score = roc_auc_score(y_test, y_pred)
    score

mlflow.end_run()



[LightGBM] [Info] Number of positive: 37966, number of negative: 557792
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2075
[LightGBM] [Info] Number of data points in the train set: 595758, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.063727 -> initscore=-2.687295
[LightGBM] [Info] Start training from score -2.687295


In [26]:
with open('models/lgbm3.bin', 'wb') as f_out:
    pickle.dump((dv, lgbm), f_out)

In [27]:
mlflow.log_artifact(local_path="models/lgbm3.bin", artifact_path="models_pickl3")
mlflow.lightgbm.log_model(lgbm,artifact_path="models_pickle3")

<mlflow.models.model.ModelInfo at 0x173da3490>

<h2> Training Logistic Regression </h2>

In [11]:
from sklearn.linear_model import LogisticRegression

In [14]:
mlflow.sklearn.autolog()

# run = mlflow.active_run()
# if(run.info.status == 'RUNNING'):
#    mlflow.end_run()

with mlflow.start_run():
    mlflow.set_tag("developer", "st")

    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    score = roc_auc_score(y_val, y_pred)
# score

    y_pred = model.predict(X_test)
    score = roc_auc_score(y_test, y_pred)

    mlflow.log_metric("roc_auc_score", score)

mlflow.end_run()

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))


<h2> Training LightGBM </h2>

In [None]:
from lightgbm import LGBMClassifier

In [None]:
lgbm = LGBMClassifier()

In [33]:
mlflow.lightgbm.autolog()
# import logging
# logging.getLogger("mlflow").setLevel(logging.DEBUG)
with mlflow.start_run():
    lgbm.fit(X_train, y_train.values.ravel(), eval_set=(X_val, y_val.values.ravel()))
    y_pred = lgbm.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    score

mlflow.end_run()



[LightGBM] [Info] Number of positive: 37966, number of negative: 557792
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2075
[LightGBM] [Info] Number of data points in the train set: 595758, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.063727 -> initscore=-2.687295
[LightGBM] [Info] Start training from score -2.687295


<h2> Training Random Forest</h2>

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
rf = RandomForestClassifier()

In [17]:
rf.fit(X_train, y_train.values.ravel())

2023/08/17 12:23:56 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c1501dc1caba4c389a2bc29bf3c2c133', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [49]:
mlflow.sklearn.autolog(disable=True)

# run = mlflow.active_run()
# if(run.info.status == 'RUNNING'):
#    mlflow.end_run()

with mlflow.start_run():
    y_pred = rf.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    score
    mlflow.log_metric("roc_auc_score", score)

    with open('models/preproccesor.b', 'wb') as f_out:
        pickle.dump((dv), f_out)

    mlflow.log_artifact(local_path="models/preproccesor.b", artifact_path="preproccesor")

    with open('models/rf.bin', 'wb') as f_out:
        pickle.dump((dv, rf), f_out)

    mlflow.log_artifact(local_path="models/rf.bin", artifact_path="models_pickl31")
    mlflow.sklearn.log_model(rf,artifact_path="models_pickle31")

mlflow.end_run()

In [52]:
logged_model = 'runs:/8eb6c1d188c7441c900ee29f0f4917ec/models_pickle31'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
y_pred = loaded_model.predict(pd.DataFrame(X_test))
score = roc_auc_score(y_test, y_pred)
score

0.6298289177933627