In [54]:
import pandas as pd
import mlflow
import pickle

<h3> Read Pickled dataframees </h3>

In [3]:
df_full = pd.read_pickle("./df_full.pkl")
df_train = pd.read_pickle("./df_train.pkl")
df_val = pd.read_pickle("./df_val.pkl")
df_test = pd.read_pickle("./df_test.pkl")

y_train = pd.read_pickle("./y_train.pkl")
y_val = pd.read_pickle("./y_val.pkl")
y_test = pd.read_pickle("./y_test.pkl")


<h3> Set mlflow tracking uri and experiment </h3>

In [84]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("kkbox-churn-prediction2")

2023/08/16 12:34:01 INFO mlflow.tracking.fluent: Experiment with name 'kkbox-churn-prediction2' does not exist. Creating a new experiment.


<Experiment: artifact_location=('/Users/sarveshthakur/Documents/MLOps '
 'Zoomcamp/mlops-zoomcamp-practice/capstone project/mlruns/2'), creation_time=1692203641870, experiment_id='2', last_update_time=1692203641870, lifecycle_stage='active', name='kkbox-churn-prediction2', tags={}>

<h3> Dict Vectorizer </h3>

In [5]:
from sklearn.feature_extraction import DictVectorizer

In [6]:
numerical = ['registration_init_time', 'num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq', 'total_secs']

In [7]:
categorical = ['gender', 'registered_via', 'city']

In [8]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
test_dicts = df_test[categorical + numerical].to_dict(orient='records')

In [9]:
dv = DictVectorizer(sparse=False)

In [10]:
X_train = dv.fit_transform(train_dicts)
X_val = dv.fit_transform(val_dicts)
X_test = dv.fit_transform(test_dicts)

In [11]:
dv.feature_names_

['city',
 'city=ns',
 'gender',
 'num_100',
 'num_25',
 'num_50',
 'num_75',
 'num_985',
 'num_unq',
 'registered_via',
 'registered_via=ns',
 'registration_init_time',
 'total_secs']

<h2> Training Logistic Regression </h2>

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
len(X_train),len(y_train)

(595758, 595758)

In [14]:
X_train.shape, y_train.shape

((595758, 13), (595758, 1))

In [15]:
X_val.shape, y_val.shape

((198586, 13), (198586, 1))

In [19]:
from sklearn.metrics import roc_auc_score

In [26]:
mlflow.sklearn.autolog()

#run = mlflow.active_run()
#if(run.info.status == 'RUNNING'):
#    mlflow.end_run()

with mlflow.start_run():
    mlflow.set_tag("developer", "st")

    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    score = roc_auc_score(y_val, y_pred)
# score

    y_pred = model.predict(X_test)
    score = roc_auc_score(y_test, y_pred)

    mlflow.log_metric("roc_auc_score", score)

mlflow.end_run()

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))


<h3> Logistic Regression Model Prediction on validation data </h3>

In [139]:
y_pred = model.predict(X_val)

In [123]:
model.predict_proba(X_val)

array([[0.5       , 0.5       ],
       [0.92769234, 0.07230766],
       [0.97051756, 0.02948244],
       ...,
       [0.98991972, 0.01008028],
       [0.89549084, 0.10450916],
       [0.95012366, 0.04987634]])

In [124]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.5       , 0.07230766, 0.02948244, ..., 0.01008028, 0.10450916,
       0.04987634])

In [125]:
churn_decision = (y_pred >= 0.49)

In [126]:
df_val[churn_decision]

Unnamed: 0,msno,city,gender,registered_via,registration_init_time,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,6mSj0Fs+oeIIg9dvcYaZ53sxFOnBsktVQlb+qBd0coc=,ns,0.0,ns,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,mZQor4G6jYCC9eRmqkncpTb0M+159r8P8da1m6yxgmI=,ns,0.0,ns,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,y351ihBgxdWoDLpESqYBg1/dvTvHgWa1JYruQaB5B6c=,ns,0.0,ns,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,5smXFx7PUuzHncs1GDZwFModKyGePcHhcET+HTn30Ao=,ns,0.0,ns,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,aNcW+YeoIbuoVYp3zXhc9J045wkELHQFoEM+RNCjeKI=,ns,0.0,ns,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
198514,gQ9XngR84x1GNpjdTx/sJWVvXigxR/CRuyNRKw+6nww=,ns,0.0,ns,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198527,yJ98WpkTrzUhB2TT38l3ao5hWNkg4mkEEBK1cww+yGU=,ns,0.0,ns,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198537,2W+1Lu9mCAh2iQA9HvW/s4sSi2BxFwJCFfOzUv0NpGg=,ns,0.0,ns,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198539,zbdwhel4lrh3PnKEVzl4pCAL3mESiCX0bcLzu+A0MPw=,ns,0.0,ns,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [127]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = churn_decision.astype(int)
df_pred['actual'] = y_train


In [128]:
df_pred

Unnamed: 0,probability,prediction,actual
0,0.500000,1,0
1,0.072308,0,0
2,0.029482,0,0
3,0.079215,0,0
4,0.071943,0,0
...,...,...,...
198581,0.035046,0,0
198582,0.097451,0,0
198583,0.010080,0,0
198584,0.104509,0,0


In [129]:
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [130]:
df_pred.correct.mean()

0.8342330275044565

In [23]:
y_pred = model.predict(X_train)
score = roc_auc_score(y_train, y_pred)
score

0.5

In [131]:
from sklearn.metrics import roc_auc_score

<h3> Model evaluation on validation data </h3>

In [141]:
score = roc_auc_score(y_val, y_pred)
score

0.5

<h3> Model evaluation on test data </h3>

In [142]:
y_pred = model.predict(X_test)

In [143]:
score = roc_auc_score(y_test, y_pred)
score

0.5

<h2> Training Random Forest</h2>

In [144]:
from sklearn.ensemble import RandomForestClassifier

In [145]:
rf = RandomForestClassifier()

In [146]:
rf.fit(X_train, y_train.values.ravel())

In [147]:
y_pred = rf.predict(X_train)

In [148]:
score = roc_auc_score(y_train, y_pred)
score

0.8602065888937045

<h3> Model evaluation on validation data </h3>

In [149]:
y_pred = rf.predict(X_val)

In [150]:
score = roc_auc_score(y_val, y_pred)
score

0.6311941667682298

<h3> Model evaluation on test data </h3>

In [151]:
y_pred = rf.predict(X_test)

In [152]:
score = roc_auc_score(y_test, y_pred)
score

0.6288367751524799

<h2> Training LightGBM </h2>

In [85]:
from lightgbm import LGBMClassifier

In [86]:
lgbm = LGBMClassifier()

In [70]:
mlflow.lightgbm.autolog()

lgbm.fit(X_train, y_train, eval_set=(X_val, y_val))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 37966, number of negative: 557792
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2075
[LightGBM] [Info] Number of data points in the train set: 595758, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.063727 -> initscore=-2.687295
[LightGBM] [Info] Start training from score -2.687295




In [71]:
y_pred = lgbm.predict(X_train)

In [72]:
score = roc_auc_score(y_train, y_pred)
score

0.6370321797194192

<h3> Model evaluation on validation data </h3>

In [73]:
y_pred = lgbm.predict(X_val)

In [74]:
score = roc_auc_score(y_val, y_pred)
score

0.6331832060719743

<h3> Model evaluation on test data </h3>

In [87]:
mlflow.lightgbm.autolog(disable_for_unsupported_versions=True)
# import logging
# logging.getLogger("mlflow").setLevel(logging.DEBUG)
lgbm.fit(X_train, y_train.values.ravel(), eval_set=(X_val, y_val.values.ravel()))
y_pred = lgbm.predict(X_test)
score = roc_auc_score(y_test, y_pred)
score



2023/08/16 12:34:29 DEBUG mlflow.utils.autologging_utils: Called autolog() method for lightgbm autologging with args '()' and kwargs '{'log_input_examples': False, 'log_model_signatures': True, 'log_models': True, 'disable': False, 'exclusive': False, 'disable_for_unsupported_versions': True, 'silent': False, 'registered_model_name': None}'


[LightGBM] [Info] Number of positive: 37966, number of negative: 557792
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2075
[LightGBM] [Info] Number of data points in the train set: 595758, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.063727 -> initscore=-2.687295
[LightGBM] [Info] Start training from score -2.687295


2023/08/16 12:34:31 DEBUG mlflow.utils.autologging_utils: Invoked patched API '<module 'sklearn.metrics' from '/Users/sarveshthakur/anaconda3/envs/exp-tracking-env/lib/python3.9/site-packages/sklearn/metrics/__init__.py'>.roc_auc_score' for sklearn autologging with args '(        0
0       0
1       0
2       0
3       0
4       0
...    ..
198582  0
198583  0
198584  0
198585  0
198586  0

[198587 rows x 1 columns], array([0, 0, 0, ..., 0, 0, 0]))' and kwargs '{}'
2023/08/16 12:34:31 DEBUG mlflow.utils.autologging_utils: Original function invoked during execution of patched API '<module 'sklearn.metrics' from '/Users/sarveshthakur/anaconda3/envs/exp-tracking-env/lib/python3.9/site-packages/sklearn/metrics/__init__.py'>.roc_auc_score' for sklearn autologging. Original function was invoked with args '(        0
0       0
1       0
2       0
3       0
4       0
...    ..
198582  0
198583  0
198584  0
198585  0
198586  0

[198587 rows x 1 columns], array([0, 0, 0, ..., 0, 0, 0]))' and kwa

0.6298328286010286

In [82]:
with open('models/lgbm.bin', 'wb') as f_out:
    pickle.dump((dv, lgbm), f_out)

In [89]:
mlflow.log_artifact(local_path="models/lgbm.bin", artifact_path="models_pickl4")
mlflow.lightgbm.log_model(lgbm,artifact_path="models_pickle4")

2023/08/16 12:35:08 DEBUG mlflow.models.model: 
Traceback (most recent call last):
  File "/Users/sarveshthakur/anaconda3/envs/exp-tracking-env/lib/python3.9/site-packages/mlflow/models/model.py", line 550, in log
    mlflow.tracking.fluent._record_logged_model(mlflow_model)
  File "/Users/sarveshthakur/anaconda3/envs/exp-tracking-env/lib/python3.9/site-packages/mlflow/tracking/fluent.py", line 993, in _record_logged_model
    MlflowClient()._record_logged_model(run_id, mlflow_model)
  File "/Users/sarveshthakur/anaconda3/envs/exp-tracking-env/lib/python3.9/site-packages/mlflow/tracking/client.py", line 1396, in _record_logged_model
    self._tracking_client._record_logged_model(run_id, mlflow_model)
  File "/Users/sarveshthakur/anaconda3/envs/exp-tracking-env/lib/python3.9/site-packages/mlflow/tracking/_tracking_service/client.py", line 404, in _record_logged_model
    self.store.record_logged_model(run_id, mlflow_model)
  File "/Users/sarveshthakur/anaconda3/envs/exp-tracking-env/lib

<mlflow.models.model.ModelInfo at 0x2dd279100>

In [43]:
score = roc_auc_score(y_test, y_pred)
score

0.8669172574290727