In [1]:
import mlflow
import pandas as pd

from dotenv import load_dotenv
from loguru import logger
from database.database import db
from ml_engineering.pipeline.inference_pipeline.main import InferenceExecutor


# Load environment variables from the pipeline's .env file. The path is
# relative, so it resolves against the notebook's working directory.
# NOTE(review): presumably this supplies MLFLOW_TRACKING_URI, which the next cell reads — confirm.
load_dotenv('../../ml_engineering/pipeline/.env')

True

In [2]:
import os
# Sanity check: display the tracking URI. Indexing os.environ raises KeyError
# if the variable is not set, so a missing value surfaces here.
os.environ['MLFLOW_TRACKING_URI']

'http://localhost:5000/'

In [3]:
# Read the pre-split feature/label tables in one pass; the first CSV column
# of each file is used as the DataFrame index.
X_train, y_train, X_valid, y_valid = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../data/processed/feature_train.csv',
        '../../data/processed/label_train.csv',
        '../../data/processed/feature_valid.csv',
        '../../data/processed/label_valid.csv',
    )
)

In [4]:
# Name under which the model is registered in the MLflow Model Registry.
model_name = "titanic-log-reg-model"
# Registry URI template: models:/<registered model name>/<version>.
# (The previous "models://run_id/name/version" form is not a registry URI.)
MODEL_URI = "models:/{}/{}"

# Look up every registered version of the model and fail with a clear message
# (same exception type as the bare `[0]` would raise) when there is none.
model_versions = mlflow.search_model_versions(filter_string=f"name='{model_name}'")
if not model_versions:
    raise IndexError(f"No registered versions found for model '{model_name}'")

# The first hit is used as-is.
# NOTE(review): result ordering is not controlled here — confirm that the
# first entry is the version you intend to use.
model_version = model_versions[0]
model_uri = MODEL_URI.format(model_version.name, model_version.version)

In [5]:
# Column groupings, label name and source query for the Titanic training table.
numerical_cols = ["p_class", "fare"]
categorical_cols = ["sex", "embarked"]
columns_remove = ["created_at", "ticket"]
label = "survived"
# Plain string literal: the query has no placeholders to interpolate.
query_titanic_train_dataset = "SELECT * FROM titanic_train"

In [6]:
# Build the project's inference executor around the shared `db` handle, then
# ask it to load the registered model identified by `model_name`.
# NOTE(review): presumably this resolves the newest registered version via MLflow — confirm in InferenceExecutor.
inf_executor = InferenceExecutor(db=db)
inf_executor.load_latest_model(model_name=model_name)

In [7]:
# Score both splits with the loaded model.
y_predict_train = inf_executor.predict(X_train)
# NOTE(review): despite the `_test` suffix, these are predictions for the validation split (X_valid).
y_predict_test = inf_executor.predict(X_valid)

In [37]:
!pip install numpy==1.22.4

Collecting numpy==1.22.4
  Downloading numpy-1.22.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.3
    Uninstalling numpy-1.24.3:
      Successfully uninstalled numpy-1.24.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ml-titanic 0.1 requires numpy==1.24.3, but you have numpy 1.22.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.22.4


In [41]:
import numpy as np
import pandas as pd
import shap 
import seaborn

# Report the version of each library imported above, one per line.
for lib in (np, pd, shap, seaborn):
    print(lib.__version__)

1.24.3
1.5.3
0.41.0
0.12.2


In [42]:
import shap
import sklearn


# `import sklearn` alone does not load the `linear_model` submodule, so import
# it explicitly instead of relying on another package having loaded it.
import sklearn.linear_model

# Fit an ordinary least-squares model on the training features and labels.
model = sklearn.linear_model.LinearRegression().fit(X_train, y_train)

In [48]:
# Rebind X_train to an all-float copy (the dtype listing below confirms every
# column is float64 afterwards).
X_train = X_train.astype(float)
# Display the per-column dtypes.
X_train.dtypes

p_class         float64
sex             float64
age             float64
sib_sp          float64
parch           float64
fare            float64
rank            float64
followers       float64
title_Dr        float64
title_Master    float64
title_Miss      float64
title_Mr        float64
title_Mrs       float64
family_0        float64
family_1        float64
family_2        float64
family_3        float64
family_4        float64
family_5        float64
family_6        float64
family_7        float64
cabin_A         float64
cabin_B         float64
cabin_C         float64
cabin_D         float64
cabin_E         float64
cabin_F         float64
cabin_G         float64
cabin_N         float64
cabin_T         float64
embarked_Q      float64
embarked_S      float64
dtype: object

In [49]:
# Summarize the training matrix with shap.kmeans (second argument: 25) to use
# as the explainer's background data.
# NOTE(review): the name says "normed", but X_train has not been passed
# through `norm` at this point.
df_train_normed_summary = shap.kmeans(X_train.values, 25)

In [50]:
# Wrap the fitted model's predict function in a KernelExplainer, with the
# k-means summary as background data.
explainer = shap.KernelExplainer(model.predict, df_train_normed_summary)

In [51]:
# Compute SHAP values for every training row.
# NOTE(review): the recorded run of this cell fails with
# "module 'numpy' has no attribute 'int'" (versions printed above: numpy 1.24.3, shap 0.41.0).
shap_values = explainer.shap_values(X_train.values)

  0%|                                                                                                                                                                                                                                                                                  | 0/712 [00:00<?, ?it/s]


AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [53]:
# Per-feature summary statistics, one row per feature (describe() transposed).
train_stats = X_train.describe().T
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
p_class,712.0,2.299157,0.836307,1.0,2.0,3.0,3.0,3.0
sex,712.0,0.366573,0.482207,0.0,0.0,0.0,1.0,1.0
age,712.0,29.786868,13.233471,0.42,21.26,30.0,36.0,80.0
sib_sp,712.0,0.516854,1.134919,0.0,0.0,0.0,1.0,8.0
parch,712.0,0.391854,0.829885,0.0,0.0,0.0,0.0,6.0
fare,712.0,32.865817,51.481324,0.0,7.925,14.4583,31.275,512.3292
rank,712.0,0.973315,0.161276,0.0,1.0,1.0,1.0,1.0
followers,712.0,0.908708,1.655688,0.0,0.0,0.0,1.0,10.0
title_Dr,712.0,0.007022,0.083564,0.0,0.0,0.0,0.0,1.0
title_Master,712.0,0.039326,0.194506,0.0,0.0,0.0,0.0,1.0


In [54]:
def norm(x: pd.DataFrame, stats=None) -> pd.DataFrame:
    """Standardize each column of ``x`` to zero mean and unit variance.

    Args:
        x: Frame whose columns match the index of the statistics table.
        stats: Table with ``'mean'`` and ``'std'`` columns indexed by feature
            name (e.g. ``df.describe().transpose()``). Defaults to the
            notebook-level ``train_stats``.

    Returns:
        A new frame holding ``(x - mean) / std`` per column. Columns whose
        std is zero are only centered, so they come out as 0 instead of
        NaN/inf.
    """
    if stats is None:
        # Fall back to the statistics computed from the training split.
        stats = train_stats
    # A constant column has std == 0; divide by 1 there to avoid 0/0.
    safe_std = stats['std'].replace(0, 1.0)
    return (x - stats['mean']) / safe_std

# Standardize the training features with the statistics in `train_stats`.
df_train_normed = norm(X_train)

# Preview the first rows of the standardized frame.
df_train_normed.head()

Unnamed: 0,p_class,sex,age,sib_sp,parch,fare,rank,followers,title_Dr,title_Master,...,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_N,cabin_T,embarked_Q,embarked_S
57,0.838021,-0.760198,-0.097243,-0.45541,-0.472178,-0.497979,0.165465,-0.54884,-0.084037,-0.202184,...,-0.230607,-0.25956,-0.194545,-0.198395,-0.125179,-0.075112,0.542339,-0.037477,-0.294789,-1.66232
717,-0.357713,1.313599,-0.210592,-0.45541,-0.472178,-0.434445,0.165465,-0.54884,-0.084037,-0.202184,...,-0.230607,-0.25956,-0.194545,5.033362,-0.125179,-0.075112,-1.841275,-0.037477,-0.294789,0.600724
431,-1.553446,-0.760198,-0.135026,-0.45541,-0.472178,-0.122682,0.165465,-0.54884,-0.084037,-0.202184,...,-0.230607,3.847259,-0.194545,-0.198395,-0.125179,-0.075112,-1.841275,-0.037477,-0.294789,0.600724
633,-1.553446,-0.760198,0.272274,-0.45541,-0.472178,-0.638403,0.165465,-0.54884,-0.084037,-0.202184,...,-0.230607,-0.25956,-0.194545,-0.198395,-0.125179,-0.075112,0.542339,-0.037477,-0.294789,0.600724
163,0.838021,-0.760198,-0.966252,-0.45541,-0.472178,-0.470138,0.165465,-0.54884,-0.084037,-0.202184,...,-0.230607,-0.25956,-0.194545,-0.198395,-0.125179,-0.075112,0.542339,-0.037477,-0.294789,0.600724


In [55]:
# `import sklearn` alone does not load the `linear_model` submodule, so import
# it explicitly instead of relying on another package having loaded it.
import sklearn.linear_model

# Refit on the standardized features; this rebinds `model`, replacing the
# earlier fit on the raw features.
model = sklearn.linear_model.LinearRegression().fit(df_train_normed, y_train)

In [59]:
# Rebuild the background summary from the standardized features
# (shap.kmeans with second argument 25); overwrites the earlier summary.
df_train_normed_summary = shap.kmeans(df_train_normed.values, 25)

In [60]:
# Recreate the explainer for the current `model` and background summary.
explainer = shap.KernelExplainer(model.predict, df_train_normed_summary)

In [62]:
# Compute SHAP values for every standardized training row.
# NOTE(review): the recorded run of this cell fails with the same
# "module 'numpy' has no attribute 'int'" error shown in its output.
shap_values = explainer.shap_values(df_train_normed.values)

  0%|                                                                                                                                                                                                                                                                                  | 0/712 [00:00<?, ?it/s]


AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations