In [39]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,root_mean_squared_error
import joblib
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats import ks_2samp

<h3>Model Training</h3>

In [13]:
diamonds=sns.load_dataset(name="diamonds")

In [15]:
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [23]:
diamonds.shape

(53940, 10)

In [16]:
diamonds.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [17]:
X=diamonds[['carat', 'cut', 'color', 'clarity', 'depth', 'table',  'x', 'y','z']]
y=diamonds['price']

In [18]:
preprocessor=ColumnTransformer(transformers=[
    ("numeric",StandardScaler(),["carat","depth","table"]),
    ("categorical",OneHotEncoder(drop="first",sparse_output=False),["cut","color","clarity"])
],
                              )

In [19]:
model=RandomForestRegressor(n_estimators=100,random_state=42)

In [20]:
modelPipeline=Pipeline(steps=[
    ("preprocessor",preprocessor),
    ("regressor",model)
])

In [21]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

In [24]:
y_train.shape,y_test.shape

((43152,), (10788,))

In [22]:
modelPipeline.fit(X=X_train,y=y_train)

In [25]:
y_pred=modelPipeline.predict(X=X_test)

In [32]:
rmse=root_mean_squared_error(y_true=y_test,y_pred=y_pred)

In [34]:
print(f"Model MSE: {rmse}")

Model MSE: 615.9010356650112


In [37]:
joblib.dump(value=modelPipeline,filename="model_pipeline.joblib")
print("Model Pipeline Saved Successfully")

Model Pipeline Saved Successfully


<h3>Data Drift</h3>

In [44]:
driftScores={}
for column in X_train.columns:
    ks_statistic,p_value=ks_2samp(data1=X_train[column].values,data2=X_test[column].values)
    driftScores[column]=float(ks_statistic)

In [45]:
driftScores

{'carat': 0.009895253985910263,
 'cut': 0.004148127549128661,
 'color': 0.00857434186132744,
 'clarity': 0.006743604004449344,
 'depth': 0.012027252502780916,
 'table': 0.007068038561364398,
 'x': 0.0121199480904709,
 'y': 0.010266036336670364,
 'z': 0.008898776418242493}

In [48]:
overall_drift_score=float(np.mean(list(driftScores.values())))
overall_drift_score

0.008860153256704976

In [49]:
threshold=0.1

In [55]:
isDriftData=overall_drift_score>threshold
isDriftData

False

<h3>Concept Drift</h3>

In [51]:
y_pred_reference=modelPipeline.predict(X=X_train)
y_pred_current=modelPipeline.predict(X=X_test)

In [58]:
rmse_reference=root_mean_squared_error(y_true=y_train,y_pred=y_pred_reference)
rmse_current=root_mean_squared_error(y_true=y_test,y_pred=y_pred_current)

In [59]:
relative_performance_decrease=(rmse_current-rmse_reference)/rmse_reference
relative_performance_decrease

1.6739797198562067

In [60]:
isDriftConcept=relative_performance_decrease>threshold
isDriftConcept

True