# IMPORTING LIBRARIES

In [1]:
import sys
print(sys.executable)

D:\ShivTikoo\envs\CV_ClaimPrediction\python.exe


In [2]:
#IMPORTING LIBRARIES FOR PREPROCESSING

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
plt.style.use('ggplot')

#IMPORTING LIBRARIES FOR TRAINING
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

import mlflow
import mlflow.sklearn
import os
os.environ["GIT_PYTHON_REFRESH"] = "quiet"
import git


# FUNCTIONS TO PREPROCESS AND LOAD THE TRAINING DATA 

In [3]:
#FUNCTION TO LOAD DATA
def load_data(path):
    data=pd.read_excel(path)
    return data

In [None]:
df = load_data('C:/Users/Shiv.tikoo/Downloads/Project/Data/tmp_auto_poicies_202306261442.xlsx')
df.head() 

- TRAIN TEST SPLIT

In [5]:
def train_test(df_new):
    features = df_new.drop('clm_cnt',axis=1)   # Features
    label = df_new['clm_cnt']  # Target variable

    # Split the data into training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(features, label, test_size=0.2, random_state=42)
    
    return X_train, X_test, Y_train, Y_test

- HANDLING THE NULL VALUES IN THE DATAFRAME

In [6]:
def handling_null(df):
    # NULL VALUES
    print("na values available in data \n")
    print(df.isna().sum())
    
    # POLICY NUMBER WILL BE IRRELEVANT IN PREDICTION SO DROPPING
    df=df.drop('policy_number',axis=1)
    
    # DROPPING IRRELEVANT RECORDS
    df.drop_duplicates(keep='last',inplace=True)
    
    # DROPPING THE OUTLIERs 
    # df=df.drop(df[df['clm_cnt']>6].index)

    # FILLING NA VALUES OF VEHICLE_SUBTYPE WITH THE MODE OF THAT VEHICLE_MAKE
    df['vehicle_subtype']=df.groupby('vehicle_make')['vehicle_subtype'].transform(lambda x:x.fillna(x.mode().iloc[0]))
    
    # ASSUMING NAN CLAIM COUNT MEANING NO CLAIM HASNT BEEN TAKEN
    df['clm_cnt']=df['clm_cnt'].fillna(0)
    
    # ASSUMING NO 
    df['hypo_party']=df['hypo_party'].fillna("self")
    
    #CONSIDERING THERE WAS NO PREVIOUS INSURER AND DIGIT IS THE CUSTOMER'S FIRST INSURER
    df['prev_insurer']=df['prev_insurer'].fillna("new")
    
    df['veh_permit']=df.groupby('rto_location')['veh_permit'].transform(lambda x:x.fillna(x.mode().iloc[0]))
    
    # NULL VALUES
    print("\n na values POST PROCESING in data \n")
    print(df.isna().sum())
    
    return df

- REMOVING THE BIAS OF THE DATAFRAME BY OVERSAMPLING

In [7]:
def sampling(df):
    X = df.drop('clm_cnt', axis=1)  
    Y = df['clm_cnt'] 
    
    oversample = RandomOverSampler(sampling_strategy='auto')
    X,Y = oversample.fit_resample(X,Y)
    
    df_new = pd.concat([X, Y], axis=1)

    # Print the value counts of the target variable before and after oversampling
    print("Before oversampling:")
    print(df['clm_cnt'].value_counts())

    print("\nAfter oversampling:")
    print(df_new['clm_cnt'].value_counts())
    
    print("\nShape PRE oversampling:")
    print(df.shape)
    
    print("\nShape POST oversampling:")
    print(df_new.shape)
    
    return df_new
        

- ENCODING THE DATAFRAME TO FIT VARIOUS REGRESSION MODELS ON IT

In [8]:
def encoding(df):
    
    print("DATA TYPE of FEATURES available in data \n")
    print(df.dtypes)
    
    # PERFORMING FREQUENCY ENCODING
    for x in df.columns:
        if(df[x].dtype=='object'):
            df_frequency_map = (df.groupby(x)['clm_cnt'].sum()/df[x].value_counts()).to_dict()
            df[x] = df[x].map(df_frequency_map)   
        
    print("DATA TYPE of FEATURES post processing in data \n")
    print(df.dtypes) 
    
    return df


# FUNCTIONS TO PREPROCESS THE TESTING DATAFRAME

- HANDLING NULL VALUES

In [9]:
def handling_null_test(data):
    
    # DROP POLICY NUMBER
    data=data.drop('policy_number',axis=1)
    
    # ASSUMING NO HYPO_PARTY MEANS THAT THE VEHICLE WAS SELF FINANCED
    data['hypo_party']=data['hypo_party'].fillna("self")
    
    #CONSIDERING THERE WAS NO PREVIOUS INSURER AND DIGIT IS THE CUSTOMER'S FIRST INSURER
    data['prev_insurer']=data['prev_insurer'].fillna("new")
        
    return data

- ENCODING THE TEST DATAFRAME TO MAKE PREDICTIONS USING THE REGRESSION MODEL

In [10]:
def encode_testing_data(data, count_map):
    
    # Map the values in x_test to their encoded counterparts
    x = data.copy()  # Make a copy of x_test
    
    
    for j in x.columns:
        if x[j].dtypes=='object':
            x[j]=x[j].map(count_map[j]).fillna(0)
            #need to put default values for certain columns
            #new addition into the dictionary meaning no claim for it already exists so the default value being zero
             
        else:
            x[j]=x[j]
            
    
    return x


# MODEL TRAINING & METRICS

- FUNCTION TO MAKE PREDICTIONS

In [24]:
def predict_test(model,X_test):
    
    Y_pred = model.predict(X_test)
    Y_pred= np.round(Y_pred).astype(int)

    # CHECKER FOR NEGATOVE VALUES
    for i in range(len(Y_pred)):
        if Y_pred[i]<0:
            Y_pred[i]=0
    
    return Y_pred

- FUNCTION TO GET METRICS OF THE MODEL

In [25]:
def get_metrics(Y_test, Y_pred):
    
    Y_test=Y_test.fillna(0)
    
    # Evaluate the model
    mse = mean_squared_error(Y_test, Y_pred)
    print("Mean Squared Error:", mse)

    rmse = mean_squared_error(Y_test, Y_pred, squared=False)
    print("Root Mean Squared Error:", rmse)

    r2=r2_score(Y_test,Y_pred)
    print("R2 Score:",r2)

    mae=mean_absolute_error(Y_test,Y_pred)
    print('Mean Absolute Error: ',mae)
    
    return {'Mean Squared Error': round(mse, 4), 'Root Mean Squared Error': round(rmse,4), 'R2 SCORE': round(r2, 4), 'Mean Absolute Error': round(mae, 4)}


- MAIN PIPELINE
        
    HANDLE NULL VALUES ->  REMOVE BIAS -> TRAIN TEST SPLIT -> PERFORM ENCODING ON THE TRAINING DATAFRAME
.

- PARALLEL PIPELINE
    
     CREATION OF DICTIONARY TO STORE MAPPINGS OF ENCODINGS MADE

In [45]:
df = handling_null(df)

na values available in data 

policy_number           0
office_code             0
policy_period           0
imd_code                0
imd_channel             0
vehicle_make            0
vehicle_model           0
vehicle_subtype         0
fuel_type               0
rto_location            0
veh_permit              0
veh_age                 0
prev_insurer       463816
prev_ncb                0
policy_type             0
net_premium             0
sum_insured             0
hypo_party         246571
clm_cnt            619679
dtype: int64

 na values POST PROCESING in data 

office_code        0
policy_period      0
imd_code           0
imd_channel        0
vehicle_make       0
vehicle_model      0
vehicle_subtype    0
fuel_type          0
rto_location       0
veh_permit         0
veh_age            0
prev_insurer       0
prev_ncb           0
policy_type        0
net_premium        0
sum_insured        0
hypo_party         0
clm_cnt            0
dtype: int64


In [46]:
df['clm_cnt']=df['clm_cnt'].astype(int)

In [47]:
df_new = sampling(df)

Before oversampling:
clm_cnt
0    483741
1     20200
2      1111
3        77
4         2
5         2
6         1
Name: count, dtype: int64

After oversampling:
clm_cnt
1    483741
2    483741
3    483741
4    483741
5    483741
6    483741
0    483741
Name: count, dtype: int64

Shape PRE oversampling:
(505134, 18)

Shape POST oversampling:
(3386187, 18)


In [48]:
X_train, X_test, Y_train, Y_test = train_test(df_new)

- DICTIONARY FOR ENCODING THE TEST DATA

In [49]:
count_map={}

y=X_train.copy()

for x in y.columns:
    if y[x].dtypes=="object":
        count_map[x]=dict(y[x].value_counts())
 
    else:
        continue

In [50]:
X_train = encoding(X_train)

DATA TYPE of FEATURES available in data 

office_code          int64
policy_period        int64
imd_code             int64
imd_channel         object
vehicle_make        object
vehicle_model       object
vehicle_subtype     object
fuel_type           object
rto_location        object
veh_permit          object
veh_age              int64
prev_insurer        object
prev_ncb             int64
policy_type         object
net_premium        float64
sum_insured          int64
hypo_party          object
dtype: object
DATA TYPE of FEATURES post processing in data 

office_code          int64
policy_period        int64
imd_code             int64
imd_channel          int64
vehicle_make         int64
vehicle_model        int64
vehicle_subtype      int64
fuel_type            int64
rto_location         int64
veh_permit           int64
veh_age              int64
prev_insurer         int64
prev_ncb             int64
policy_type          int64
net_premium        float64
sum_insured          int64
hypo_

# NEW TRAINING PIPELINE
 
- TRAIN TEST SPLIT 
- HANDLE NULL VALUES OF X_TRAIN 
- REMOVE BIAS OF THE DATAFRAME
- PERFORM ENCODING OF THE CATEGORICAL COLUMNS

    - MAKE DICTIONARY TO SAVE THE ENCODINGS MADE SO CAN USE IT TO ENCODE THE TESTING DATA 


In [13]:
# TRAIN TEST SPLIT
X_train, X_test, Y_train, Y_test = train_test(df)

In [14]:
df_new= pd.concat([X_train,Y_train],axis=1)

In [15]:
df_new = handling_null(df_new)

na values available in data 

policy_number           0
office_code             0
policy_period           0
imd_code                0
imd_channel             0
vehicle_make            0
vehicle_model           0
vehicle_subtype         0
fuel_type               0
rto_location            0
veh_permit              0
veh_age                 0
prev_insurer       370882
prev_ncb                0
policy_type             0
net_premium             0
sum_insured             0
hypo_party         197041
clm_cnt            495707
dtype: int64

 na values POST PROCESING in data 

office_code        0
policy_period      0
imd_code           0
imd_channel        0
vehicle_make       0
vehicle_model      0
vehicle_subtype    0
fuel_type          0
rto_location       0
veh_permit         0
veh_age            0
prev_insurer       0
prev_ncb           0
policy_type        0
net_premium        0
sum_insured        0
hypo_party         0
clm_cnt            0
dtype: int64


In [16]:
df_new=sampling(df_new)

Before oversampling:
clm_cnt
0.0    394203
1.0     16353
2.0       870
3.0        69
5.0         2
6.0         1
4.0         1
Name: count, dtype: int64

After oversampling:
clm_cnt
0.0    394203
1.0    394203
2.0    394203
3.0    394203
5.0    394203
6.0    394203
4.0    394203
Name: count, dtype: int64

Shape PRE oversampling:
(411499, 18)

Shape POST oversampling:
(2759421, 18)


In [17]:
count_map={}

y=df_new.copy()

for x in y.columns:
    if y[x].dtypes=="object":
        count_map[x]=dict(y.groupby(x)['clm_cnt'].sum()/y[x].value_counts())
 
    else:
        continue

In [18]:
df_new=encoding(df_new)

DATA TYPE of FEATURES available in data 

office_code          int64
policy_period        int64
imd_code             int64
imd_channel         object
vehicle_make        object
vehicle_model       object
vehicle_subtype     object
fuel_type           object
rto_location        object
veh_permit          object
veh_age              int64
prev_insurer        object
prev_ncb             int64
policy_type         object
net_premium        float64
sum_insured          int64
hypo_party          object
clm_cnt            float64
dtype: object
DATA TYPE of FEATURES post processing in data 

office_code          int64
policy_period        int64
imd_code             int64
imd_channel        float64
vehicle_make       float64
vehicle_model      float64
vehicle_subtype    float64
fuel_type          float64
rto_location       float64
veh_permit         float64
veh_age              int64
prev_insurer       float64
prev_ncb             int64
policy_type        float64
net_premium        float64
sum_i

In [19]:
X_train = df_new.drop('clm_cnt',axis=1)   
Y_train = df_new['clm_cnt'] 

# MLFLOW EXPERIMENT LOGGING

In [21]:
# MLFLOW

def create_experiment(experiment_name,run_name,run_metrics,model,run_params=None):
    
    import mlflow
    mlflow.set_experiment(experiment_name)
    
    with mlflow.start_run():
        
        if not run_params==None:
            for param in run_params:
                mlflow.log_param(param,run_params[param])
                
        for metric in run_metrics:
            mlflow.log_metric(metric,run_metrics[metric])
            
            mlflow.sklearn.log_model(model,"model")
            
            print("Run - %s is logged to experiment- %s" %(run_name,experiment_name))

# LINEAR REGRESSOR

In [30]:
def training_lr(X_train,Y_train):
    # Create an instance of LinearRegression
    model = LinearRegression()

    # Fit the model to the training data
    model.fit(X_train, Y_train)
    
    return model

- TRAINING & LOGGING LINEAR REGRESSOR

In [31]:
model = training_lr(X_train,Y_train)

In [22]:
# TRANSFORMING TEST DATA TO FIT THE MODEL
X_test= handling_null_test(X_test)

# Encode testing data using the encoding dictionaries  
X_test = encode_testing_data(X_test, count_map)

# Filling claim count null values with 0
Y_test=Y_test.fillna(0)

In [35]:
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)

Mean Squared Error: 1.5791999875532874
Root Mean Squared Error: 1.2566622408401102
R2 Score: -37.89343976877271
Mean Absolute Error:  0.9491162834116439


In [39]:
experiment_name="NEW ENCODING LR"
run_name="Linear Regressor"
run_metrics=get_metrics(Y_test,Y_pred)

Mean Squared Error: 1.5791999875532874
Root Mean Squared Error: 1.2566622408401102
R2 Score: -37.89343976877271
Mean Absolute Error:  0.9491162834116439


In [40]:
from sklearn.metrics import confusion_matrix
fig= confusion_matrix(Y_test,Y_pred)

print(fig)

[[41913 48396 30985  2384   264    25     5]
 [  761  1598  1666   249    43     1     0]
 [   41    66    99    32     6     5     0]
 [    0     2     0     1     3     1     1]
 [    0     0     0     0     0     1     0]
 [    0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0]]


In [41]:
create_experiment(experiment_name,run_name,run_metrics,model)

2023/07/19 14:37:52 INFO mlflow.tracking.fluent: Experiment with name 'NEW ENCODING LR' does not exist. Creating a new experiment.


Run - Linear Regressor is logged to experiment- NEW ENCODING LR
Run - Linear Regressor is logged to experiment- NEW ENCODING LR
Run - Linear Regressor is logged to experiment- NEW ENCODING LR
Run - Linear Regressor is logged to experiment- NEW ENCODING LR


# SUPPORT VECTOR REGRESSOR

In [42]:
def training_svr(X_train,Y_train):
    
    from sklearn import linear_model
    
    # Create a Support Vector Regression model
    model = linear_model.SGDRegressor()

    # Train the model
    model.fit(X_train, Y_train)
    
    return model

In [43]:
def training_linsvr(X_train,Y_train):
    
    from sklearn.svm import LinearSVR
    
    # Create a Support Vector Regression model
    model = LinearSVR()

    # Train the model
    model.fit(X_train, Y_train)
    
    return model

In [44]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [45]:
model=training_svr(X_train_scaled,Y_train)

In [46]:
Y_pred = predict_test(model,X_test_scaled)
run_metrics = get_metrics(Y_test, Y_pred)

Mean Squared Error: 1.5807869433985748
Root Mean Squared Error: 1.2572934993065759
R2 Score: -37.932524224238044
Mean Absolute Error:  0.9510144070697327


In [47]:
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

[[41644 48720 30911  2404   263    25     5]
 [  774  1590  1657   255    41     1     0]
 [   41    67    97    33     6     5     0]
 [    0     2     0     1     3     1     1]
 [    0     0     0     0     0     1     0]
 [    0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0]]


In [50]:
model=training_linsvr(X_train_scaled,Y_train)



In [51]:
Y_pred = predict_test(model,X_test_scaled)
run_metrics = get_metrics(Y_test, Y_pred)

Mean Squared Error: 1.7651849892647105
Root Mean Squared Error: 1.3286026453626798
R2 Score: -42.473984676935736
Mean Absolute Error:  0.9844338301646077


In [52]:
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

[[43779 43744 30826  5213   299   103     8]
 [  852  1387  1606   391    67    15     0]
 [   47    56    85    42    11     7     1]
 [    0     1     1     1     2     2     1]
 [    0     0     0     0     0     1     0]
 [    0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0]]


In [53]:
experiment_name="NEW ENCODING SVR"
run_name="Support Vector Regressor"
create_experiment(experiment_name,run_name,run_metrics,model)

Run - Support Vector Regressor is logged to experiment- NEW ENCODING SVR
Run - Support Vector Regressor is logged to experiment- NEW ENCODING SVR
Run - Support Vector Regressor is logged to experiment- NEW ENCODING SVR
Run - Support Vector Regressor is logged to experiment- NEW ENCODING SVR


# GRADIENT BOOSTING REGRESSOR

In [27]:
def training_xgr(X_train,Y_train):
    
    import xgboost as xgb
    from sklearn.metrics import make_scorer,mean_absolute_error

    # Define the parameter grid
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.1, 0.01, 0.001]
        # Add other hyperparameters specific to XGBoost Regressor
    }

    scoring=make_scorer(mean_absolute_error)
    
    # Create an instance of the XGBoost Regressor
    model = xgb.XGBRegressor()

    # Create GridSearchCV object
    bestModel = RandomizedSearchCV(model, param_grid, cv=4, scoring=scoring)

    # Fit the grid search to the data
    bestModel.fit(X_train,Y_train)

    # Print the best hyperparameters and corresponding score
    print("Best Hyperparameters: ", bestModel.best_params_)
    print("Best Score: ", bestModel.best_score_)

    # Return the best estimator
    return bestModel.best_estimator_ , bestModel.best_params_
    

In [28]:
model,run_params=training_xgr(X_train,Y_train)
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)

Best Hyperparameters:  {'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.001}
Best Score:  2.388575749897732
Mean Squared Error: 0.5203425957618943
Root Mean Squared Error: 0.7213477633998003
R2 Score: -11.815294811867005
Mean Absolute Error:  0.5193313003702897


In [29]:
# CONFUSION MATRIX FOR VISUALIZATION
from sklearn.metrics import confusion_matrix
what=confusion_matrix(Y_test,Y_pred)
print(what)

[[58615 65357     0     0     0]
 [ 1080  3238     0     0     0]
 [   54   195     0     0     0]
 [    0     8     0     0     0]
 [    0     1     0     0     0]]


In [30]:
experiment_name="NEW ENCODING XGR"
run_name="Gradient Boosting Regressor"
create_experiment(experiment_name,run_name,run_metrics,model,run_params)



Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR


- ADDING MORE VERSIONS TO THE MLFLOW MODEL REGISTRY

In [54]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators= 300, max_depth= 9, learning_rate= 0.1)
model.fit(X_train,Y_train)
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

Mean Squared Error: 0.3442838472788375
Root Mean Squared Error: 0.5867570598457572
R2 Score: -7.479219340830303
Mean Absolute Error:  0.32089958614680897
[[86529 36166  1221    55     1]
 [ 1973  2192   142    11     0]
 [   86   144    16     2     1]
 [    2     1     5     0     0]
 [    1     0     0     0     0]]


In [55]:
run_params={'n_estimators': 300, 'max_depth': 9, 'learning_rate': 0.1}
experiment_name="NEW ENCODING XGR"
run_name="Gradient Boosting Regressor"
create_experiment(experiment_name,run_name,run_metrics,model,run_params)

2023/07/19 15:47:38 INFO mlflow.tracking.fluent: Experiment with name 'NEW ENCODING XGR' does not exist. Creating a new experiment.


Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR


- NEW VERSION

In [56]:
model = xgb.XGBRegressor(n_estimators= 300, max_depth=11, learning_rate= 0.1)
model.fit(X_train,Y_train)
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

Mean Squared Error: 0.2626334131997386
Root Mean Squared Error: 0.5124777197105632
R2 Score: -5.468285789045164
Mean Absolute Error:  0.24628154463702276
[[96205 26946   778    42     1]
 [ 2544  1674    91     9     0]
 [  114   122    10     2     1]
 [    3     2     3     0     0]
 [    1     0     0     0     0]]


In [57]:
run_params={'n_estimators': 300, 'max_depth': 11, 'learning_rate': 0.1}
experiment_name="NEW ENCODING XGR"
run_name="Gradient Boosting Regressor"
create_experiment(experiment_name,run_name,run_metrics,model,run_params)

Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR


- NEW VERSION

In [58]:
model = xgb.XGBRegressor(n_estimators= 400, max_depth=13, learning_rate= 0.1)
model.fit(X_train,Y_train)
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

Mean Squared Error: 0.18506705666365872
Root Mean Squared Error: 0.43019420807776887
R2 Score: -3.5579372329428764
Mean Absolute Error:  0.17129788094719484
[[106190  17140    608     34      0]
 [  3066   1175     69      8      0]
 [   148     90      8      2      1]
 [     3      3      2      0      0]
 [     1      0      0      0      0]]


In [59]:
run_params={'n_estimators': 400, 'max_depth': 13, 'learning_rate': 0.1}
experiment_name="NEW ENCODING XGR"
run_name="Gradient Boosting Regressor"
create_experiment(experiment_name,run_name,run_metrics,model,run_params)

Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW ENCODING XGR


- NEW VERSION

In [None]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators= 500, max_depth=15, learning_rate= 0.1)
model.fit(X_train,Y_train)
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

In [None]:
run_params={'n_estimators': 500, 'max_depth': 15, 'learning_rate': 0.1}
experiment_name="NEW ENCODING XGR"
run_name="Gradient Boosting Regressor"
create_experiment(experiment_name,run_name,run_metrics,model,run_params)

- NEW VERSION

In [None]:
import xgboost as xgb

model = xgb.XGBRegressor(n_estimators= 600, max_depth=15, learning_rate= 0.1)
model.fit(X_train,Y_train)
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)

In [None]:
from sklearn.metrics import confusion_matrix
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

In [None]:
run_params={'n_estimators': 500, 'max_depth': 18, 'learning_rate': 0.1}
experiment_name="NEW ENCODING XGR"
run_name="Gradient Boosting Regressor"
create_experiment(experiment_name,run_name,run_metrics,model,run_params)

- NEW VERSION

In [None]:
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators= 800, max_depth=15, learning_rate= 0.1)
model.fit(X_train,Y_train)
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

In [None]:
run_params={'n_estimators': 800, 'max_depth': 23, 'learning_rate': 0.1}
experiment_name="NEW ENCODING XGR"
run_name="Gradient Boosting Regressor"
create_experiment(experiment_name,run_name,run_metrics,model,run_params)

- LOGGING THE MODEL

In [37]:
experiment_name="NEW ENCODING XGR"
run_name="Gradient Boosting Regressor"
create_experiment(experiment_name,run_name,run_metrics,model,run_params)

Run - Gradient Boosting Regressor is logged to experiment- NEW DATA XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW DATA XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW DATA XGR
Run - Gradient Boosting Regressor is logged to experiment- NEW DATA XGR


In [38]:
run_params

{'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.1}

- TRAINING THE XGBOOST MODEL ON UNSEEN DATA

In [51]:
# MAKING PREDICTIONS
ans_pred = predict_test(model,test_features)

# GETTING METRICS FOR THE PREDICTIONS MADE
run_metrics = get_metrics(test_ans, ans_pred)

# CONFUSION MATRIX FOR VISUALIZATION
from sklearn.metrics import confusion_matrix
a=confusion_matrix(test_ans,ans_pred)
print(a)

Mean Squared Error: 0.29488937921164055
Root Mean Squared Error: 0.5430371803216061
R2 Score: -8.678263669692988
Mean Absolute Error:  0.2805772556503071
[[11999  4222   108]
 [  215   183    14]
 [   12    14     2]]


# RANDOM FOREST REGRESSOR

In [32]:
def training_rf(X_train, Y_train):
    
    # define random parameters grid
    n_estimators = [5,21,51,101,201] # number of trees in the random forest
    max_features = [1.0, 'sqrt'] # number of features in consideration at every split
    min_samples_split = [2, 6, 10] # minimum sample number to split a node
    min_samples_leaf = [1, 3, 4] # minimum sample number that can be stored in a leaf node
    bootstrap = [True, False] # method used to sample data points

    random_grid = {'n_estimators': n_estimators,
                    'max_features': max_features,
                    'min_samples_split': min_samples_split,
                    'min_samples_leaf': min_samples_leaf,
                    'bootstrap': bootstrap
                  }
    
    model=RandomForestRegressor()
    
    # Perform grid search for hyperparameter tuning
    model_tuning = RandomizedSearchCV(estimator = model, param_distributions = random_grid,
                                      n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1)
    
    model_tuning.fit(X_train, Y_train)
    
    print ('Random grid: ', random_grid, '\n')
    
    # print the best parameters
    print ('Best Parameters: ', model_tuning.best_params_, ' \n')

    best_params = model_tuning.best_params_
    
    n_estimators = best_params['n_estimators']
    min_samples_split = best_params['min_samples_split']
    min_samples_leaf = best_params['min_samples_leaf']
    max_features = best_params['max_features']
    bootstrap = best_params['bootstrap']
    
    model_tuned = RandomForestRegressor(n_estimators = n_estimators, min_samples_split = min_samples_split,
                                         min_samples_leaf= min_samples_leaf, max_features = max_features,
                                         bootstrap=bootstrap) 
    model_tuned.fit( X_train, Y_train)
    
    return model_tuned,best_params

In [None]:
model=RandomForestRegressor(n_estimators=101, min_samples_split= 6, min_samples_leaf= 1, max_features= 'sqrt', bootstrap= False)

In [110]:
model.fit(X_train,Y_train)

In [None]:
model,best_params=training_rf(X_train,Y_train)
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


- ADDING MULTIPLE VERSIONS TO THE MODEL REGISTRY

In [69]:
model=RandomForestRegressor(n_estimators=101, min_samples_split= 8, max_features= 'sqrt', bootstrap= False)

In [70]:
model.fit(X_train,Y_train)
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)

Mean Squared Error: 0.005693714764971842
Root Mean Squared Error: 0.07545670788585891
R2 Score: 0.9985747591738768
Mean Absolute Error:  0.005342287349498994


In [71]:
from sklearn.metrics import confusion_matrix
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

[[94272  2379    65     3     0     0     0]
 [    0 95884   579    45     0     0     0]
 [    0     0 96533   431     0     0     0]
 [    0     0     0 97029     0     0     0]
 [    0     0     0     0 96897     0     0]
 [    0     0     0     0     0 96589     0]
 [    0     0     0     0     0     0 96532]]


In [72]:
best_params={'n_estimators':101, 'min_samples_split': 8, 'max_features': 'sqrt', 'bootstrap': False}

In [None]:
experiment_name="NEW ENCODING RF"
run_name="Random Forest Regressor"
create_experiment(experiment_name,run_name,run_metrics,model,best_params)

- NEW VERSION

In [63]:
model=RandomForestRegressor(n_estimators=200, min_samples_split= 6, min_samples_leaf= 2, max_features= 'sqrt', bootstrap= False)

In [64]:
model.fit(X_train,Y_train)
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

Mean Squared Error: 0.007775700713781566
Root Mean Squared Error: 0.08817993373654556
R2 Score: 0.998053600054366
Mean Absolute Error:  0.007409507440515742
[[92894  3752    70     3     0     0     0]
 [    0 95867   596    45     0     0     0]
 [    0     0 96533   431     0     0     0]
 [    0     0     0 97029     0     0     0]
 [    0     0     0     0 96897     0     0]
 [    0     0     0     0     0 96589     0]
 [    0     0     0     0     0     0 96532]]


In [65]:
best_params={'n_estimators':200, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False}

experiment_name="NEW DATA RF"
run_name="Random Forest Regressor"
create_experiment(experiment_name,run_name,run_metrics,model,best_params)

Run - Random Forest Regressor is logged to experiment- NEW DATA RF
Run - Random Forest Regressor is logged to experiment- NEW DATA RF
Run - Random Forest Regressor is logged to experiment- NEW DATA RF
Run - Random Forest Regressor is logged to experiment- NEW DATA RF


- NEW VERSION

In [74]:
model=RandomForestRegressor(n_estimators=101, min_samples_split= 10, max_features= 'sqrt', bootstrap= False)

In [75]:
model.fit(X_train,Y_train)
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

Mean Squared Error: 0.005653846948930804
Root Mean Squared Error: 0.07519206706116546
R2 Score: 0.9985847388165909
Mean Absolute Error:  0.005305372705016552
[[94296  2356    64     3     0     0     0]
 [    0 95884   579    45     0     0     0]
 [    0     0 96533   431     0     0     0]
 [    0     0     0 97029     0     0     0]
 [    0     0     0     0 96897     0     0]
 [    0     0     0     0     0 96589     0]
 [    0     0     0     0     0     0 96532]]


In [76]:
best_params={'n_estimators':150, 'min_samples_split': 8, 'max_features': 'sqrt', 'bootstrap': False}

experiment_name="NEW DATA RF"
run_name="Random Forest Regressor"
create_experiment(experiment_name,run_name,run_metrics,model,best_params)

Run - Random Forest Regressor is logged to experiment- NEW DATA RF
Run - Random Forest Regressor is logged to experiment- NEW DATA RF
Run - Random Forest Regressor is logged to experiment- NEW DATA RF
Run - Random Forest Regressor is logged to experiment- NEW DATA RF


- PREPROCESSING ON DATAFRAME SPECIFIC TO RF REGRESSOR

In [125]:
def preprocess_rf(X_train):
    columns=["rto_location","veh_permit","hypo_party","sum_insured","office_code","imd_code","vehicle_subtype","veh_age","net_premium"]
    trial=X_train[columns]
    
    return trial


- TRAINING OF THE BEST MODEL

In [126]:
model=RandomForestRegressor(n_estimators=101, min_samples_split= 6, min_samples_leaf= 1, max_features= 'sqrt', bootstrap= False)

In [127]:
trial=preprocess_rf(X_train)

In [128]:
model.fit( trial, Y_train )

In [129]:
test_trial=preprocess_rf(X_test)

In [130]:
Y_pred = predict_test(model,test_trial)
run_metrics = get_metrics(Y_test, Y_pred)

Mean Squared Error: 0.007123049799331992
Root Mean Squared Error: 0.0843981622983107
R2 Score: 0.9982169705017588
Mean Absolute Error:  0.006623963805929378


In [131]:
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

[[93752  2854   107     6     0     0     0]
 [    5 95612   847    44     0     0     0]
 [    0     0 96504   460     0     0     0]
 [    0     0     0 97029     0     0     0]
 [    0     0     0     0 96897     0     0]
 [    0     0     0     0     0 96589     0]
 [    0     0     0     0     0     0 96532]]


- ORIGINAL DATATSET TRAINED MODEL METRICS

In [54]:
Y_pred = predict_test(model,X_test)
run_metrics = get_metrics(Y_test, Y_pred)

Mean Squared Error: 0.005696667936530437
Root Mean Squared Error: 0.07547627399739892
R2 Score: 0.9985740199410833
Mean Absolute Error:  0.005362959550409162


In [55]:
fig=confusion_matrix(Y_test,Y_pred)
print(fig)

[[94325  2331    60     3     0     0     0]
 [    0 95840   624    44     0     0     0]
 [    0     0 96504   460     0     0     0]
 [    0     0     0 97029     0     0     0]
 [    0     0     0     0 96897     0     0]
 [    0     0     0     0     0 96589     0]
 [    0     0     0     0     0     0 96532]]


# FEATURE IMPORTANCE

In [111]:
importances=model.feature_importances_

In [112]:
indices=np.argsort(importances)[::-1]
print("FEATURE IMPORTANCES: ")
x=0;
for i in range(len(X_train.columns)):
    if(x<=12):
        x+=1
        print(f"{X_train.columns[indices[i]]}:{importances[indices[i]]}")

FEATURE IMPORTANCES: 
rto_location:0.2260306366584007
veh_permit:0.13551657452790172
hypo_party:0.13222661942549896
office_code:0.10981604300150269
sum_insured:0.10088776425189987
imd_code:0.08924670251661741
vehicle_subtype:0.04777021515126693
veh_age:0.04754357819681572
net_premium:0.04657156675420492
vehicle_model:0.0139937438642389
policy_type:0.011776436708035866
imd_channel:0.011597825440868365
prev_insurer:0.011392723958900468


In [None]:
experiment_name="NEW DATA RF"
run_name="Random Forest Regressor"
create_experiment(experiment_name,run_name,run_metrics,model,best_params)

- PREDICTIONS FOR UNSEEN DATA USING THE RF MODEL

In [None]:
ans_pred = predict_test(model,test_features)
run_metrics = get_metrics(test_ans, ans_pred)

In [58]:
from sklearn.metrics import confusion_matrix
test=confusion_matrix(test_ans,ans_pred)

In [59]:
test

array([[15930,   398,     1],
       [  363,    46,     3],
       [   22,     5,     1]], dtype=int64)

- NEW MODEL TRAINED ON TOP 8 FEATURES

In [132]:
newTest_features=preprocess_rf(test_features)

In [133]:
ans_pred = predict_test(model,newTest_features)
run_metrics = get_metrics(test_ans, ans_pred)

Mean Squared Error: 0.050629137098216946
Root Mean Squared Error: 0.22500919336377556
R2 Score: -0.6616472913183715
Mean Absolute Error:  0.04788598008229471


In [134]:
from sklearn.metrics import confusion_matrix
test=confusion_matrix(test_ans,ans_pred)
print(test)

[[15940   388     1]
 [  363    48     1]
 [   22     5     1]]


#  ADDING THE MODELS TO THE MLFLOW MODEL REGISTRY

In [84]:
import mlflow
run_name="Gradient Boosting Regressor"
with mlflow.start_run(run_name=run_name) as run:
    result = mlflow.register_model(
        "runs:/09e4677206524c88a5c36def165b6c73/model",
        "NewData_XGR"
    )

Registered model 'NewData_XGR' already exists. Creating a new version of this model...
2023/07/11 15:55:14 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: NewData_XGR, version 10
Created version '10' of model 'NewData_XGR'.


In [89]:
import mlflow
run_name="Random Forest Regressor"
with mlflow.start_run(run_name=run_name) as run:
    result = mlflow.register_model(
        "runs:/1fb31800b853442cbb73dd52e6f708d2/model",
        "NewData_RF"
    )

Registered model 'NewData_RF' already exists. Creating a new version of this model...
2023/07/11 15:59:36 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: NewData_RF, version 6
Created version '6' of model 'NewData_RF'.


# SETTING STAGES OF THE MODELS IN THE MLFLOW REGISTRY

In [3]:
import mlflow
client = mlflow.tracking.MlflowClient()
client.transition_model_version_stage(
    name="NewData_RF",
    version=1,
    stage="Production"
)

<ModelVersion: aliases=[], creation_timestamp=1688103098374, current_stage='Production', description='', last_updated_timestamp=1688103138176, name='NewData_RF', run_id='5f1d45c6a221454a888238dc7364215a', run_link='', source='mlflow-artifacts:/229557385950538072/5f1d45c6a221454a888238dc7364215a/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [41]:
import mlflow
client = mlflow.tracking.MlflowClient()
client.transition_model_version_stage(
    name="NewData_XGR",
    version=2,
    stage="Staging"
)

<ModelVersion: aliases=[], creation_timestamp=1688548242090, current_stage='Staging', description='', last_updated_timestamp=1688548253335, name='NewData_XGR', run_id='d3df4bc673f049f1af31681e0f0b055b', run_link='', source='mlflow-artifacts:/414263798559657782/d3df4bc673f049f1af31681e0f0b055b/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>

# AUTOMATED PIPELINE TO COMPARE AND REGISTER THE BEST MODEL

In [90]:
def target_encoding(data):
    count_map={}

    y=data.copy()

    for x in y.columns:
        if y[x].dtypes=="object":
            count_map[x]=dict(y[x].value_counts())
 
        else:
            continue
        
    return count_map

In [91]:
def train_model(model, dataset):
    
    #splitting the dataframe
    features = dataset.drop('clm_cnt',axis=1)   # Features
    labels = dataset['clm_cnt']  # Target variable
    
    loaded_model= mlflow.sklearn.load_model(model)

    trained_model= loaded_model.fit(features,labels)
    
    return trained_model

In [92]:
def evaluate_model(model, dataset):
    
    #splitting the dataframe
    features = dataset.drop('clm_cnt',axis=1)   # Features
    labels = dataset['clm_cnt']  # Target variable
    labels=labels.fillna(0)
    
    pred = model.predict(features)
    pred= np.round(pred).astype(int)
    
    # CHECKER FOR NEGATiVE VALUES
    for i in range(len(pred)):
        if pred[i]<0:
            pred[i]=0
            
    from sklearn.metrics import confusion_matrix
    test=confusion_matrix(labels,pred)
    print(test)
    
    mae=mean_absolute_error(labels,pred)
    print('Mean Absolute Error: ',mae)
    
    return mae

In [93]:
import mlflow
from mlflow.tracking import MlflowClient

def select_best_model_from_registry(registry_uri, new_dataset):
    
    mlflow.set_tracking_uri(registry_uri)
    client = MlflowClient()
    
    #PREPROCESSING OF THE NEW DATAFRAME
    data=handling_null(new_dataset)
    mapping=target_encoding(data)
    train_df=encoding(data)
    train_df=sampling(train_df)
    
    #TESTING DATAFRAME
    test_df=pd.read_csv('C:/Users/Shiv.tikoo/Downloads/Project/Data/tmp_auto_poicies_202306301517.csv')
    
    #TRANSFORMING TEST DATA TO FIT THE MODEL
    test_df=drop_testing_data(test_df)
    test_df=handling_null_test(test_df)

    #TARGET ENCODING  
    test_df = encode_testing_data(test_df, mapping)
    

    #ALL REGISTERED MODELS IN THE REGISTRY
    all_models = client.search_model_versions("", order_by=["creation_timestamp desc" ])
    best_model = None
    best_metrics = None

    for model in all_models:
        model_name = model.name
        model_version = model.version

        # Load the model from the registry
        model_uri = f"models:/{model_name}/{model_version}"
        loaded_model = mlflow.pyfunc.get_model_dependencies(model_uri)
        
        trained_model = train_model(model_uri,train_df)

        # Evaluate the trained model using desired metrics
        model_metrics = evaluate_model(trained_model, test_df)

        if best_metrics is None or model_metrics < best_metrics:
            best_model = model
            best_metrics = model_metrics

    if best_model:
        # Set the best model to production in the registry
        client.transition_model_version_stage(
            name=best_model.name,
            version=best_model.version,
            stage="production"
        )
        print(f"The best model '{best_model.name}:{best_model.version}' has been set to PRODUCTION.")
    else:
        print("No models found in the registry.")


In [94]:
registry_uri = "http://127.0.0.1:5000"
new_dataset = pd.read_excel('C:/Users/Shiv.tikoo/Downloads/Project/Data/tmp_auto_poicies_202306261442.xlsx')

select_best_model_from_registry(registry_uri, new_dataset)

na values available in data 

policy_number           0
office_code             0
policy_period           0
imd_code                0
imd_channel             0
vehicle_make            0
vehicle_model           0
vehicle_subtype         0
fuel_type               0
rto_location            0
veh_permit              0
veh_age                 0
prev_insurer       463816
prev_ncb                0
policy_type             0
net_premium             0
sum_insured             0
hypo_party         246571
clm_cnt            619679
dtype: int64

 na values POST PROCESING in data 

office_code        0
policy_period      0
imd_code           0
imd_channel        0
vehicle_make       0
vehicle_model      0
vehicle_subtype    0
fuel_type          0
rto_location       0
veh_permit         0
veh_age            0
prev_insurer       0
prev_ncb           0
policy_type        0
net_premium        0
sum_insured        0
hypo_party         0
clm_cnt            0
dtype: int64
DATA TYPE of FEATURES available in 

2023/07/11 16:03:49 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmptw2pv57z\requirements.txt'.


[[15751   577     1]
 [  337    72     3]
 [   20     7     1]]
Mean Absolute Error:  0.057606297334366986


2023/07/11 16:21:55 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmp020_ro1q\requirements.txt'.


[[15929   400     0]
 [  355    54     3]
 [   22     5     1]]
Mean Absolute Error:  0.0481245154749836


2023/07/11 16:31:48 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmpdsvesqa0\requirements.txt'.


[[15735   594     0]
 [  336    73     3]
 [   21     6     1]]
Mean Absolute Error:  0.05850080505695032


2023/07/11 16:51:58 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmp_egmj24l\requirements.txt'.


[[15927   402     0]
 [  357    52     3]
 [   23     4     1]]
Mean Absolute Error:  0.04842268471584471


2023/07/11 17:02:56 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmptdhwtz6l\requirements.txt'.


[[15930   399     0]
 [  356    53     3]
 [   23     4     1]]
Mean Absolute Error:  0.048184149323155824


2023/07/11 17:13:09 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmpk0myjki1\requirements.txt'.


[[11230  4976   123]
 [  167   226    19]
 [    9    17     2]]
Mean Absolute Error:  0.32458703560140734


2023/07/11 17:18:20 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmp7heb4qqk\requirements.txt'.


[[12841  3381   107]
 [  219   176    17]
 [   11    15     2]]
Mean Absolute Error:  0.23066372473015684


2023/07/11 18:21:02 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmp53zzwzm0\requirements.txt'.


[[13519  2775    35]
 [  256   147     9]
 [   13    14     1]]
Mean Absolute Error:  0.18784662174250105


2023/07/11 18:39:10 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmpef5k_vkm\requirements.txt'.


[[14565  1717    47]
 [  298   105     9]
 [   17     9     2]]
Mean Absolute Error:  0.12886874590017294


2023/07/11 19:02:58 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmpqmksvejn\requirements.txt'.


[[15297  1028     4]
 [  337    68     7]
 [   19     8     1]]
Mean Absolute Error:  0.08503786749358937


2023/07/11 19:42:27 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmpp4e4damr\requirements.txt'.


[[15630   696     3]
 [  342    66     4]
 [   22     5     1]]
Mean Absolute Error:  0.06541833144492815


2023/07/11 20:56:12 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmpqh5pxgk5\requirements.txt'.


[[15717   607     5]
 [  355    54     3]
 [   23     3     2]]
Mean Absolute Error:  0.0610650605283559


2023/07/11 21:50:28 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmpn_ce0mh4\requirements.txt'.


[[15688   638     3]
 [  347    61     4]
 [   22     5     1]]
Mean Absolute Error:  0.062257737491800344


2023/07/11 22:41:30 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmpumi39j22\requirements.txt'.


[[12841  3381   107]
 [  219   176    17]
 [   11    15     2]]
Mean Absolute Error:  0.23066372473015684


2023/07/11 22:54:36 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmpfqcu70cj\requirements.txt'.


[[15927   402     0]
 [  355    54     3]
 [   23     4     1]]
Mean Absolute Error:  0.04830341701950027


2023/07/11 23:09:34 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\SHIV~1.TIK\AppData\Local\Temp\tmp5133rg80\requirements.txt'.


[[10436  5712   181]
 [  152   228    32]
 [    8    16     4]]
Mean Absolute Error:  0.37509690500327986
The best model 'NewData_RF:5' has been set to PRODUCTION.


# MODEL SERVING

In [None]:
#SERVING THE MODEL FROM THE MODEL REGISTRY

import mlflow
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [None]:
import requests

#BATCH PREDICTION

X_test=drop_testing_data(X_test)

# TRANSFORMING TEST DATA TO FIT THE MODEL
X_test=handling_null_test(X_test)

# Encode testing data using the encoding dictionaries  
X_test = encode_testing_data(X_test, count_map)

# Convert to list
lst = X_test.values.tolist()

inference_request = {
        "dataframe_records": lst
}
endpoint = "http://localhost:1568/invocations"
response = requests.post(endpoint, json=inference_request)
print(response)

In [None]:
print(response.text)