In [1]:
# Standard library
import os
import pickle
import re

# Third-party
import mlflow
import mlflow.pyfunc
import numpy as np
import pandas as pd
from mlflow.models import infer_signature
from scipy.sparse import hstack, csr_matrix
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# Project helpers (kept last, as in the original import order)
from ML_Utils import *


In [2]:
# Load the data:
# Read the raw books CSV and discard the "Unnamed: 0" column in one step
# (pandas gives that name to a header-less column, typically a saved index).
data = pd.read_csv("/data1/notebooks/H_Level/ML/Data/books_task.csv").drop(columns=['Unnamed: 0'])


In [3]:
# EDA: run a series of overview reports on the raw frame before any cleaning.
# DataOverview is not defined in this notebook; presumably it is provided by the
# `from ML_Utils import *` star import — TODO confirm.
# NOTE(review): what each method prints/returns is not visible here; the
# descriptions below are inferred from the method names only.
data_overview = DataOverview(data)
data_overview.get_data_head()         # presumably the first rows of the frame
data_overview.get_column_names()      # presumably the column labels
data_overview.get_data_types()        # presumably the dtype of each column
data_overview.describe_numerical()    # presumably summary stats for numeric columns
data_overview.describe_categorical()  # presumably summary stats for text columns
data_overview.get_data_info()         # presumably a DataFrame.info()-style report
data_overview.get_null_count()        # presumably the missing-value count per column

# Insights:

# 1. The data contains the book title, description, author names, publisher name and categories as text, the published date as a date, and Impact as a float.

# 2. The authors and categories columns hold list-like values.

# 3. Impact is on a scale of 1000 and is right-skewed.

# 4. The published date format is inconsistent.

# 5. There are null values in description, authors and publishedDate. The rows with nulls are dropped here, although the missing values could be imputed instead.


                                                Title  \
0                      Its Only Art If Its Well Hung!   
1                            Dr. Seuss: American Icon   
2               Wonderful Worship in Smaller Churches   
3                       Whispers of the Wicked Saints   
4   The Church of Christ: A Biblical Ecclesiology ...   
5                            Saint Hyacinth of Poland   
6   Rising Sons and Daughters: Life Among Japan's ...   
7   Muslim Women's Choices: Religious Belief and S...   
8                         Dramatica for Screenwriters   
9    Mensa Number Puzzles (Mensa Word Games for Kids)   
10  Vector Quantization and Signal Compression (Th...   
11                               A husband for Kutani   
12  The Ultimate Guide to Law School Admission: In...   
13  The Repeal of Reticence: A History of America'...   
14  Overcoming Hypertension (Dr. Kenneth H. Cooper...   
15                                   Alaska Sourdough   
16  The Oxford Handbook of Free

In [4]:
# Pre Processing of Data
# general_preprocessing is not defined in this notebook; presumably it comes from
# the `from ML_Utils import *` star import — TODO confirm.
# NOTE(review): every later cell reads `data`, not `data_preprocessed`. That is
# only correct if the helper keeps a reference to `data` and modifies it in
# place — confirm against ML_Utils.

data_preprocessed = general_preprocessing(data)
# The warning printed under this cell shows the helper removing "[", "]" and "'"
# characters from each listed column via chained str.replace calls.
data_preprocessed.removesquarebracket(columns = ['authors', 'categories'])
# presumably normalises the inconsistent publishedDate values — TODO confirm
data_preprocessed.date_processing(columns = ['publishedDate'])
# NOTE(review): the return value is discarded, so rows are only dropped if this
# method works in place — confirm.
data_preprocessed.dropna()
# presumably scales the Impact target and persists the fitted scaler (a
# scaler.pkl is loaded in the final cell) — TODO confirm
data_preprocessed.scaled(columns_to_scale = ['Impact'])


  self.data[col] = self.data[col].str.replace("[", "").str.replace("]", "").str.replace("'", "")


In [5]:
# Build the combined feature matrix from the text columns plus publishedDate.
# The vectorizer is constructed and applied in one expression so that
# X_combined only ever refers to the resulting feature matrix.
# NOTE(review): the meaning of the 64 argument is defined in TextVectorizer,
# which is not visible here — confirm.
X_combined = TextVectorizer(64).fit_transform(
    data,
    text_columns = ['Title', 'description', 'authors', 'publisher', 'categories'],
    numerical_column = ['publishedDate'],
)

In [7]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    data['Impact'],
    test_size=0.2,
    random_state=42,
)


In [None]:

# Train a baseline XGBoost regressor (default hyperparameters) and track it in MLflow.

# Relative paths used below (the save_model target) resolve against this directory.
os.chdir("/data1")

# Tracking server that stores the run logs (the real URI was removed for privacy).
# If no tracking URI is set, MLflow creates a local folder in the working
# directory, which needs extra configuration before the MLflow web server can
# show it, so it is best to keep the original URI.
mlflow.set_tracking_uri("...")

# Select the experiment (it is created if it does not exist yet).
mlflow.set_experiment('Book_Impact_Prediction_1')

# Let MLflow record parameters, training metrics and the fitted model automatically.
mlflow.autolog()

# Close any run left active by an earlier, interrupted cell so the new run can
# start as a top-level run; this is a no-op when no run is active.
mlflow.end_run()

# The run name is passed to start_run() directly. Calling mlflow.set_tag() before
# a run exists makes MLflow open an implicit, otherwise empty parent run, and the
# actual training run then only appears as an unnamed nested child of it.
with mlflow.start_run(run_name="Book_Impact_Prediction_1",
                      description="Running Xgboost regressor First version"):

    # Fit the regressor with its default settings.
    xgb_regressor = XGBRegressor()
    xgb_regressor.fit(X_train, y_train)

    # Predict once and reuse the result for both metrics and the model signature.
    predicted_values = xgb_regressor.predict(X_test)

    mse = mean_squared_error(y_test, predicted_values)
    print("Mean Squared Error:", mse)

    mape = mean_absolute_percentage_error(y_test, predicted_values)
    print("Mean Absolute Percentage Error:", mape)

    # Log the hold-out metrics; the model-selection cell later reads the MAPE
    # back under the key "metrics.Mean Absolute Percentage Error".
    mlflow.log_metric("Mean Squared Error", mse)
    mlflow.log_metric("Mean Absolute Percentage Error", mape)

    signature = infer_signature(X_test, predicted_values)

    # Additionally save a copy of the model to local disk, relative to the
    # working directory set above.
    # NOTE(review): save_model appears to reject an existing non-empty target
    # directory, so re-running this cell may require removing it first — confirm.
    mlflow.sklearn.save_model(xgb_regressor, "xgb_regressor_path", signature = signature)


In [None]:

# Second tracked training run of the default XGBoost regressor.

# Relative paths used below (the save_model target) resolve against this directory.
os.chdir("/data1")

# Tracking server that stores the run logs (the real URI was removed for privacy).
# If no tracking URI is set, MLflow creates a local folder in the working
# directory, which needs extra configuration before the MLflow web server can
# show it, so it is best to keep the original URI.
mlflow.set_tracking_uri("...")

# Select the experiment (it is created if it does not exist yet).
mlflow.set_experiment('Book_Impact_Prediction_1')

# Let MLflow record parameters, training metrics and the fitted model automatically.
mlflow.autolog()

# Close any run left active by an earlier, interrupted cell so the new run can
# start as a top-level run; this is a no-op when no run is active.
mlflow.end_run()

# The run name is passed to start_run() directly. Calling mlflow.set_tag() before
# a run exists makes MLflow open an implicit, otherwise empty parent run, and the
# actual training run then only appears as an unnamed nested child of it.
with mlflow.start_run(run_name="Book_Impact_Prediction_1",
                      description="Running Xgboost regressor Run 2"):

    # Fit the regressor with its default settings.
    xgb_regressor = XGBRegressor()
    xgb_regressor.fit(X_train, y_train)

    # Predict once and reuse the result for both metrics and the model signature.
    predicted_values = xgb_regressor.predict(X_test)

    mse = mean_squared_error(y_test, predicted_values)
    print("Mean Squared Error:", mse)

    mape = mean_absolute_percentage_error(y_test, predicted_values)
    print("Mean Absolute Percentage Error:", mape)

    # Log the hold-out metrics; the model-selection cell later reads the MAPE
    # back under the key "metrics.Mean Absolute Percentage Error".
    mlflow.log_metric("Mean Squared Error", mse)
    mlflow.log_metric("Mean Absolute Percentage Error", mape)

    signature = infer_signature(X_test, predicted_values)

    # Additionally save a copy of the model to local disk, relative to the
    # working directory set above.
    # NOTE(review): save_model appears to reject an existing non-empty target
    # directory, so re-running this cell may require removing it first — confirm.
    mlflow.sklearn.save_model(xgb_regressor, "xgb_regressor_path_2", signature = signature)


In [None]:

# Tune XGBoost hyperparameters with a randomized search and track the result in MLflow.

# Relative paths used below (the save_model target) resolve against this directory.
os.chdir("/data1")

# Tracking server that stores the run logs (the real URI was removed for privacy).
# If no tracking URI is set, MLflow creates a local folder in the working
# directory, which needs extra configuration before the MLflow web server can
# show it, so it is best to keep the original URI.
mlflow.set_tracking_uri("...")

# Select the experiment (it is created if it does not exist yet).
mlflow.set_experiment('Book_Impact_Prediction_1')

# Let MLflow record parameters, training metrics and the fitted model automatically.
mlflow.autolog()

# Close any run left active by an earlier, interrupted cell so the new run can
# start as a top-level run; this is a no-op when no run is active.
mlflow.end_run()

# The run name is passed to start_run() directly. Calling mlflow.set_tag() before
# a run exists makes MLflow open an implicit, otherwise empty parent run, and the
# actual training run then only appears as an unnamed nested child of it.
with mlflow.start_run(run_name="Book_Impact_Prediction_1",
                      description="Running Xgboost regressor Model Hypertuned"):

    # Base estimator whose hyperparameters are searched.
    regressor = XGBRegressor()

    # Candidate values for each hyperparameter; the search samples combinations
    # from these lists.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.05, 0.1, 0.2],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2],
        'reg_alpha': [0, 0.1, 0.2],
        'reg_lambda': [0, 0.1, 0.2]
    }

    random_search = RandomizedSearchCV(
        regressor,
        param_distributions=param_grid,
        n_iter=10,          # number of parameter settings that are sampled
        cv=5,               # cross-validation folds
        scoring ='neg_mean_squared_error',
        n_jobs=3,
        random_state=42     # random seed for reproducibility
    )

    # Run the search on the training split only.
    random_search.fit(X_train, y_train)

    best_params = random_search.best_params_
    print("Best Hyperparameters:", best_params)

    # The best estimator found by the search is evaluated on the held-out test set.
    xgb_regressor = random_search.best_estimator_

    # Predict once and reuse the result for both metrics and the model signature.
    predicted_values = xgb_regressor.predict(X_test)

    mse = mean_squared_error(y_test, predicted_values)
    print("Mean Squared Error:", mse)

    mape = mean_absolute_percentage_error(y_test, predicted_values)
    print("Mean Absolute Percentage Error:", mape)

    # Log the hold-out metrics; the model-selection cell later reads the MAPE
    # back under the key "metrics.Mean Absolute Percentage Error".
    mlflow.log_metric("Mean Squared Error", mse)
    mlflow.log_metric("Mean Absolute Percentage Error", mape)

    signature = infer_signature(X_test, predicted_values)

    # Additionally save a copy of the tuned model to local disk, relative to the
    # working directory set above.
    # NOTE(review): save_model appears to reject an existing non-empty target
    # directory, so re-running this cell may require removing it first — confirm.
    mlflow.sklearn.save_model(xgb_regressor, "xgb_regressor_path_3", signature = signature)


In [6]:
# Get the best run of the experiment stored in MLflow.
# "Best" means the lowest Mean Absolute Percentage Error on the test set.

os.chdir("/data1")

mlflow.set_tracking_uri("...")
# I have removed URI for privacy concerns.

experiment_name = "Book_Impact_Prediction_1"
mape_column = "metrics.Mean Absolute Percentage Error"

# get_experiment_by_name returns None for an unknown name; fail with a clear
# message instead of an AttributeError on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(f"MLflow experiment {experiment_name!r} was not found")

# search_runs documents experiment_ids as a list of IDs.
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Runs that never logged the metric show up as NaN in this column; make sure at
# least one run can actually be ranked.
if mape_column not in runs.columns or not runs[mape_column].notna().any():
    raise ValueError(
        f"No run in experiment {experiment_name!r} has logged "
        "'Mean Absolute Percentage Error'"
    )

# Best (lowest error) first; runs without the metric are sorted to the end.
runs = runs.sort_values(mape_column, ascending=True).reset_index(drop=True)

# MAPE is an error measure, so the best run is the one with the MINIMUM value.
# (idxmax would select the worst run.) idxmin ignores NaN entries.
best_run = runs.loc[runs[mape_column].idxmin()]


In [7]:
# Load the "model" artifact stored under the selected run as a generic pyfunc model.
best_model = mlflow.pyfunc.load_model(
    model_uri=best_run["artifact_uri"] + "/model",
)

In [9]:
# Score the full feature matrix with the selected model.
# Kept under its own name so `predicted` only ever holds the final,
# inverse-transformed values.
# NOTE(review): presumably these predictions are on the scaled Impact target,
# since a scaler is inverted below — confirm.
predicted_scaled = best_model.predict(X_combined)

# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load scaler files that were produced by this project.
# `pickle` is imported explicitly in the notebook's import cell rather than
# relying on it leaking in through `from ML_Utils import *`.
with open('/data1/notebooks/H_Level/ML/Scaler Pickle/scaler.pkl', 'rb') as f:
    loaded_scaler = pickle.load(f)

# Use the loaded scaler to map the predictions back through its inverse
# transform. The DataFrame wrapper turns 1-D predictions into the single-column
# 2-D input the scaler expects.
predicted = loaded_scaler.inverse_transform(pd.DataFrame(predicted_scaled))
