# Feature Engineering and Linear Regression - Tennis Demand

#### Table of Contents
1. [Imports](#imports)
2. [Data Import and Feature Engineering](#data-import-and-feature-engineering)
3. [MLFlow Experiments](#mlflow-experiments)

#### Imports

In [1]:
import mlflow  # ML experiment tracking
import pandas as pd  # Data manipulation

# Sklearn
from sklearn.model_selection import train_test_split  # Data splitting
from imblearn.over_sampling import SMOTE  # Handle class imbalance

from sklearn.linear_model import Ridge  # Ridge regression model
from sklearn.preprocessing import StandardScaler  # Feature scaling

In [2]:
# Local Modules

import os
import sys

# Make the project's ../src directory importable. The membership check keeps
# the cell idempotent: re-running it does not pile duplicate entries onto
# sys.path. `sys` is used directly rather than through `os.sys`, which only
# works because the os module happens to import sys internally.
if "../src" not in sys.path:
    sys.path.append("../src")

# NOTE(review): the star import hides which names come from `process`;
# presumably `evaluate_regression_model` (used later) is one of them - confirm
# and consider importing it explicitly.
from process import *

#### Data Import and Feature Engineering

In [3]:
# Import data

# Location of the merged tennis CSV, relative to the notebook's directory.
data_path = "../data/processed/tennis_merged.csv"

# Read with pandas defaults; the preview below shows the first CSV column
# arriving as "Unnamed: 0".
df_tennis = pd.read_csv(filepath_or_buffer=data_path)

In [4]:
# Peek at the first five rows to sanity-check the columns that were loaded.
df_tennis.head(n=5)

Unnamed: 0,product_price,product_original_price,product_star_rating,product_num_ratings,product_minimum_offer_price,is_prime,climate_pledge_friendly,has_variations,coupon_discount,sales_volume
0,44.52,80.0,4.6,709,44.52,1.0,0.0,1.0,0.0,429
1,42.5,75.0,4.7,2028,42.5,1.0,0.0,1.0,0.0,117
2,38.5,70.0,4.5,502,38.5,1.0,0.0,1.0,0.0,185
3,30.69,50.0,4.5,40996,30.69,1.0,0.0,1.0,0.0,110
4,60.0,80.0,4.6,397,60.0,1.0,0.0,1.0,0.0,144


In [6]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
df_tennis_modified = df_tennis.copy(deep=True)

In [8]:
# Add four derived columns, each computed from the original columns only:
#   sum_ratings            = number of ratings * star rating
#   absolute_discount      = price - minimum offer price
#   product_value          = price * star rating
#   diversity_satisfaction = star rating * has_variations flag
# NOTE(review): in the five rows previewed above, product_price equals
# product_minimum_offer_price, so absolute_discount is 0 there - confirm this
# is the intended pair of columns.
df_tennis_modified = df_tennis_modified.assign(
    sum_ratings=lambda frame: frame["product_num_ratings"] * frame["product_star_rating"],
    absolute_discount=lambda frame: frame["product_price"] - frame["product_minimum_offer_price"],
    product_value=lambda frame: frame["product_price"] * frame["product_star_rating"],
    diversity_satisfaction=lambda frame: frame["product_star_rating"] * frame["has_variations"],
)

In [9]:
# Build the modelling matrices from the feature-engineered frame. The original
# cell read from `df_tennis`, so the engineered columns never reached the model.
# "Unnamed: 0" is the row index saved into the CSV (0, 1, 2, ... in the
# preview); it carries no product information, so it is dropped as well.
# errors="ignore" keeps the cell working if that column is absent.
SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the same splits

X = df_tennis_modified.drop(columns=["sales_volume", "Unnamed: 0"], errors="ignore")
y = df_tennis_modified[["sales_volume"]]

# 80 % train+validation / 20 % test, then 66 % / 34 % of the former for
# train / validation.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_raw, y_train_raw, train_size=0.66, random_state=SPLIT_SEED
)

#### MLFlow Experiments

In [12]:
# Address of the MLflow tracking server, shared by the fluent API and the
# explicit client so both talk to the same place.
tracking_server_uri = "http://127.0.0.1:8000"

mlflow.set_tracking_uri(tracking_server_uri)
amazon_mlflow_client = mlflow.MlflowClient(tracking_uri=tracking_server_uri)

In [13]:
# Identifiers reused by the logging cell further down.
run_name = "linear-regression-with-feature-engineering"
linear_models_experiment_id = "779742125521330833"

# Tag recording which dataset version this run refers to.
dataset_tags = {"Dataset": "Amazon Tennis Demand - V2 (Feature Engineering)"}

# Register a run in the linear-models experiment through the client.
# NOTE(review): the recorded outputs show this run's id differs from the run
# that mlflow.start_run later logs to, so this one appears to stay empty in
# RUNNING state - consider capturing the returned run and reusing its run_id.
amazon_mlflow_client.create_run(
    experiment_id=linear_models_experiment_id,
    tags=dataset_tags,
    run_name=run_name,
)

<Run: data=<RunData: metrics={}, params={}, tags={'Dataset': 'Amazon Tennis Demand - V2 (Feature Engineering)',
 'mlflow.runName': 'linear-regression-with-feature-engineering'}>, info=<RunInfo: artifact_uri='mlflow-artifacts:/779742125521330833/b7a767b699d548efb6beba2262173f2d/artifacts', end_time=None, experiment_id='779742125521330833', lifecycle_stage='active', run_id='b7a767b699d548efb6beba2262173f2d', run_name='linear-regression-with-feature-engineering', run_uuid='b7a767b699d548efb6beba2262173f2d', start_time=1728582308969, status='RUNNING', user_id='unknown'>, inputs=<RunInputs: dataset_inputs=[]>>

In [17]:
# Hyper-parameters are kept in a dict so the same values can be logged to MLflow.
ridge_params = dict(alpha=0.5)

# Fit a Ridge regressor on the (unscaled) training split.
linreg = Ridge(**ridge_params)
linreg.fit(X=X_train, y=y_train)

In [18]:
# Predictions of the unscaled Ridge model on the validation split.
y_pred = linreg.predict(X=X_val)

In [23]:
# Score the validation predictions with the project helper.
# NOTE(review): the helper's return value is later passed to
# mlflow.log_metrics, so it is presumably a dict of numeric metrics - confirm.
linreg_performance = evaluate_regression_model(
    y_true=y_val,
    y_pred=y_pred,
    model=linreg,
)

In [28]:
# Log the hyper-parameters, validation metrics and fitted model of the unscaled
# Ridge regression under `run_name` in the linear-models experiment.
# NOTE(review): the recorded output shows start_run opening a run whose id
# differs from the one returned by the earlier create_run call.
with mlflow.start_run(
    experiment_id=linear_models_experiment_id,
    run_name=run_name,
) as run:
    mlflow.log_params(ridge_params)
    mlflow.log_metrics(linreg_performance)
    mlflow.sklearn.log_model(
        sk_model=linreg,
        # A few representative rows are enough as an input example; passing
        # the whole validation split would store all of it with the model.
        input_example=X_val.head(5),
        artifact_path="ridge-regression-with-feature-engineering",
    )

2024/10/10 12:56:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run linear-regression-with-feature-engineering at: http://127.0.0.1:8000/#/experiments/779742125521330833/runs/a8c2d8e8796f46d88f2682ec66407888.
2024/10/10 12:56:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8000/#/experiments/779742125521330833.


What if we standardize the data?

In [30]:
# Fit the scaler on the training split only, then apply the same
# transformation to both the training and validation splits.
scaler = StandardScaler().fit(X_train)

X_train_std = scaler.transform(X_train)
X_val_std = scaler.transform(X_val)

In [32]:
# Same Ridge hyper-parameters as before, trained on the standardized features.
ridge_reg = Ridge(**ridge_params).fit(X_train_std, y_train)

# Validation predictions of the standardized model (this rebinds `y_pred`).
y_pred = ridge_reg.predict(X=X_val_std)


In [34]:
# Evaluate performance

# Score the standardized model's validation predictions with the project helper.
ridge_reg_performance = evaluate_regression_model(
    y_true=y_val,
    y_pred=y_pred,
    model=ridge_reg,
)

In [35]:
# Run name for the standardized-features experiment, reused by the logging cell.
std_run_name = "ridge-std-feature-engineering"

# Register the run through the client; no tags are attached to this one.
amazon_mlflow_client.create_run(
    run_name=std_run_name,
    experiment_id=linear_models_experiment_id,
)

<Run: data=<RunData: metrics={}, params={}, tags={'mlflow.runName': 'ridge-std-feature-engineering'}>, info=<RunInfo: artifact_uri='mlflow-artifacts:/779742125521330833/acd03802903f40ba929391f7c442e41f/artifacts', end_time=None, experiment_id='779742125521330833', lifecycle_stage='active', run_id='acd03802903f40ba929391f7c442e41f', run_name='ridge-std-feature-engineering', run_uuid='acd03802903f40ba929391f7c442e41f', start_time=1728583541421, status='RUNNING', user_id='unknown'>, inputs=<RunInputs: dataset_inputs=[]>>

In [36]:
# Log the hyper-parameters, validation metrics and fitted model of the Ridge
# regression trained on standardized features.
# NOTE(review): only the regressor is logged, not the fitted scaler, so the
# stored model presumably expects already-standardized input - confirm, or log
# a scaler + Ridge pipeline instead.
with mlflow.start_run(
    experiment_id=linear_models_experiment_id,
    run_name=std_run_name,
) as run:
    mlflow.log_params(ridge_params)
    mlflow.log_metrics(ridge_reg_performance)
    mlflow.sklearn.log_model(
        sk_model=ridge_reg,
        # A few standardized rows are enough as an input example; passing the
        # whole validation array would store all of it with the model.
        input_example=X_val_std[:5],
        artifact_path="ridge-with-standar-scaler-and-feature-engineering",
    )

2024/10/10 13:08:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run ridge-std-feature-engineering at: http://127.0.0.1:8000/#/experiments/779742125521330833/runs/d444ea8e4032433385b7bf3d692e326e.
2024/10/10 13:08:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8000/#/experiments/779742125521330833.


14:49:34.566 | [31mERROR[0m   | prefect.server.services.telemetry - [31mFailed[0m to send telemetry:
Shutting down telemetry service...
