# In [None]:
#### ML model creation for regression tasks ###
# Works with LinearRegression, RandomForest and XGBoost
# Sriram Parthasarathy
# License: MIT

'''
Regression predicts continuous numerical outcomes by analyzing relationships between dependent and independent variables.
When to use: Use when the target variable is quantitative (e.g., price, temperature, sales).
Examples:

- Predicting house prices based on size, location, and amenities.

- Forecasting stock market trends using historical data.

- Estimating patient recovery time based on treatment and health metrics.

Additional Reading:
Please refer to my articles on Medium for more details:

The Shopping Cart Abandonment Problem: How Machine Learning Can Help!
https://medium.com/managing-digital-products/the-shopping-cart-abandonment-problem-how-machine-learning-can-help-eb690f1dc4f6?source=your_stories_page--------------------------------------------

How to Measure & Optimise Your Predictive Model for Prime Time?
https://medium.com/managing-digital-products/how-to-measure-optimise-your-predictive-model-for-prime-time-3b9f6072f85c?source=your_stories_page--------------------------------------------

Increasing The Accuracy of Predictive Models with Stacked Ensemble Techniques: Healthcare Example
https://medium.com/managing-digital-products/increasing-the-accuracy-of-predictive-model-with-stacked-ensemble-techniques-a-healthcare-example-135d36b9a2b7?source=your_stories_page--------------------------------------------

AI Powered Automatic Classification: The Challenges in Managing Data in Clinical Trials
https://medium.com/managing-digital-products/ai-powered-automatic-classification-the-challenges-in-managing-data-in-clinical-trials-6639e7aa1a7d?source=your_stories_page--------------------------------------------

How Do You Measure If Your Customer Churn Predictive Model Is Good?
https://medium.com/data-science/how-do-you-measure-if-your-customer-churn-predictive-model-is-good-187a49a9eee3?source=your_stories_page--------------------------------------------


Practical Data Augmentation Techniques for Predictive Models
https://medium.com/hackernoon/practical-data-augmentation-techniques-for-predictive-models-b51599253c30?source=your_stories_page--------------------------------------------

Machine Learning for Product Managers: Defining the business problem
https://medium.com/managing-digital-products/machine-learning-for-product-managers-defining-the-business-problem-f0e968d09ee7?source=your_stories_page--------------------------------------------


'''

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# ----------------------------
# STEP 1: Create sample dataset
# ----------------------------
# For illustration, this uses a small sample dataset because a real customer dataset can't be shared.
# Replace it with your own data.


# Sample records, one tuple per employee, in the column order given by _columns.
# The last column (productivity_score) is the regression target.
_columns = ('education', 'years_experience', 'salary', 'productivity_score')
_rows = [
    ('Bachelors', 1, 30000.0, 60.5),
    ('Masters', 3, 50000.0, 70.0),
    ('PhD', 5, 70000.0, 85.2),
    ('Bachelors', 2, 35000.0, 65.0),
    ('Masters', 7, 90000.0, 90.1),
]

# Transpose the row records into a {column name: list of values} mapping,
# which is the layout the DataFrame is built from.
data = {name: list(values) for name, values in zip(_columns, zip(*_rows))}
df = pd.DataFrame(data=data)

# ----------------------------
# STEP 2: Define features and target
# ----------------------------
# Name the target column once, then separate it from the input features.
target_col = 'productivity_score'
y = df[target_col]                  # values the model learns to predict
X = df.drop(columns=[target_col])   # every remaining column is a feature

# ----------------------------
# STEP 3: Preprocessing pipeline
# ----------------------------
# Categorical and numerical columns
# Column groups; each group is routed to its own transformer below.
categorical_cols = ['education']
numeric_cols = ['years_experience', 'salary']

# For RandomForest and XGBoost the numeric scaler can potentially be skipped.
# Alternative preprocessor for those models: one-hot encode the categorical
# columns and pass the numeric columns through unchanged.
# preprocessor = ColumnTransformer([
#     ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
# ], remainder='passthrough')  # keep numeric columns as-is

# For linear regression: one-hot encode the categorical columns and
# standardize the numeric columns.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
numeric_step = ('num', StandardScaler(), numeric_cols)
preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])


# ----------------------------
# STEP 4: Build and train model
# ----------------------------
# Final pipeline: preprocessing + regression
# The estimator is kept in its own variable so it is easy to swap out.
regressor = LinearRegression()

# Two named steps: 'preprocess' transforms the features, 'regressor' fits on the result.
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('regressor', regressor),
])

# To try another model, swap the 'regressor' step above, e.g.:
#   RandomForestRegressor()  (from sklearn.ensemble import RandomForestRegressor)
#   XGBRegressor()           (from xgboost import XGBRegressor)

# Split dataset
# Hold out 20% of the rows for evaluation; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the fitted pipeline, so training and prediction are chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# MSE first, then its square root (RMSE), which is in the target's own units.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
#print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
#print("R^2 Score:", r2_score(y_test, y_pred))


# Mean Absolute Error (MAE)
# What it means: Average of the absolute differences between predicted and actual values.

# Mean Squared Error (MSE)
# What it means: Average of the squared differences between predicted and actual values, so larger errors get more weight than in MAE.

# Root Mean Squared Error (RMSE)
# What it means: Square root of MSE. Easier to interpret in original units.

# Example (from a different problem, predicting hotel occupancy):
# if the RMSE (Root Mean Squared Error) is 5, the model's predictions
# are typically off by about 5 people compared to the actual number
# of people staying in the hotel.
# 🔹 In simple terms:
# If your model predicts 100 guests on a day,
# the actual number will typically be somewhere around 95 to 105.

# Example output:
# Mean Squared Error: 0.8414983660921077
# Root Mean Squared Error: 0.9173322005097759
