In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# The train/test partitions are stored as separate CSV files: read the two
# feature tables first, then the two matching target tables.
X_train, X_test = pd.read_csv('dataset/X_train.csv'), pd.read_csv('dataset/X_test.csv')
y_train, y_test = pd.read_csv('dataset/y_train.csv'), pd.read_csv('dataset/y_test.csv')

In [3]:
# Sanity check: training features and target should have the same number of rows.
(X_train.shape, y_train.shape)

((486, 24), (486, 1))

In [4]:
# Sanity check: test features and target should have the same number of rows.
(X_test.shape, y_test.shape)

((163, 24), (163, 1))

#### Standardization - The process of transforming data features to have a mean of zero and a standard deviation of one

In [5]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### Standardize the target variable

In [6]:
# A dedicated scaler for the target, fitted on the training target only, so
# scaled predictions can be mapped back with scaler_y.inverse_transform.
scaler_y = StandardScaler().fit(y_train.values.reshape(-1, 1))
y_train_scaled = scaler_y.transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))

## Models Training

### 1. Linear Regression

In [7]:
# Initialize and train the model on standardized features and a standardized target
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train_scaled)

# Predictions (expressed in standardized target units)
y_pred_lr = lr_model.predict(X_test_scaled)

# Evaluation in standardized target units
mse_lr = mean_squared_error(y_test_scaled, y_pred_lr)
mae_lr = mean_absolute_error(y_test_scaled, y_pred_lr)
r2_lr = r2_score(y_test_scaled, y_pred_lr)

print(f"Linear Regression - MSE: {mse_lr}, MAE: {mae_lr}, R2: {r2_lr}")

# The other models in this notebook are scored against the unscaled y_test, so
# MSE/MAE above are not comparable with theirs. Map the predictions back to the
# original target units and report the error metrics on that scale as well.
# (R2 is unaffected by this rescaling, so it is not recomputed.)
y_pred_lr_original = scaler_y.inverse_transform(y_pred_lr)
mse_lr_original = mean_squared_error(y_test, y_pred_lr_original)
mae_lr_original = mean_absolute_error(y_test, y_pred_lr_original)

print(f"Linear Regression (original units) - MSE: {mse_lr_original}, MAE: {mae_lr_original}")

Linear Regression - MSE: 0.1774958123337091, MAE: 0.2674169876577753, R2: 0.8292886738610973


### 2. Decision Tree Regressor

In [8]:
# Initialize and train the model on the unscaled features and target.
# The target is passed as a 1-D array, the same way the random forest,
# gradient boosting and SVR cells in this notebook pass it.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train.values.ravel())

# Predictions
y_pred_dt = dt_model.predict(X_test)

# Evaluation (original target units)
mse_dt = mean_squared_error(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

print(f"Decision Tree Regression - MSE: {mse_dt}, MAE: {mae_dt}, R²: {r2_dt}")

Decision Tree Regression - MSE: 4.134969325153374, MAE: 1.165644171779141, R²: 0.6142350503880052


### 3. Random Forest Regressor

In [9]:
# Build and fit the forest in one step on the unscaled data; the fixed seed
# makes the run repeatable.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train.values.ravel())

# Predict on the held-out split
y_pred_rf = rf_model.predict(X_test)

# Score the predictions with each metric in turn (original target units)
mse_rf, mae_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Random Forest Regression - MSE: {mse_rf}, MAE: {mae_rf}, R²: {r2_rf}")

Random Forest Regression - MSE: 2.026892024539878, MAE: 0.8390184049079754, R²: 0.8109045465079532


### 4. Gradient Boosting Regressor

In [10]:
# Fit the boosted ensemble on the unscaled data with a fixed seed
gb_model = GradientBoostingRegressor(random_state=42).fit(
    X_train,
    y_train.values.ravel(),
)

# Held-out predictions
y_pred_gb = gb_model.predict(X_test)

# MSE, MAE and R² against the unscaled test target
mse_gb, mae_gb, r2_gb = [
    metric(y_test, y_pred_gb)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
]

print(f"Gradient Boosting Regression - MSE: {mse_gb}, MAE: {mae_gb}, R²: {r2_gb}")

Gradient Boosting Regression - MSE: 2.176110440833129, MAE: 0.8941053163563203, R²: 0.7969834674584944


### 5. Support Vector Regressor

In [11]:
# Fit the SVR on standardized features while keeping the target in its
# original units, so the metrics below are in original units too.
svr_model = SVR().fit(X_train_scaled, y_train.values.ravel())

# Held-out predictions from the standardized test features
y_pred_svr = svr_model.predict(X_test_scaled)

# MSE, MAE and R² against the unscaled test target
mse_svr, mae_svr, r2_svr = (
    metric(y_test, y_pred_svr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Support Vector Regression - MSE: {mse_svr}, MAE: {mae_svr}, R²: {r2_svr}")

Support Vector Regression - MSE: 3.4885535044804428, MAE: 1.0319264093525122, R²: 0.6745413179516806


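##### Linear Regression is the best model: it has the highest R-squared (R²) value (≈0.83, versus ≈0.81 for Random Forest), and R² is the one metric here that does not depend on the scale of the target. Its Mean Squared Error (MSE) and Mean Absolute Error (MAE) printed above (≈0.18 and ≈0.27) are in standardized target units, whereas the other models' MSE and MAE are in original units, so those error values must be converted to a common scale before they are compared.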

#### Saving the model and the scaler files for deployment

In [12]:
import pickle

In [13]:
# Persist the fitted linear-regression model. The context manager closes the
# file (flushing it to disk) even if pickling raises.
with open('deploy/log_reg_model.pkl', 'wb') as model_file:
    pickle.dump(lr_model, model_file)

In [14]:
# Persist the feature scaler next to the model. The context manager closes
# the file (flushing it to disk) even if pickling raises.
with open('deploy/log_reg_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [15]:
# Persist the target scaler, which is needed to convert predictions back to
# original units. The context manager closes the file even if pickling raises.
with open('deploy/log_reg_scaler_y.pkl', 'wb') as scaler_y_file:
    pickle.dump(scaler_y, scaler_y_file)

In [16]:
# Reload the saved model to check the round trip. pickle.load can execute
# arbitrary code, so only load files from a trusted source; the context
# manager makes sure the file handle is closed.
with open('deploy/log_reg_model.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [17]:
# Reload the saved feature scaler. pickle.load can execute arbitrary code, so
# only load files from a trusted source; the context manager makes sure the
# file handle is closed.
with open('deploy/log_reg_scaler.pkl', 'rb') as scaler_file:
    pickled_scaler = pickle.load(scaler_file)

In [18]:
# Reload the saved target scaler. pickle.load can execute arbitrary code, so
# only load files from a trusted source; the context manager makes sure the
# file handle is closed.
with open('deploy/log_reg_scaler_y.pkl', 'rb') as scaler_y_file:
    pickled_scaler_y = pickle.load(scaler_y_file)

In [19]:
# Run the test features through the reloaded scaler as well as the reloaded
# model, so this round-trip check exercises both saved artifacts rather than
# reusing features scaled by the in-memory scaler.
y_pred_lr_pickled = pickled_model.predict(pickled_scaler.transform(X_test))

In [20]:
# Score the reloaded model in standardized target units; the numbers should
# match the ones printed for the in-memory linear regression.
mse_lr_pickled, mae_lr_pickled, r2_lr_pickled = (
    metric(y_test_scaled, y_pred_lr_pickled)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Pickled Linear Regression - MSE: {mse_lr_pickled}, MAE: {mae_lr_pickled}, R2: {r2_lr_pickled}")

Pickled Linear Regression - MSE: 0.1774958123337091, MAE: 0.2674169876577753, R2: 0.8292886738610973


In [21]:
# A single hand-written sample with 24 feature values, shaped as one row.
# NOTE(review): the values are assumed to follow the training column order — confirm.
input_data = np.array(
    [0, 18, 0, 4, 4, 0, 4, 0, 0, 2, 2, 0, 0, 1, 0, 4, 3, 4, 1, 1, 3, 4, 0, 11]
).reshape(1, -1)

In [22]:
# Standardize the new data with the feature scaler reloaded from disk, so the
# saved deployment artifact is the one being exercised. The row is wrapped in
# a DataFrame carrying the training column names because the scaler was fitted
# on a DataFrame; recent scikit-learn versions warn when such a scaler is then
# given a bare array without feature names.
input_frame = pd.DataFrame(input_data, columns=X_train.columns)
input_data_scaled = pickled_scaler.transform(input_frame)



In [23]:
# Make predictions with the reloaded model. The model was fitted on the
# standardized target, so this value is in standardized units and still has
# to be inverse-transformed before it can be read in original units.
predictions_scaled = pickled_model.predict(input_data_scaled)
print(predictions_scaled)

[[-0.61963511]]


In [24]:
# Convert the predictions back to the original scale using the target scaler
# reloaded from disk, so the saved deployment artifact is the one being
# exercised rather than the in-memory scaler.
predictions_original = pickled_scaler_y.inverse_transform(predictions_scaled)
print(predictions_original)

[[9.97138726]]
