# Dimensionality reduction using PLS, training and testing

### Importing libraries

In [None]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor

### Reading data

In [7]:
# Directory (relative path) holding the preprocessed parquet splits, and the
# name of the regression target column.
PREPROCESSED_DATA_PATH = "../../../data/preprocessed/"
TARGET = 'copiesSold'

train_df = pd.read_parquet(PREPROCESSED_DATA_PATH + "train.parquet")
val_df = pd.read_parquet(PREPROCESSED_DATA_PATH + "validation.parquet")
small_test_df = pd.read_parquet(PREPROCESSED_DATA_PATH + "test.parquet")

# The validation and test splits are stacked row-wise into the single frame
# that is used as the test set below.
test_df = pd.concat([val_df, small_test_df])

# Separate the target column from the feature columns for each split.
y_train, X_train = train_df[TARGET], train_df.drop(columns=TARGET)
y_test, X_test = test_df[TARGET], test_df.drop(columns=TARGET)

# Standardise the features; the scaler is fitted on the training features only
# and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
# Project the scaled features onto 15 PLS components fitted against the target.
pls = PLSRegression(n_components=15)
# fit_transform is called with y, and its result is indexed with [0] to keep
# the transformed X (the X scores) only.
X_train_pls = pls.fit_transform(X_train_scaled, y_train)[0]
X_test_pls = pls.transform(X_test_scaled)

# Train RandomForest on PLS components
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_pls, y_train)

y_pred = rf.predict(X_test_pls)
# The square root of the mean squared error is the RMSE, so the value is named
# and reported as RMSE (it was previously labelled "MSE").
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE with PLS + RF: {rmse:.4f}")


MSE with PLS + RF: 2.0353


In [9]:
# Score the random forest on the data it was trained on, so the result can be
# compared with the test-set metrics.
y_predd = rf.predict(X_train_pls)
# np.sqrt of the MSE is the RMSE; label it as a training-set RMSE so it cannot
# be confused with the test-set figure.
train_rmse = np.sqrt(mean_squared_error(y_train, y_predd))
print(f"Train RMSE with PLS + RF: {train_rmse:.4f}")
r2 = r2_score(y_train, y_predd)
print(f"Train R^2: {r2:.4f}")

MSE with PLS + RF: 0.7490
Train R^2: 0.9426


In [10]:
# Recompute the random forest's test predictions here instead of reading the
# shared `y_pred` name: other cells in this notebook rebind `y_pred` to other
# models' predictions, so re-running this cell after them would otherwise
# report the wrong model's R^2.
r2 = r2_score(y_test, rf.predict(X_test_pls))
print(f"Test R^2: {r2:.4f}")

Test R^2: 0.5784


In [11]:
# Train an XGBoost regressor on the PLS components
xgb = XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
xgb.fit(X_train_pls, y_train)

# Predict on test set
y_pred = xgb.predict(X_test_pls)

# Evaluate
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report each metric exactly once (the RMSE line used to be printed twice).
print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")
print(f"Test R^2: {r2:.4f}")

Test RMSE: 2.0307
Test RMSE: 2.0307
Test MAE: 1.5919
Test R^2: 0.5804


In [13]:
# Fit a gradient-boosted tree ensemble on the PLS components.
gbr = GradientBoostingRegressor(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
).fit(X_train_pls, y_train)

# Predictions for the test set.
y_pred = gbr.predict(X_test_pls)

# Test-set metrics: RMSE (square root of the MSE), MAE and R^2.
rmse, mae, r2 = (
    np.sqrt(mean_squared_error(y_test, y_pred)),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

print(f"Test RMSE with PLS + GradientBoostingRegressor: {rmse:.4f}")
print(f"Test MAE with PLS + GradientBoostingRegressor: {mae:.4f}")
print(f"Test R2 score with PLS + GradientBoostingRegressor: {r2:.4f}")


Test RMSE with PLS + GradientBoostingRegressor: 2.0531
Test MAE with PLS + GradientBoostingRegressor: 1.6101
Test R2 score with PLS + GradientBoostingRegressor: 0.5710
