In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
import joblib
import optuna
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load data
df = pd.read_csv('../data/raw/cv_dataset.csv')
embeddings = np.load('../data/embeddings.npy')
ratings = df['Rating'].values

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, ratings, test_size=0.2, random_state=42
)

In [14]:
def train_baseline_models():
    """Train and evaluate multiple regression models"""
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'Support Vector': SVR(),
        'XGBoost': XGBRegressor()
    }
    
    best_score = -np.inf
    best_model = None
    best_name = ""
    
    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        
        # Evaluate
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        print(f"{name} Results:")
        print(f"- MSE: {mse:.4f}")
        print(f"- MAE: {mae:.4f}")
        print(f"- R²: {r2:.4f}")
        
        if r2 > best_score:
            best_score = r2
            best_model = model
            best_name = name
    
    print(f"\nBest model: {best_name} with R²: {best_score:.4f}")
    return best_model

In [15]:
if __name__ == "__main__":

    best_model = train_baseline_models()
    
    # Save the best model
    joblib.dump(best_model, '../data/model/regression_model.pkl')
    print("Model saved")
    
    # Final evaluation
    y_pred = best_model.predict(X_test)
    print("\nFinal Model Evaluation:")
    print(f"- MSE: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"- MAE: {mean_absolute_error(y_test, y_pred):.4f}")
    print(f"- R²: {r2_score(y_test, y_pred):.4f}")


Training Linear Regression...
Linear Regression Results:
- MSE: 0.3160
- MAE: 0.4292
- R²: 0.8821

Training Random Forest...
Random Forest Results:
- MSE: 0.9491
- MAE: 0.7575
- R²: 0.6460

Training Support Vector...
Support Vector Results:
- MSE: 0.5065
- MAE: 0.5337
- R²: 0.8111

Training XGBoost...
XGBoost Results:
- MSE: 0.7619
- MAE: 0.6702
- R²: 0.7158

Best model: Linear Regression with R²: 0.8821
Model saved

Final Model Evaluation:
- MSE: 0.3160
- MAE: 0.4292
- R²: 0.8821
