In [None]:
import os

# Directory that contains the `src` package. The location can be overridden
# without editing the notebook by setting the PROJECT_ROOT environment
# variable; when it is unset, the original hard-coded path is used.
PROJECT_ROOT = os.environ.get('PROJECT_ROOT', '/Users/rv/Projects/7CS074')
os.chdir(PROJECT_ROOT)  # Change to the project root directory

# These imports are deliberately placed after the chdir above: the `src`
# package is expected to be importable from the project root.
import src.preprocessing as preprocessing
import src.visualisation as visualisation

import src.models as models

In [None]:
# Run the preprocessing step from src.preprocessing before any data is loaded.
# Per the original note, re-running this when the processed files already exist
# does not overwrite them -- NOTE(review): confirm against src.preprocessing.
preprocessing.process_raw_multiple_data_files()

In [None]:
import pandas as pd

# Fail early with a clear message if the cleaned dataset has not been produced.
if not os.path.exists(preprocessing.DATASET_CLEAN_FILE_PATH):
    raise FileNotFoundError(f"Dataset not found at {preprocessing.DATASET_CLEAN_FILE_PATH}. Please ensure the dataset is placed correctly.")

# Read the cleaned CSV with an explicit comma delimiter and the python parser engine.
df = pd.read_csv(preprocessing.DATASET_CLEAN_FILE_PATH, sep=',', engine='python')
if df.empty:
    raise ValueError("Loaded dataset is empty. Please check the dataset file.")

print(f"Data loaded successfully from {preprocessing.DATASET_CLEAN_FILE_PATH}.")

# Features and target are selected explicitly by column name (not by position).
feature_cols = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
target_col = 'price'

# Verify the required columns exist before handing the frame to the model
# code, so a schema problem is reported here rather than deep inside training.
missing_cols = [col for col in [*feature_cols, target_col] if col not in df.columns]
if missing_cols:
    raise ValueError(f"Dataset is missing required column(s): {missing_cols}")

# Disabled experiment, kept for reference: one-hot encoding of the categorical columns.
# categorical_cols = ['make', 'model', 'transmission', 'fuelType']
# df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Returns two mappings keyed by vehicle make; later cells read the 'metrics'
# entry of `all_models` and the 'model' / 'X_test' / 'y_test' / 'predictions' /
# 'metrics' entries of `trained_models_per_make`.
# NOTE(review): min_samples_per_make=300 presumably excludes makes with fewer
# rows than that -- confirm in src.models.
all_models, trained_models_per_make = models.automatic_make_model_selection(
    df,
    feature_cols,
    target_col,
    min_samples_per_make=300
)

In [None]:
# NOTE(review): the other project modules are imported as `src.<name>`, and
# `regression_metrics` is not used in this cell -- confirm this import path
# resolves from the project root and whether the import is still needed.
from evaluation import regression_metrics

# Print one summary line of error metrics per vehicle make.
for make_vehicle, make_entry in all_models.items():
    metrics = make_entry['metrics']
    print(f"Make: {make_vehicle}, MAE: {metrics['MAE']}, RMSE: {metrics['RMSE']}, R2: {metrics['R2']}")

In [None]:
# Draw the per-make diagnostic plots from the stored training results.
for make_vehicle, algorithm_data in trained_models_per_make.items():
    model = algorithm_data['model']
    Y_test = algorithm_data['y_test']
    X_test = algorithm_data['X_test']  # not used by the plots below; kept for inspection

    predictions = algorithm_data['predictions']
    metrics = algorithm_data['metrics']

    visualisation.plot_feature_importances(model, feature_cols, f'Random Forest Regressor Feature Importances - For {make_vehicle}')
    visualisation.plot_actual_vs_predicted(Y_test, predictions, f'Random Forest Regressor - For {make_vehicle}', metrics['R2'])

# This plot receives the whole, unfiltered DataFrame, so it does not depend on
# the make being iterated: draw it once instead of repeating the identical
# figure for every make.
visualisation.plot_price_vs_mpg(df)