In [None]:
import os

# The notebook must run from the project root; the project imports that follow
# are expected to resolve from that directory.
# Set the PROJECT_ROOT environment variable to run on another machine; when it
# is unset, the original hard-coded location is used, so existing setups are
# unaffected.
PROJECT_ROOT = os.environ.get('PROJECT_ROOT', '/Users/rv/Projects/7CS074')
os.chdir(PROJECT_ROOT)  # Change to the project root directory

# Now we can import the modules
import src.preprocessing as preprocessing
import src.visualisation as visualisation
import src.features as features
import src.models as models

import global_vars 

In [None]:
# Run the project's preprocessing step before the cleaned dataset is loaded.
# NOTE(review): presumably this leaves already-existing output files untouched
# when preprocessing has been done before — confirm in src.preprocessing.
preprocessing.process_raw_multiple_data_files()

In [None]:
import pandas as pd

# Guard: stop immediately when the cleaned CSV is not present on disk.
dataset_is_present = os.path.exists(global_vars.DATASET_CLEAN_FILE_PATH)
if not dataset_is_present:
    raise FileNotFoundError(f"Dataset not found at {global_vars.DATASET_CLEAN_FILE_PATH}. Please ensure the dataset is placed correctly.")

# Comma-delimited file, always parsed with pandas' Python engine.
csv_read_options = {'sep': ',', 'engine': 'python'}
df = pd.read_csv(global_vars.DATASET_CLEAN_FILE_PATH, **csv_read_options)

# Guard: an empty frame cannot be used for model selection.
if df.empty:
    raise ValueError("Loaded dataset is empty. Please check the dataset file.")

print(f"Data loaded successfully from {global_vars.DATASET_CLEAN_FILE_PATH}.")

# Model selection on the loaded frame; both returned objects are consumed by
# the cells that follow.
# NOTE(review): type_models is 'classification' although the target is 'price'
# and later cells report MAE/RMSE/R2 — confirm this is the intended mode.
target_col = 'price'
selection_options = {
    'target_col': target_col,
    'type_models': 'classification',
    'min_samples_per_make': 300,
    'cv_splits': 5,
}
all_models, trained_models_per_make = models.automatic_make_model_selection(df, **selection_options)

In [None]:
# NOTE(review): regression_metrics is not referenced in this cell, and the other
# project modules are imported as src.* — confirm this import path is correct.
from evaluation import regression_metrics

# Print one summary line (MAE, RMSE, R2) for every entry of all_models.
# Be aware that `metrics` stays bound to the last entry's scores once the loop
# has finished; it is not a per-make value outside this loop.
for make_vehicle, data in all_models.items():
    metrics = data['metrics']
    summary_line = f"Make: {make_vehicle}, MAE: {metrics['MAE']}, RMSE: {metrics['RMSE']}, R2: {metrics['R2']}"
    print(summary_line)

In [None]:
# Per-make diagnostics: a feature-effect chart and an actual-vs-predicted plot
# for every entry of trained_models_per_make.
for make_vehicle, data in trained_models_per_make.items():
    model = data["model"]
    y_test = data['y_test']
    predictions = data['predictions']

    # `kind` names the type of effect returned and is used in the chart title.
    values, kind = features.get_feature_effects(model)

    # Trim the names and the effect values to at most the number of input
    # features the fitted model reports via n_features_in_.
    n = model.n_features_in_
    feature_names = data["feature_names"][:n]
    values = values[:n]

    visualisation.plot_feature_importances(
        values,
        feature_names,
        f"{kind.title()}s - For {make_vehicle}"
    )

    # Use the R2 that belongs to THIS make. Previously the value came from a
    # `metrics` variable left over from an earlier loop, so every plot showed
    # the score of whichever make happened to be iterated last.
    # Assumes all_models is keyed by the same make names — a KeyError here
    # means the two dictionaries are out of sync.
    make_r2 = all_models[make_vehicle]['metrics']['R2']
    visualisation.plot_actual_vs_predicted(
        y_test,
        predictions,
        f'Random Forest Regressor - For {make_vehicle}',
        make_r2
    )

# This plot takes the whole dataset and does not depend on the make, so it is
# drawn once instead of being repeated identically on every loop iteration.
visualisation.plot_price_vs_mpg(df)