# Data exploration & preparation

In [51]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import (make_scorer, mean_squared_error, r2_score,
                             root_mean_squared_error)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the used-car listings from the local CSV and preview the first rows.
df = pd.read_csv("./datasets/used_cars_UK.csv")
df.head()

In [None]:
# Print a concise summary of the dataframe (number of entries, columns,
# non-null counts and dtypes).
df.info()

In [None]:
# Show the dtype pandas inferred for each raw column.
df.dtypes

In [55]:
# Normalise the column headers to snake_case so they are easy to reference.
column_renames = {
    "Price": "price",
    "Mileage(miles)": "mileage",
    "Registration_Year": "registration_year",
    "Previous Owners": "previous_owners",
    "Fuel type": "fuel_type",
    "Body type": "body_type",
    "Engine": "engine",
    "Gearbox": "gearbox",
    "Doors": "doors",
    "Seats": "seats",
    "Emission Class": "emission_class",
    "Service history": "service_history",
}
df.rename(columns=column_renames, inplace=True)

In [56]:
# Remove the columns that are not used as features
# ("Unnamed: 0" is presumably a saved row index -- confirm against the CSV).
df.drop(["Unnamed: 0", "title"], axis=1, inplace=True)

In [None]:
# Display the column names that remain after renaming and dropping.
df.columns

## Duplicates check

In [None]:
# (rows, columns) before any duplicate handling.
df.shape

In [None]:
# Rows that exactly repeat an earlier row (first occurrence is not flagged).
duplicates = df.loc[df.duplicated()]
print(f"{len(duplicates)} out of {len(df)} rows are duplicated")

In [None]:
# Actually remove the duplicated rows. ``drop_duplicates()`` returns a new
# dataframe, so calling it without keeping the result leaves ``df`` unchanged.
# ``ignore_index=True`` renumbers the rows 0..n-1 so later positional
# operations line up with the index.
df.drop_duplicates(inplace=True, ignore_index=True)

## Unique values for nominal/categorical data

In [None]:
# Re-check the dtypes to see which columns are still stored as objects.
df.dtypes

In [None]:
# List the distinct values of every object-typed column.
for column, column_dtype in df.dtypes.items():
    if column_dtype != "object":
        continue
    print(column, df[column].unique())
    print("---")

## Handle missing values

### Categorical variables

In [None]:
# 'engine' is stored as text containing 'L' (presumably a litre suffix such as
# "1.4L" -- confirm against the data); strip it and convert to float.
engine_size = df['engine'].str.replace('L', '', regex=False).astype(float)
# Assign the filled Series back rather than calling fillna(inplace=True) on
# df['engine']: the in-place form operates on an intermediate object, which
# pandas warns about and which leaves df untouched under Copy-on-Write.
df['engine'] = engine_size.fillna(engine_size.mean())
# Missing categories become an explicit 'Unknown' level.
df['emission_class'] = df['emission_class'].fillna('Unknown')
df['service_history'] = df['service_history'].fillna("Unknown")

### Interval/ratio variables

In [64]:
# Replace missing values in the numeric columns with that column's median.
median_filled_columns = [
    "mileage",
    "registration_year",
    "previous_owners",
    "engine",
    "doors",
    "seats",
]
df.fillna(
    {column: df[column].median() for column in median_filled_columns},
    inplace=True,
)

## One-hot encoding for categorical/nominal values

In [None]:
# Object-typed columns are the ones that still need encoding.
print("Categorical values:", list(df.select_dtypes(include=['object']).columns))

In [66]:
# One-hot encode the categorical columns. The list must be passed as
# ``columns=``: the second positional parameter of ``pd.get_dummies`` is
# ``prefix``, so a positional list only works while it happens to match the
# object-typed columns in both number and order.
df = pd.get_dummies(
    df,
    columns=['fuel_type', 'body_type', 'gearbox', 'emission_class', 'service_history'],
)

In [None]:
# Preview the dataframe after one-hot encoding.
df.head()

# Scale (numerical) data

In [68]:
# Split the columns into numeric ones (to be scaled) and everything else
# (the one-hot indicator columns), keeping the dataframe's column order.
numerical_columns = df.select_dtypes(include='number').columns
categorical_columns = list(df.select_dtypes(exclude='number').columns)

In [69]:
# Rescale every numeric column with min-max scaling.
# NOTE(review): the scaler is fitted on the full dataframe before the
# train/test split, and `numerical_columns` still contains 'price'; consider
# fitting on the training rows only and excluding the target.
scaler = MinMaxScaler().fit(df[numerical_columns])
scaled_numerical_data = scaler.transform(df[numerical_columns])
scaled_numerical_df = pd.DataFrame(
    scaled_numerical_data,
    columns=numerical_columns,
)

In [70]:
# Put the scaled numeric columns next to the one-hot columns. The index of the
# one-hot part is reset so both frames align on a fresh 0..n-1 index.
processed_df = pd.concat(
    [
        scaled_numerical_df,
        df[categorical_columns].reset_index(drop=True),
    ],
    axis="columns",
)
processed_df_np = processed_df.to_numpy()

# Split into train & test sets

In [71]:
# Features: every processed column except the (scaled) price.
# Target: the original, unscaled price from `df`.
features = processed_df.drop(columns='price').values
target = df['price'].values

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Lasso regression

In [None]:
# Negated MSE, so that "higher is better" holds for the grid search.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Hyperparameter candidates explored by the grid search
param_grid = dict(
    alpha=[0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10, 100],  # regularisation strength
    max_iter=[1000, 5000, 10000],                      # iteration cap
    tol=[1e-4, 1e-3, 1e-2],                            # optimisation tolerance
)

# Exhaustive 5-fold cross-validated search using all available cores
lasso = Lasso()
grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the estimator refitted with the winning combination
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

## Evaluation

In [None]:
# Evaluate the tuned model on the held-out test set: RMSE and R2 score
y_test_pred = best_model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)
print("RMSE:", round(rmse, 2))
print("R-squared (R2):", round(r2, 2))

In [None]:
plt.figure(figsize=(8, 6))

# Colour each point by the size of its absolute prediction error
errors = np.abs(y_test - y_test_pred)
scatter = plt.scatter(
    y_test,
    y_test_pred,
    c=errors,
    cmap='coolwarm',
    alpha=0.7,
    edgecolors='k',
)
plt.colorbar(scatter, label='Prediction Error')

# Reference diagonal: where the points would fall if predictions were perfect
lowest, highest = y_test.min(), y_test.max()
plt.plot([lowest, highest], [lowest, highest], 'r--', lw=2, label='Ideal Fit')

# Axis labels, title, legend and grid
plt.xlabel("True Values (y_test)")
plt.ylabel("Predicted Values (y_test_pred)")
plt.title("True vs Predicted Values")
plt.legend()
plt.grid(True)
plt.show()