# In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load Data
df = pd.read_csv('train.csv')

# Data Preprocessing
# Results are assigned back to the column rather than calling an inplace
# method on `df[col]`: the chained form operates on a temporary Series, which
# triggers a FutureWarning on recent pandas 2.x and leaves `df` untouched once
# Copy-on-Write is active.
df['clean_title'] = df['clean_title'].fillna('No')
# The '–' placeholder is mapped to 'Gasoline' first; anything still missing
# afterwards is labelled 'Electric'.
df['fuel_type'] = df['fuel_type'].replace('–', 'Gasoline').fillna('Electric')
df['accident'] = df['accident'].fillna('Undefined')
# Every Tesla row is forced to 'Electric', overriding whatever was recorded.
df.loc[df['brand'] == 'Tesla', 'fuel_type'] = 'Electric'

# Feature Engineering
# Vehicle age relative to the hard-coded reference year 2024. A model_year of
# 0 is swapped for 1 before subtracting.
# NOTE(review): if the intent was to avoid a zero *age*, the replace should be
# applied to the difference instead — confirm before changing.
df['age'] = 2024 - df['model_year'].replace(0, 1)

# Extract engine features
# Patterns are compiled once at import time instead of on every row.
# Horsepower: a number directly (or after whitespace) followed by "HP".
_HP_RE = re.compile(r'(\d+(?:\.\d+)?)\s*HP\b', re.IGNORECASE)
# Displacement: a number followed by "L", "Liter(s)" or "Litre(s)". The
# trailing word boundary stops trim codes such as "LS3" from matching.
_DISPLACEMENT_RE = re.compile(
    r'(\d+(?:\.\d+)?)\s*L(?:iters?|itres?)?\b', re.IGNORECASE
)
# Explicit cylinder count, e.g. "4 Cylinder".
_CYLINDER_COUNT_RE = re.compile(r'(\d+)\s*Cylinder', re.IGNORECASE)
# Layout designation, e.g. "V6", "V-8", "I4", "H6", "W12". The letter comes
# BEFORE the digits; "24V" is a valve count and must not match.
_CYLINDER_LAYOUT_RE = re.compile(r'\b[VIHW]-?(\d{1,2})\b')


def extract_engine_features(engine_str):
    """Parse horsepower, displacement and cylinder count from an engine string.

    Args:
        engine_str: Free-text engine description, e.g.
            ``"172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel"`` or
            ``"3.5L V6 24V PDI DOHC Twin Turbo"``. Non-string values (such as
            a NaN from a missing cell) are accepted and yield no features.

    Returns:
        dict with keys ``'horsepower'`` (float), ``'displacement'`` (float,
        litres) and ``'cylinders'`` (int). A key maps to ``None`` when the
        corresponding value could not be found.
    """
    features = {'horsepower': None, 'displacement': None, 'cylinders': None}

    # Missing cells arrive as float NaN; re.search would raise TypeError.
    if not isinstance(engine_str, str):
        return features

    hp_match = _HP_RE.search(engine_str)
    if hp_match:
        features['horsepower'] = float(hp_match.group(1))

    disp_match = _DISPLACEMENT_RE.search(engine_str)
    if disp_match:
        features['displacement'] = float(disp_match.group(1))

    # Prefer an explicit "N Cylinder" count; fall back to the layout code.
    cyl_match = (
        _CYLINDER_COUNT_RE.search(engine_str)
        or _CYLINDER_LAYOUT_RE.search(engine_str)
    )
    if cyl_match:
        features['cylinders'] = int(cyl_match.group(1))

    return features

# Parse every engine description into a dict of features, expand those dicts
# into their own columns (aligned on df's index) and append them to df.
extracted_features = df['engine'].map(extract_engine_features)
df_features = pd.DataFrame(list(extracted_features), index=df.index)
df = pd.concat((df, df_features), axis=1)

# Engine attributes that could not be parsed are recorded as 0.
engine_feature_columns = ['cylinders', 'displacement', 'horsepower']
df[engine_feature_columns] = df[engine_feature_columns].fillna(0)

# Transmission categorization
# Ordered (keywords, category) rules: the first rule with any keyword
# contained in the lower-cased description wins, so more specific types
# (CVT, dual-clutch) are checked before generic automatic/manual.
_TRANSMISSION_RULES = (
    (('cvt',), 'CVT'),
    (('dct', 'dual-clutch'), 'Dual-Clutch Automatic'),
    # 'auto' also covers the spelled-out 'automatic'.
    (('a/t', 'auto'), 'Standard Automatic'),
    (('manual', 'm/t', 'mt'), 'Standard Manual'),
    (('variable',), 'Variable Transmission'),
    (('fixed gear', 'single-speed'), 'Fixed Gear'),
)


def categorize_transmission(transmission):
    """Map a free-text transmission description to a coarse category.

    Args:
        transmission: Transmission description such as ``"8-Speed A/T"`` or
            ``"6-Speed Manual"``. Matching is case-insensitive. Non-string
            values (``None``, NaN) are accepted.

    Returns:
        One of ``'CVT'``, ``'Dual-Clutch Automatic'``, ``'Standard Automatic'``,
        ``'Standard Manual'``, ``'Variable Transmission'``, ``'Fixed Gear'``
        or ``'Other'`` (no rule matched, or the input was not a string).
    """
    # Missing cells arrive as float NaN, which has no .lower().
    if not isinstance(transmission, str):
        return 'Other'

    lowered = transmission.lower()
    for keywords, category in _TRANSMISSION_RULES:
        if any(keyword in lowered for keyword in keywords):
            return category
    return 'Other'

df['transmission_category'] = df['transmission'].apply(categorize_transmission)

# Drop unnecessary features
# The raw transmission/engine text has been replaced by derived columns.
df.drop(columns=['transmission', 'engine', 'int_col', 'ext_col'], inplace=True)

# Encoding Categorical Values
# One-Hot Encoding for categorical features; drop_first removes one level per
# feature so the dummy columns are not perfectly collinear.
df = pd.get_dummies(df, columns=['brand', 'model', 'fuel_type', 'transmission_category', 'clean_title', 'accident'], drop_first=True)

# Final Dataset Preparation
X = df.drop(columns=['price'])
y = df['price']

# Train/Test Split
# Split BEFORE scaling so that validation rows never influence the scaler.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
# Fit the scaler on the training split only and reuse its statistics for the
# validation split; fitting on the full dataset leaks validation information
# into training. `X` itself is left as the unscaled DataFrame.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# List of models to evaluate
models = {
    'Linear Regression': LinearRegression(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR()
}

# Function to evaluate models
def evaluate_model(model, X_train, X_val, y_train, y_val):
    """Fit ``model`` on the training split and score it on the validation split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_train: Training features.
        X_val: Validation features.
        y_train: Training targets.
        y_val: Validation targets.

    Returns:
        Tuple ``(mae, rmse, r2)`` computed on the validation split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn
    # 1.4 and removed in 1.6. Taking the square root of the MSE gives the same
    # value for a single target and works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
    r2 = r2_score(y_val, y_pred)
    return mae, rmse, r2

# Fit and score every candidate on the same split, keyed by display name.
metric_names = ('MAE', 'RMSE', 'R²')
results = {}
for name, model in models.items():
    scores = evaluate_model(model, X_train, X_val, y_train, y_val)
    results[name] = dict(zip(metric_names, scores))

# Report one line per model.
for name, metrics in results.items():
    print(f"{name} - MAE: {metrics['MAE']}, RMSE: {metrics['RMSE']}, R²: {metrics['R²']}")
