# Used Car Price Prediction

This notebook covers data cleaning, feature engineering, and modeling to predict the price of used cars.

### Data Cleaning and Preparation

In [None]:

import pandas as pd

# Load the raw listings.
vehicles_df = pd.read_csv('/mnt/data/vehicles.csv')

# Upper bound on price; rows above it are treated as extreme outliers.
MAX_PRICE = 200000

# Drop the 'id' and 'VIN' columns, which are not used as features.
vehicles_df_cleaned = vehicles_df.drop(columns=['id', 'VIN'])

# Keep only rows with a usable target: price present and strictly positive.
# .copy() gives an independent frame so the fills below act on
# vehicles_df_cleaned itself rather than on a view/copy of the filtered data.
has_valid_price = vehicles_df_cleaned['price'].notna() & (vehicles_df_cleaned['price'] > 0)
vehicles_df_cleaned = vehicles_df_cleaned[has_valid_price].copy()

# Columns imputed with their median (numeric) or most frequent value (categorical).
numeric_columns = ['year', 'odometer']
categorical_columns = ['manufacturer', 'fuel', 'transmission', 'drive', 'paint_color', 'title_status', 'type', 'condition']

# Compute one fill value per column, then fill and assign the result back.
# The previous `df[col].fillna(..., inplace=True)` form is chained assignment
# through an inplace method: pandas warns about it, and with Copy-on-Write
# enabled the frame is never updated, so the missing values would remain.
fill_values = {col: vehicles_df_cleaned[col].median() for col in numeric_columns}
fill_values.update({col: vehicles_df_cleaned[col].mode()[0] for col in categorical_columns})
vehicles_df_cleaned = vehicles_df_cleaned.fillna(value=fill_values)

# Remove extreme outliers (applied after imputation, as before).
vehicles_df_cleaned = vehicles_df_cleaned[vehicles_df_cleaned['price'] <= MAX_PRICE]

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the listing price; the feature matrix is every other column
# except 'model', 'cylinders' and 'size', one-hot encoded with the first
# level of each categorical dropped.
# NOTE(review): get_dummies encodes every remaining non-numeric column, not
# only the ones imputed earlier -- confirm no unintended columns remain.
y = vehicles_df_cleaned['price']
X = pd.get_dummies(
    vehicles_df_cleaned.drop(columns=['price', 'model', 'cylinders', 'size']),
    drop_first=True,
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
    

### Modeling

In [None]:

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Baseline: ordinary least squares on the scaled training features.
linear_model = LinearRegression().fit(X_train_scaled, y_train)

# Ridge: choose the regularisation strength by 5-fold cross-validated grid search.
ridge_params = {'alpha': [0.1, 1, 10, 100]}
ridge_model = Ridge()
ridge_grid = GridSearchCV(estimator=ridge_model, param_grid=ridge_params, cv=5)
ridge_grid.fit(X_train_scaled, y_train)

# Lasso: same candidate alphas and the same 5-fold search.
lasso_params = {'alpha': [0.1, 1, 10, 100]}
lasso_model = Lasso()
lasso_grid = GridSearchCV(estimator=lasso_model, param_grid=lasso_params, cv=5)
lasso_grid.fit(X_train_scaled, y_train)

# Predict on the held-out test split; the grid searches contribute their
# best estimator.
linear_predictions = linear_model.predict(X_test_scaled)
ridge_predictions = ridge_grid.best_estimator_.predict(X_test_scaled)
lasso_predictions = lasso_grid.best_estimator_.predict(X_test_scaled)

# Score each set of predictions against the true test prices.
linear_r2, ridge_r2, lasso_r2 = (
    r2_score(y_test, predicted)
    for predicted in (linear_predictions, ridge_predictions, lasso_predictions)
)
linear_rmse, ridge_rmse, lasso_rmse = (
    np.sqrt(mean_squared_error(y_test, predicted))
    for predicted in (linear_predictions, ridge_predictions, lasso_predictions)
)

# Collect the metrics and the selected hyper-parameters into one table.
model_names = ['Linear Regression', 'Ridge Regression', 'Lasso Regression']
r2_values = [linear_r2, ridge_r2, lasso_r2]
rmse_values = [linear_rmse, ridge_rmse, lasso_rmse]
chosen_params = ['N/A', ridge_grid.best_params_, lasso_grid.best_params_]

model_results = pd.DataFrame({
    'Model': model_names,
    'R-squared': r2_values,
    'RMSE': rmse_values,
    'Best Params': chosen_params,
})

# Last expression of the cell: rendered as the results table.
model_results
    