# Used Cars Price Prediction

## Overview
This notebook outlines the process of building a predictive model for used car prices. The model leverages various machine learning techniques to estimate the price based on features such as brand, model, mileage, fuel type, and more. This is intended for submission to a Kaggle competition.

## 1. Importing Libraries
We begin by importing necessary libraries for data manipulation, visualization, and machine learning.


In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import category_encoders as ce
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.simplefilter(action='ignore')
plt.style.use('seaborn')

# Data Loading
## Load the training and testing datasets.

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df_test = pd.read_csv('test.csv')

# Exploring the Data
## Check the first few rows and the summary statistics of the training data to understand its structure.

In [4]:
df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


# Data Preprocessing
## Handling Missing Values

In [5]:
df.describe()

Unnamed: 0,id,model_year,milage,price
count,188533.0,188533.0,188533.0,188533.0
mean,94266.0,2015.829998,65705.295174,43878.02
std,54424.933488,5.660967,49798.158076,78819.52
min,0.0,1974.0,100.0,2000.0
25%,47133.0,2013.0,24115.0,17000.0
50%,94266.0,2017.0,57785.0,30825.0
75%,141399.0,2020.0,95400.0,49900.0
max,188532.0,2024.0,405000.0,2954083.0


## Train Data PreProcessing 

In [6]:
df['clean_title'].fillna('No', inplace=True)
df['fuel_type'].replace('–', 'Gasoline', inplace=True)
df['fuel_type'].fillna('Electric', inplace=True)
df['accident'].fillna('Undefined', inplace=True)
df.loc[df['brand'] == 'Tesla', 'fuel_type'] = 'Electric'

## Test Data PreProcessing

In [7]:
df_test['clean_title'].fillna('No', inplace=True)
df_test['fuel_type'].replace('–', 'Gasoline', inplace=True)
df_test['fuel_type'].fillna('Electric', inplace=True)
df_test['accident'].fillna('Undefined', inplace=True)
df_test.loc[df_test['brand'] == 'Tesla', 'fuel_type'] = 'Electric'

# Feature Engineering
## Create new features that could help improve model accuracy. For example, calculating the age of the car based on the model year.

In [8]:
df['age'] = 2024 - df['model_year'].replace(0, 1)

df_test['age'] = 2024 - df_test['model_year'].replace(0, 1)

In [9]:
df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,age
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200,17
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999,22
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900,22
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000,7
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500,3


# Extracting Engine Features
## Use regex to extract horsepower, displacement, and cylinder count from the engine specification string.

In [10]:
import re

def extract_engine_features(engine_str):
    features = {
        'horsepower': None,
        'displacement': None,
        'cylinders': None
    }

    
    hp_match = re.search(r'(\d+\.?\d*)HP', engine_str,re.IGNORECASE)
    if hp_match:
        features['horsepower'] = float(hp_match.group(1))

    
    disp_match = re.search(r'(\d+\.?\d*)L', engine_str,re.IGNORECASE)
    if disp_match:
        features['displacement'] = float(disp_match.group(1))

    
    cyl_match = re.search(r'(\d+)\s*Cylinder?', engine_str, re.IGNORECASE)
    if cyl_match:
        features['cylinders'] = int(cyl_match.group(1))
    else:
        cyl_match = re.search(r'(\d+)V', engine_str)
        if cyl_match:
            features['cylinders'] = int(cyl_match.group(1))

    return features

In [11]:
extracted_features = df['engine'].apply(extract_engine_features)
df_features = pd.DataFrame(extracted_features.tolist(), index=df.index)
df = pd.concat([df, df_features], axis=1)

In [12]:
extracted_features = df_test['engine'].apply(extract_engine_features)
df_features = pd.DataFrame(extracted_features.tolist(), index=df_test.index)
df_test = pd.concat([df_test, df_features], axis=1)

In [13]:
df['cylinders'] = df['cylinders'].fillna(0.0)
df['displacement']= df['displacement'].fillna(0.0)
df['horsepower']= df['horsepower'].fillna(0.0)

In [14]:
df_test['cylinders'] = df_test['cylinders'].fillna(0.0)
df_test['displacement']= df_test['displacement'].fillna(0.0)
df_test['horsepower']= df_test['horsepower'].fillna(0.0)

# Categorizing Transmission Types
## Create a new categorical variable for the type of transmission, simplifying the diverse values into broader categories.

In [15]:
df['transmission'].unique()

array(['A/T', 'Transmission w/Dual Shift Mode', '7-Speed A/T',
       '8-Speed A/T', '10-Speed Automatic', '1-Speed A/T', '6-Speed A/T',
       '10-Speed A/T', '9-Speed A/T', '8-Speed Automatic',
       '9-Speed Automatic', '5-Speed A/T', 'Automatic',
       '7-Speed Automatic with Auto-Shift', 'CVT Transmission',
       '5-Speed M/T', 'M/T', '6-Speed M/T', '6-Speed Automatic',
       '4-Speed Automatic', '7-Speed M/T', '2-Speed A/T',
       '1-Speed Automatic', 'Automatic CVT', '4-Speed A/T',
       '6-Speed Manual', 'Transmission Overdrive Switch',
       '8-Speed Automatic with Auto-Shift', '7-Speed Manual',
       '7-Speed Automatic', '9-Speed Automatic with Auto-Shift',
       '6-Speed Automatic with Auto-Shift',
       '6-Speed Electronically Controlled Automatic with O', 'F', 'CVT-F',
       '8-Speed Manual', 'Manual', '–', '2', '6 Speed At/Mt',
       '5-Speed Automatic', '2-Speed Automatic', '8-SPEED A/T', '7-Speed',
       'Variable', 'Single-Speed Fixed Gear', '8-SPEED AT',


In [16]:
def categorize_transmission(transmission):
    transmission = transmission.lower()

    if 'cvT' in transmission:
        return 'CVT'
    elif 'dct' in transmission or 'dual-clutch' in transmission:
        return 'Dual-Clutch Automatic'
    elif 'a/t' in transmission:
        return 'Standard Automatic'
    elif 'manual' in transmission or 'm/t' in transmission or 'mt' in transmission:
        if 'automated' in transmission:
            return 'Automated Manual'
        return 'Standard Manual'
    elif 'variable' in transmission:
        return 'Variable Transmission'
    elif 'fixed gear' in transmission or 'single-speed' in transmission:
        return 'Fixed Gear'
    else:
        return 'Other'


In [17]:
df['transmission_category'] = df['transmission'].apply(categorize_transmission)
df_test['transmission_category'] = df_test['transmission'].apply(categorize_transmission)

# Dropping Unnecessary Features
## Remove features that are no longer needed for the modeling process.

In [18]:
df.drop(columns=['transmission','engine','int_col','ext_col'],inplace=True,axis=1)
df_test.drop(columns=['transmission','engine','int_col','ext_col'],inplace=True,axis=1)

# Encoding Categorical Values
## Convert categorical variables into numerical formats that can be used in the model.

In [19]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
import category_encoders as ce
be = ce.BinaryEncoder(cols=['accident', 'clean_title'])
encoded_accident_clean_title = be.fit_transform(df[['accident', 'clean_title']])
encoded_accident_clean_title_test = be.fit_transform(df_test[['accident', 'clean_title']])

ohe = OneHotEncoder(sparse=False, drop='first')
fuel_encoded = pd.DataFrame(ohe.fit_transform(df[['fuel_type']]), columns=ohe.get_feature_names_out(['fuel_type']))
fuel_encoded_test = pd.DataFrame(ohe.fit_transform(df_test[['fuel_type']]), columns=ohe.get_feature_names_out(['fuel_type']))


label_encoders = {
    'brand': LabelEncoder(),
    'model': LabelEncoder(),
    'transmission_category': LabelEncoder(),
}

for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])
    df_test[col] = encoder.fit_transform(df_test[col])


final_df = pd.concat([df.drop(columns=['accident', 'clean_title', 'fuel_type']), 
                       encoded_accident_clean_title, 
                       fuel_encoded], axis=1)

final_df_test = pd.concat([df_test.drop(columns=['accident', 'clean_title', 'fuel_type']), 
                       encoded_accident_clean_title_test, 
                       fuel_encoded_test], axis=1)


X = final_df.drop(columns=['price']) 
y = final_df['price']


X.head()

Unnamed: 0,id,brand,model,model_year,milage,age,horsepower,displacement,cylinders,transmission_category,accident_0,accident_1,clean_title_0,clean_title_1,fuel_type_E85 Flex Fuel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid,fuel_type_Plug-In Hybrid,fuel_type_not supported
0,0,31,495,2007,213000,17,172.0,1.6,4.0,3,0,1,0,1,0.0,0.0,1.0,0.0,0.0,0.0
1,1,28,930,2002,143250,22,252.0,3.9,8.0,3,1,0,0,1,0.0,0.0,1.0,0.0,0.0,0.0
2,2,9,1575,2002,136731,22,320.0,5.3,8.0,3,0,1,0,1,1.0,0.0,0.0,0.0,0.0,0.0
3,3,16,758,2017,19500,7,420.0,5.0,8.0,2,0,1,0,1,0.0,0.0,1.0,0.0,0.0,0.0
4,4,36,1077,2021,7388,3,208.0,2.0,4.0,3,0,1,0,1,0.0,0.0,1.0,0.0,0.0,0.0


# Splitting the Data
## Separate the data into training and validation sets for model training and evaluation.

In [20]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Selection

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


X = final_df.drop(columns=['price'])
y = final_df['price']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


y_pred = model.predict(X_val)

# Evaluating the model
mae = mean_absolute_error(y_val, y_pred)
rmse = mean_squared_error(y_val, y_pred, squared=False)
r2 = r2_score(y_val, y_pred)

print(f"MAE: {mae}, RMSE: {rmse}, R²: {r2}")

# Making Predictions on Test Data

In [21]:
df_test_price =  model.predict(final_df_test)

NameError: name 'model' is not defined

In [None]:
df_test_price

In [None]:
df_test_price = model.predict(final_df_test)
submission = pd.DataFrame({
    'id': final_df_test['id'],
    'price': df_test_price
})
submission.to_csv('submission.csv', index=False)