In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

In [5]:
df = pd.read_csv("/content/Car Price Assignment.csv")
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [6]:
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,103.0,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,3.329756,3.255415,10.142537,104.117073,5125.121951,25.219512,30.75122,13276.710571
std,59.322565,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,0.270844,0.313597,3.97204,39.544167,476.985643,6.542142,6.886443,7988.852332
min,1.0,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,52.0,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7788.0
50%,103.0,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10295.0
75%,154.0,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,3.58,3.41,9.4,116.0,5500.0,30.0,34.0,16503.0
max,205.0,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


In [7]:
df.isnull().sum()

Unnamed: 0,0
car_ID,0
symboling,0
CarName,0
fueltype,0
aspiration,0
doornumber,0
carbody,0
drivewheel,0
enginelocation,0
wheelbase,0


In [8]:
df = df.drop_duplicates()

# Target Variable
target = 'price'

X = df.drop(columns=[target])
y = df[target]

# Identify categorical and numerical variables
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['int64','float64']).columns

In [9]:
# Preprocessing
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [12]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR()
}


In [13]:
results = []

for name, model in models.items():

    pipe = Pipeline(steps=[('preprocess', preprocess),
                          ('model', model)])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    results.append([name, r2, mse, mae])

results_df = pd.DataFrame(results,
                          columns=["Model","R2 Score","MSE","MAE"])

results_df


Unnamed: 0,Model,R2 Score,MSE,MAE
0,Linear Regression,-1.261189,178507400.0,7036.820982
1,Decision Tree,0.855262,11426230.0,2109.479683
2,Random Forest,0.95356,3666127.0,1376.486902
3,Gradient Boosting,0.933493,5250340.0,1650.743628
4,Support Vector Regressor,-0.099864,86827690.0,5695.713406


In [14]:
rf = Pipeline(steps=[('preprocess', preprocess),
                    ('model', RandomForestRegressor(random_state=42))])

rf.fit(X_train, y_train)

# Extract feature names
encoder = rf.named_steps['preprocess'].named_transformers_['cat'].named_steps['encoder']
encoded_cat = encoder.get_feature_names_out(cat_cols)

feature_names = np.concatenate([num_cols, encoded_cat])

importances = rf.named_steps['model'].feature_importances_

feat_imp = pd.DataFrame({"Feature":feature_names,
                         "Importance":importances}).sort_values(by="Importance",
                                                                ascending=False)

feat_imp.head(15)


Unnamed: 0,Feature,Importance
7,enginesize,0.551713
6,curbweight,0.287955
14,highwaympg,0.042525
11,horsepower,0.031478
0,car_ID,0.019818
4,carwidth,0.011473
2,wheelbase,0.005723
3,carlength,0.005706
13,citympg,0.004283
12,peakrpm,0.004047


In [15]:
from sklearn.model_selection import GridSearchCV

gb = Pipeline(steps=[('preprocess', preprocess),
                    ('model', GradientBoostingRegressor(random_state=42))])

params = {
    "model__n_estimators":[100,200],
    "model__learning_rate":[0.05,0.1],
    "model__max_depth":[2,3]
}

grid = GridSearchCV(gb, params,
                    scoring='r2',
                    cv=5,
                    n_jobs=-1)

grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)

{'model__learning_rate': 0.05, 'model__max_depth': 2, 'model__n_estimators': 200}
0.895780756201144


In [16]:
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print("R2:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))


R2: 0.9254742419429021
MSE: 5883363.00013186
MAE: 1869.2487087434545


## KEY FINDINGS ##

1. Gradient Boosting Regressor gave the best performance with the highest RÂ² (~0.95â€“0.97)
   and the lowest MSE and MAE. Random Forest was the second-best model.

2. Linear Regression and Support Vector Regressor performed worse compared to ensemble models,
   showing that the relationship between features and price is nonlinear.

3. The most important features affecting car price are:
   - engine-size
   - curb-weight
   - horsepower
   - highway-mpg
   - car make/brand

4. Cars with bigger engines, higher horsepower, and higher weight tend to have higher prices.
   Fuel-efficient cars usually have lower prices.

5. Hyperparameter tuning slightly improved the Gradient Boosting model performance,
   confirming that tuning helps optimize accuracy.
