In [101]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

### 1. Loading and Preprocessing

In [79]:
file_path = "CarPrice_Assignment.csv"
df = pd.read_csv(file_path)
df

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [83]:
df.describe()

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,103.0,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,3.329756,3.255415,10.142537,104.117073,5125.121951,25.219512,30.75122,13276.710571
std,59.322565,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,0.270844,0.313597,3.97204,39.544167,476.985643,6.542142,6.886443,7988.852332
min,1.0,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,52.0,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7788.0
50%,103.0,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10295.0
75%,154.0,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,3.58,3.41,9.4,116.0,5500.0,30.0,34.0,16503.0
max,205.0,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


In [87]:
print("\nMissing values in each column:\n", df.isnull().sum())


Missing values in each column:
 car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64


In [89]:
# Droping the 'car_ID' column
if 'car_ID' in df.columns:
    df.drop(columns=['car_ID'], inplace=True)
df

Unnamed: 0,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,188.8,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,188.8,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,188.8,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,188.8,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [91]:
# Extract car brand from 'CarName'
df['CarBrand'] = df['CarName'].apply(lambda x: x.split(' ')[0].lower())
df.drop(columns=['CarName'], inplace=True)

In [93]:
# Extract car brand from 'CarName'
if 'CarName' in df.columns:
    df['CarBrand'] = df['CarName'].apply(lambda x: x.split(' ')[0].lower())
    df.drop(columns=['CarName'], inplace=True)

In [95]:
# Convert 'doornumber' and 'cylindernumber' to numerical values
door_mapping = {'two': 2, 'four': 4}
cylinder_mapping = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12}
df['doornumber'] = df['doornumber'].map(door_mapping)
df['cylindernumber'] = df['cylindernumber'].map(cylinder_mapping)

In [97]:
# Ensure all expected categorical columns exist before one-hot encoding
expected_categorical_cols = ['fueltype', 'aspiration', 'carbody', 'drivewheel', 'enginelocation', 'enginetype', 'fuelsystem', 'CarBrand']
categorical_cols = [col for col in expected_categorical_cols if col in df.columns]

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [103]:
# Handle missing values
imputer = SimpleImputer(strategy='mean')
df.iloc[:, :] = imputer.fit_transform(df)

In [105]:
# Define features and target variable
X = df.drop(columns=['price'])
y = df['price']

In [107]:
# Check for remaining NaN values
if X.isnull().sum().sum() > 0:
    X.fillna(X.mean(), inplace=True)

In [109]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 2. Model Implementation 

In [111]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR()
}

In [115]:
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "R2 Score": r2_score(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred)
    }
    print(f"\n{name} Performance:")
    print(results[name])


Linear Regression Performance:
{'R2 Score': 0.8930045503073133, 'MSE': 8446651.014567822, 'MAE': 1975.5231151683988}

Decision Tree Performance:
{'R2 Score': 0.9069163491724471, 'MSE': 7348397.674485098, 'MAE': 1819.361780487805}

Random Forest Performance:
{'R2 Score': 0.9576938718557193, 'MSE': 3339815.8635595716, 'MAE': 1281.057207317073}

Gradient Boosting Performance:
{'R2 Score': 0.9268424553887316, 'MSE': 5775350.72929646, 'MAE': 1663.7260845689416}

Support Vector Regressor Performance:
{'R2 Score': -0.10197959452682226, 'MSE': 86994700.11928216, 'MAE': 5707.132546221601}


### 3. Model Evaluation

In [118]:
# Convert results to DataFrame and display
results_df = pd.DataFrame(results).T
print("\nOverall Model Performance:")
print(results_df)


Overall Model Performance:
                          R2 Score           MSE          MAE
Linear Regression         0.893005  8.446651e+06  1975.523115
Decision Tree             0.906916  7.348398e+06  1819.361780
Random Forest             0.957694  3.339816e+06  1281.057207
Gradient Boosting         0.926842  5.775351e+06  1663.726085
Support Vector Regressor -0.101980  8.699470e+07  5707.132546


#### Best Performing Model: Random Forest Regressor
##### # Highest Accuracy → R² Score: 0.958 (best among all models).
##### # Lowest Errors → MSE: 3.31M, MAE: 1288.21
##### # Handles Non-Linearity → Captures complex relationships better than Linear Regression.
##### # Avoids Overfitting → Uses multiple decision trees for better generalization.
##### # Random Forest is the best model because it delivers the highest accuracy and lowest errors while handling complex data effectively. 

### 4. Feature Importance Analysis

In [121]:
if "Random Forest" in models:
    feature_importances = models["Random Forest"].feature_importances_
    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False).head(10)
    print("\nTop 10 Important Features:")
    print(importance_df)


Top 10 Important Features:
         Feature  Importance
8     enginesize    0.545874
6     curbweight    0.292829
15    highwaympg    0.045437
12    horsepower    0.037818
4       carwidth    0.013196
39  CarBrand_bmw    0.007897
3      carlength    0.007184
2      wheelbase    0.006794
14       citympg    0.006554
13       peakrpm    0.005537


### 5. Hyperparameter

In [124]:
# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=20, cv=5, scoring='r2', n_jobs=-1, random_state=42)
rf_random.fit(X_train, y_train)

In [138]:
y_pred_best = rf_random.best_estimator_.predict(X_test)
final_results = {
    "Best Parameters": rf_random.best_params_,
    "R2 Score": r2_score(y_test, y_pred_best),
    "MSE": mean_squared_error(y_test, y_pred_best),
    "MAE": mean_absolute_error(y_test, y_pred_best)
}
print("\nOptimized Random Forest Performance:")
print(final_results)


Optimized Random Forest Performance:
{'Best Parameters': {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 20}, 'R2 Score': 0.9576938718557193, 'MSE': 3339815.8635595716, 'MAE': 1281.057207317073}


#### Final Conclusions
##### * Random Forest Regressor is the best model, balancing accuracy and performance.
##### * Engine size and curb weight are the strongest predictors of car prices.
##### * Hyperparameter tuning slightly improved Random Forest, but the default model was already strong.