Car Price Prediction - ML Project
A comprehensive analysis for predicting car prices in the American market

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

1. Loading and Preprocessing
---------------------------------

In [4]:
# Read the raw car listings from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='CarPrice_Assignment.csv')

In [6]:
# Overview of the frame: its (rows, columns) shape followed by the first rows
print(
    "Dataset Info:",
    f"Shape: {df.shape}",
    "\nFirst few rows:",
    df.head(),
    sep="\n",
)

Dataset Info:
Shape: (205, 26)

First few rows:
   car_ID  symboling                   CarName fueltype aspiration doornumber  \
0       1          3        alfa-romero giulia      gas        std        two   
1       2          3       alfa-romero stelvio      gas        std        two   
2       3          1  alfa-romero Quadrifoglio      gas        std        two   
3       4          2               audi 100 ls      gas        std       four   
4       5          2                audi 100ls      gas        std       four   

       carbody drivewheel enginelocation  wheelbase  ...  enginesize  \
0  convertible        rwd          front       88.6  ...         130   
1  convertible        rwd          front       88.6  ...         130   
2    hatchback        rwd          front       94.5  ...         152   
3        sedan        fwd          front       99.8  ...         109   
4        sedan        4wd          front       99.4  ...         136   

   fuelsystem  boreratio  stroke

In [8]:
# Count null entries per column
print("\nMissing values:", df.isnull().sum(), sep="\n")


Missing values:
car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64


In [10]:
# Show the dtype pandas assigned to each column
print("\nData types:", df.dtypes, sep="\n")


Data types:
car_ID                int64
symboling             int64
CarName              object
fueltype             object
aspiration           object
doornumber           object
carbody              object
drivewheel           object
enginelocation       object
wheelbase           float64
carlength           float64
carwidth            float64
carheight           float64
curbweight            int64
enginetype           object
cylindernumber       object
enginesize            int64
fuelsystem           object
boreratio           float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
price               float64
dtype: object


In [12]:
# Descriptive statistics (count, mean, std, quartiles, ...) as produced by describe()
print("\nStatistical Summary:", df.describe(), sep="\n")


Statistical Summary:
           car_ID   symboling   wheelbase   carlength    carwidth   carheight  \
count  205.000000  205.000000  205.000000  205.000000  205.000000  205.000000   
mean   103.000000    0.834146   98.756585  174.049268   65.907805   53.724878   
std     59.322565    1.245307    6.021776   12.337289    2.145204    2.443522   
min      1.000000   -2.000000   86.600000  141.100000   60.300000   47.800000   
25%     52.000000    0.000000   94.500000  166.300000   64.100000   52.000000   
50%    103.000000    1.000000   97.000000  173.200000   65.500000   54.100000   
75%    154.000000    2.000000  102.400000  183.100000   66.900000   55.500000   
max    205.000000    3.000000  120.900000  208.100000   72.300000   59.800000   

        curbweight  enginesize   boreratio      stroke  compressionratio  \
count   205.000000  205.000000  205.000000  205.000000        205.000000   
mean   2555.565854  126.907317    3.329756    3.255415         10.142537   
std     520.680204  

Data Preprocessing
------------------

In [15]:
# car_ID is only a row identifier, so it is removed before modelling
df = df.drop(columns='car_ID')

In [17]:
# Columns that hold category labels and will be one-hot encoded later
categorical_features = ['CarName', 'fueltype', 'aspiration', 'doornumber',
                        'carbody', 'drivewheel', 'enginelocation',
                        'enginetype', 'cylindernumber', 'fuelsystem']

# Numeric predictors: every numeric-dtype column except the target and anything
# explicitly listed as categorical. Selecting by dtype (instead of only by
# exclusion) keeps text columns that are not in the list above -- such as a
# derived 'manufacturer' column if this cell is re-run -- out of the numeric set.
numerical_features = [
    col
    for col in df.select_dtypes(include='number').columns
    if col not in categorical_features and col != 'price'
]

In [19]:
# Extract car manufacturer from CarName (the text before the first space) and
# use it in place of the full model name as a categorical feature.
# The guards make the cell safe to re-run: once CarName has been dropped, the
# DataFrame and the feature list are left as they are.
if 'CarName' in df.columns:
    df['manufacturer'] = df['CarName'].str.split(' ').str[0]
    df = df.drop(columns='CarName')
categorical_features = [feat for feat in categorical_features if feat != 'CarName']
if 'manufacturer' not in categorical_features:
    categorical_features.append('manufacturer')

Exploratory Data Analysis
-------------------------

In [22]:
# Pairwise correlations between the numeric predictors and price, drawn as an
# annotated heatmap and written to disk (the figure is closed, not displayed).
corr_columns = numerical_features + ['price']
correlation = df[corr_columns].corr()

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
fig.tight_layout()
fig.savefig('correlation_matrix.png')
plt.close(fig)

In [24]:
# Histogram (with KDE overlay) of the target variable, saved to disk
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['price'], kde=True, ax=ax)
ax.set_title('Distribution of Car Prices')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Frequency')
fig.savefig('price_distribution.png')
plt.close(fig)

In [26]:
# Feature vs Target Visualization (Top correlated features)
# Rank by the absolute correlation with price so that strongly *negative*
# relationships count as "top correlated" too, rather than sorting last.
top_corr_features = (
    correlation['price']
    .drop('price')
    .abs()
    .sort_values(ascending=False)
    .head(5)
    .index.tolist()
)

In [28]:
# One scatter panel per selected feature against price, laid out on a 2x3 grid
fig = plt.figure(figsize=(15, 10))
for i, feature in enumerate(top_corr_features, start=1):
    ax = fig.add_subplot(2, 3, i)
    ax.scatter(df[feature], df['price'])
    ax.set_title(f'{feature} vs Price')
    ax.set_xlabel(feature)
    ax.set_ylabel('Price')
fig.tight_layout()
fig.savefig('top_features_vs_price.png')
plt.close(fig)

In [30]:
# Price distribution per category for the first six categorical features
fig = plt.figure(figsize=(15, 12))
first_batch = categorical_features[:6]
for i, feature in enumerate(first_batch, start=1):
    ax = fig.add_subplot(2, 3, i)
    sns.boxplot(x=feature, y='price', data=df, ax=ax)
    ax.set_title(f'{feature} vs Price')
    plt.xticks(rotation=45)  # acts on the panel just added (the current axes)
fig.tight_layout()
fig.savefig('categorical_vs_price_1.png')
plt.close(fig)

In [32]:
# Price distribution per category for the remaining categorical features
# (the 2x3 grid has room for at most six panels)
fig = plt.figure(figsize=(15, 12))
second_batch = categorical_features[6:]
for i, feature in enumerate(second_batch, start=1):
    ax = fig.add_subplot(2, 3, i)
    sns.boxplot(x=feature, y='price', data=df, ax=ax)
    ax.set_title(f'{feature} vs Price')
    plt.xticks(rotation=45)  # acts on the panel just added (the current axes)
fig.tight_layout()
fig.savefig('categorical_vs_price_2.png')
plt.close(fig)

In [34]:
# Separate the target (price) from the predictor columns
y = df['price']
X = df.drop(columns='price')

In [36]:
# Numeric branch: fill gaps with the column median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(numeric_steps)

In [38]:
# Categorical branch: fill gaps with the most frequent label, then one-hot
# encode; handle_unknown='ignore' lets transform() accept labels not seen in fit
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

In [40]:
# Route each column group through its own transformer branch
column_branches = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [42]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

2. Model Implementation
-----------------------

In [45]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted estimator on held-out data.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        The fitted estimator or pipeline to evaluate.
    X_test : features passed to ``model.predict``.
    y_test : true target values the predictions are compared against.

    Returns
    -------
    tuple
        ``(r2, mse, mae)`` -- R-squared, mean squared error and mean absolute
        error, in that order.
    """
    predictions = model.predict(X_test)
    metric_functions = (r2_score, mean_squared_error, mean_absolute_error)
    r2, mse, mae = (score(y_test, predictions) for score in metric_functions)
    return r2, mse, mae

In [47]:
# Maps a model's display name to its {'R2', 'MSE', 'MAE'} test-set scores
model_performances = dict()

In [49]:
# Linear Regression: preprocessing + ordinary least squares, scored on the test split
# NOTE(review): the recorded comparison table shows an R2 of about -6.7e20 for
# this model; the cause is not established in this notebook -- investigate
# before drawing conclusions from this row.
lr_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
lr_pipeline = Pipeline(lr_steps)
lr_pipeline.fit(X_train, y_train)

lr_r2, lr_mse, lr_mae = evaluate_model(lr_pipeline, X_test, y_test)
model_performances['Linear Regression'] = dict(
    zip(('R2', 'MSE', 'MAE'), (lr_r2, lr_mse, lr_mae))
)

In [51]:
# Decision Tree: preprocessing + a single seeded regression tree, scored on the test split
dt_steps = [
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42)),
]
dt_pipeline = Pipeline(dt_steps)
dt_pipeline.fit(X_train, y_train)

dt_r2, dt_mse, dt_mae = evaluate_model(dt_pipeline, X_test, y_test)
model_performances['Decision Tree'] = dict(
    zip(('R2', 'MSE', 'MAE'), (dt_r2, dt_mse, dt_mae))
)

In [53]:
# Random Forest: preprocessing + a seeded forest with default settings, scored on the test split
rf_steps = [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
]
rf_pipeline = Pipeline(rf_steps)
rf_pipeline.fit(X_train, y_train)

rf_r2, rf_mse, rf_mae = evaluate_model(rf_pipeline, X_test, y_test)
model_performances['Random Forest'] = dict(
    zip(('R2', 'MSE', 'MAE'), (rf_r2, rf_mse, rf_mae))
)

In [55]:
# Gradient Boosting: preprocessing + a seeded boosted ensemble, scored on the test split
gb_steps = [
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
]
gb_pipeline = Pipeline(gb_steps)
gb_pipeline.fit(X_train, y_train)

gb_r2, gb_mse, gb_mae = evaluate_model(gb_pipeline, X_test, y_test)
model_performances['Gradient Boosting'] = dict(
    zip(('R2', 'MSE', 'MAE'), (gb_r2, gb_mse, gb_mae))
)

In [57]:
# Support Vector Regressor: preprocessing + SVR with its default hyperparameters,
# scored on the test split
svr_steps = [
    ('preprocessor', preprocessor),
    ('regressor', SVR()),
]
svr_pipeline = Pipeline(svr_steps)
svr_pipeline.fit(X_train, y_train)

svr_r2, svr_mse, svr_mae = evaluate_model(svr_pipeline, X_test, y_test)
model_performances['SVR'] = dict(
    zip(('R2', 'MSE', 'MAE'), (svr_r2, svr_mse, svr_mae))
)

3. Model Evaluation
-------------------

In [60]:
# One row per model, one column per metric, best R2 first
model_comparison = (
    pd.DataFrame(model_performances)
    .transpose()
    .sort_values(by='R2', ascending=False)
)

In [62]:
# Text rendering of the comparison table
print("\nModel Performance Comparison:", model_comparison, sep="\n")


Model Performance Comparison:
                             R2           MSE           MAE
Random Forest      9.576212e-01  3.345553e+06  1.302010e+03
Gradient Boosting  9.267908e-01  5.779426e+06  1.695765e+03
Decision Tree      8.962240e-01  8.192494e+06  1.923679e+03
SVR               -9.978948e-02  8.682180e+07  5.695278e+03
Linear Regression -6.732595e+20  5.314981e+28  4.647172e+13


In [64]:
# Bar chart of R-squared per model, saved to disk.
# The y-axis is fixed to [0, 1], so bars for models with a negative R2 are not
# visible in the saved image.
fig, ax = plt.subplots(figsize=(12, 6))
model_comparison['R2'].plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('R-squared Scores by Model')
ax.set_ylabel('R-squared')
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig('r2_comparison.png')
plt.close(fig)

In [66]:
# Bar chart of mean squared error per model, saved to disk
fig, ax = plt.subplots(figsize=(12, 6))
model_comparison['MSE'].plot(kind='bar', color='salmon', ax=ax)
ax.set_title('Mean Squared Error by Model')
ax.set_ylabel('MSE')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig('mse_comparison.png')
plt.close(fig)

In [68]:
# Bar chart of mean absolute error per model, saved to disk
fig, ax = plt.subplots(figsize=(12, 6))
model_comparison['MAE'].plot(kind='bar', color='lightgreen', ax=ax)
ax.set_title('Mean Absolute Error by Model')
ax.set_ylabel('MAE')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig('mae_comparison.png')
plt.close(fig)

In [70]:
# The comparison table is sorted by R2 descending, so its first row is the winner
best_model = model_comparison.index[0]
print(
    f"\nBest performing model: {best_model}",
    f"R-squared: {model_comparison.loc[best_model, 'R2']:.4f}",
    f"MSE: {model_comparison.loc[best_model, 'MSE']:.4f}",
    f"MAE: {model_comparison.loc[best_model, 'MAE']:.4f}",
    sep="\n",
)


Best performing model: Random Forest
R-squared: 0.9576
MSE: 3345553.2007
MAE: 1302.0097


4. Feature Importance Analysis
------------------------------

In [73]:
# Pick the fitted pipeline whose regressor exposes feature importances.
# Only the tree-based models qualify; for any other winner the Random Forest
# pipeline is used as a stand-in.
tree_pipelines = {
    'Random Forest': rf_pipeline,
    'Gradient Boosting': gb_pipeline,
    'Decision Tree': dt_pipeline,
}
if best_model not in tree_pipelines:
    # If Linear Regression or SVR wins, use Random Forest for feature importance
    print("\nUsing Random Forest for feature importance analysis...")
model = tree_pipelines.get(best_model, rf_pipeline)

In [75]:
# Get feature names after preprocessing, in the same order as the columns the
# regressor was trained on: numeric columns first, then one
# "<feature>_<category>" name per one-hot encoded category.
# The names are read from the preprocessor inside the fitted `model`, so they
# always describe the pipeline whose importances are analysed.
fitted_preprocessor = model.named_steps['preprocessor']
preprocessed_features = []
for name, transformer, features in fitted_preprocessor.transformers_:
    if name == 'num':
        # Imputation and scaling keep one output column per input column
        preprocessed_features.extend(features)
    elif name == 'cat':
        encoder = transformer.named_steps['onehot']
        # categories_ is aligned with the input columns, so pair them directly
        for feat, categories in zip(features, encoder.categories_):
            preprocessed_features.extend(f"{feat}_{cat}" for cat in categories)
    # Any other entry (e.g. a 'remainder' entry) is skipped: the preprocessor is
    # built with the default remainder='drop', so it emits no output columns.

In [77]:
# Extract feature importances (only estimators exposing `feature_importances_`,
# i.e. the tree-based models here), chart the 15 largest and print the top 10.
regressor = model.named_steps['regressor']
if not hasattr(regressor, 'feature_importances_'):
    print("\nThe best model doesn't support direct feature importance calculation.")
    # Alternative: Use permutation importance or SHAP values
else:
    feature_importances = regressor.feature_importances_

    # Pair each transformed column name with its importance, keep the 15 largest
    fi_df = (
        pd.DataFrame({
            'Feature': preprocessed_features,
            'Importance': feature_importances
        })
        .sort_values('Importance', ascending=False)
        .head(15)
    )

    # Horizontal bar chart, saved to disk
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=fi_df, ax=ax)
    ax.set_title(f'Top 15 Feature Importances ({best_model})')
    fig.tight_layout()
    fig.savefig('feature_importances.png')
    plt.close(fig)

    print("\nTop 10 Important Features:", fi_df.head(10), sep="\n")


Top 10 Important Features:
             Feature  Importance
6         enginesize    0.547857
5         curbweight    0.295958
13        highwaympg    0.045167
10        horsepower    0.032016
3           carwidth    0.013500
54  manufacturer_bmw    0.007762
2          carlength    0.007194
1          wheelbase    0.006675
12           citympg    0.005862
11           peakrpm    0.005572


5. Hyperparameter Tuning
------------------------

In [80]:
# Announce the start of the tuning stage
tuning_banner = "\nPerforming hyperparameter tuning on the best model..."
print(tuning_banner)


Performing hyperparameter tuning on the best model...


In [82]:
# Search space and estimator used when the Random Forest pipeline is the winner.
# Keys carry the 'regressor__' prefix so they address the pipeline's final step.
rf_is_best = best_model == 'Random Forest'
if rf_is_best:
    best_pipeline = rf_pipeline
    param_grid = dict([
        ('regressor__n_estimators', [50, 100, 200]),
        ('regressor__max_depth', [None, 10, 20, 30]),
        ('regressor__min_samples_split', [2, 5, 10]),
        ('regressor__min_samples_leaf', [1, 2, 4]),
    ])

In [88]:
# Search space and estimator used when the Gradient Boosting pipeline is the winner.
# The 'regressor__' prefix addresses the final step of a Pipeline, so the
# estimator must be the full gb_pipeline: a bare GradientBoostingRegressor()
# would reject these parameter names and would also skip the preprocessing.
if best_model == 'Gradient Boosting':
    param_grid = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__learning_rate': [0.01, 0.1, 0.2],
        'regressor__max_depth': [3, 5, 7],
        'regressor__min_samples_split': [2, 5]
    }
    best_pipeline = gb_pipeline


In [92]:
# Search space and estimator used when the Decision Tree pipeline is the winner.
# The 'regressor__' prefix addresses the final step of a Pipeline, so the
# estimator must be the full dt_pipeline: a bare DecisionTreeRegressor() would
# reject these parameter names and would also skip the preprocessing.
if best_model == 'Decision Tree':
    param_grid = {
        'regressor__max_depth': [None, 10, 20, 30],
        'regressor__min_samples_split': [2, 5, 10],
        'regressor__min_samples_leaf': [1, 2, 4],
        'regressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error']
    }
    best_pipeline = dt_pipeline


In [96]:
# Linear Regression has nothing to search over here: its untuned scores are
# carried forward as the "tuned" ones and its pipeline is kept as-is.
lr_is_best = best_model == 'Linear Regression'
if lr_is_best:
    print("Linear Regression does not require hyperparameter tuning.")
    best_pipeline = lr_pipeline
    tuned_r2, tuned_mse, tuned_mae = lr_r2, lr_mse, lr_mae


In [104]:
# Choose the estimator to tune (`best_pipeline`) and its search space
# (`param_grid`) from the name of the winning model.
# - Every grid key carries the 'regressor__' prefix, so `best_pipeline` must be
#   the corresponding fitted Pipeline, never a bare estimator.
# - Random Forest has its own branch; without it the winner would fall through
#   to the SVR settings in the final `else`.
if best_model == 'Linear Regression':
    # Nothing to search over: reuse the untuned scores as the "tuned" ones
    print("Linear Regression does not require hyperparameter tuning.")
    tuned_r2, tuned_mse, tuned_mae = lr_r2, lr_mse, lr_mae
    best_pipeline = lr_pipeline
elif best_model == 'Random Forest':
    param_grid = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [None, 10, 20, 30],
        'regressor__min_samples_split': [2, 5, 10],
        'regressor__min_samples_leaf': [1, 2, 4]
    }
    best_pipeline = rf_pipeline
elif best_model == 'Decision Tree':
    param_grid = {
        'regressor__max_depth': [None, 10, 20, 30],
        'regressor__min_samples_split': [2, 5, 10],
        'regressor__min_samples_leaf': [1, 2, 4],
        'regressor__criterion': ['squared_error', 'friedman_mse', 'absolute_error']
    }
    best_pipeline = dt_pipeline
elif best_model == 'Gradient Boosting':
    param_grid = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__learning_rate': [0.01, 0.1, 0.2],
        'regressor__max_depth': [3, 5, 7],
        'regressor__min_samples_split': [2, 5]
    }
    best_pipeline = gb_pipeline
else:  # SVR
    param_grid = {
        'regressor__C': [0.1, 1, 10, 100],
        'regressor__gamma': ['scale', 'auto', 0.1, 0.01],
        'regressor__kernel': ['linear', 'rbf', 'poly']
    }
    best_pipeline = svr_pipeline


In [108]:
# SVR search space addressed through the pipeline's 'regressor' step.
# NOTE(review): this assignment is unconditional and later cells assign
# param_grid again before the grid search runs, so this grid appears unused --
# confirm and consider removing the cell.
param_grid = dict([
    ('regressor__C', [0.1, 1, 10, 100]),
    ('regressor__gamma', ['scale', 'auto', 0.1, 0.01]),
    ('regressor__kernel', ['linear', 'rbf', 'poly']),
])


In [110]:
# SVR search space with bare parameter names (no 'regressor__' prefix), i.e.
# in the form a standalone SVR estimator would take rather than a Pipeline.
# NOTE(review): this assignment is unconditional and a later cell assigns
# param_grid again before the grid search runs, so this grid appears unused --
# confirm and consider removing the cell.
param_grid = dict([
    ('C', [0.1, 1, 10, 100]),
    ('gamma', ['scale', 'auto', 0.1, 0.01]),
    ('kernel', ['linear', 'rbf', 'poly']),
])


In [122]:
# Search space handed to the grid search below: ensemble size, tree depth and
# minimum split size of the pipeline's 'regressor' step.
# NOTE(review): assigned unconditionally, whatever `best_model` is -- these
# names match a forest-style regressor; confirm that is always the intent.
param_grid = dict([
    ('regressor__n_estimators', [100, 200, 300]),
    ('regressor__max_depth', [None, 10, 20]),
    ('regressor__min_samples_split', [2, 5, 10]),
])


In [124]:
# Tune `model` (the pipeline selected for the feature-importance analysis) by
# exhaustive search over `param_grid`, using 5-fold cross-validation on the
# training split with R-squared as the selection metric.
# GridSearchCV is already imported in the notebook's import cell.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='r2')

# Fit the grid search on training data only; the test split stays untouched
grid_search.fit(X_train, y_train)

# Best parameter combination, refitted on the full training split
tuned_model = grid_search.best_estimator_

# Evaluate the tuned model using the test set
tuned_r2, tuned_mse, tuned_mae = evaluate_model(tuned_model, X_test, y_test)

# Baseline for the comparison: the untuned `model` itself. GridSearchCV works
# on clones, so `model` is still the originally fitted pipeline. Reading the
# baseline from `best_model`'s row instead would compare two different
# estimators whenever `model` is the Random Forest stand-in for a
# Linear Regression or SVR winner.
base_r2, base_mse, base_mae = evaluate_model(model, X_test, y_test)

print("\nModel Performance Comparison (Before vs. After Tuning):")
print(f"R-squared: {base_r2:.4f} -> {tuned_r2:.4f}")
print(f"MSE: {base_mse:.4f} -> {tuned_mse:.4f}")
print(f"MAE: {base_mae:.4f} -> {tuned_mae:.4f}")



Model Performance Comparison (Before vs. After Tuning):
R-squared: 0.9576 -> 0.9591
MSE: 3345553.2007 -> 3232388.3535
MAE: 1302.0097 -> 1230.7212


In [126]:
# Relative change in R-squared from the untuned to the tuned model, in percent
baseline_r2 = model_comparison.loc[best_model, 'R2']
r2_gain = tuned_r2 - baseline_r2
improvement_r2 = r2_gain / baseline_r2 * 100
print(f"R-squared Improvement: {improvement_r2:.2f}%")

R-squared Improvement: 0.15%


6. Conclusion
-------------

In [129]:
# Textual wrap-up: winning model, its five most important features (when the
# importance table exists) and the tuned test-set scores.
print("\nConclusion:")
print(f"1. The best performing model is {best_model}")
print("2. Key factors affecting car prices include:")
if 'fi_df' in locals():
    top_five = fi_df.head(5)
    ranked_pairs = zip(top_five['Feature'], top_five['Importance'])
    for i, (feature, importance) in enumerate(ranked_pairs):
        print(f"   {i+1}. {feature}: {importance:.4f}")
print(f"3. The model achieves an R-squared of {tuned_r2:.4f} after tuning, explaining {tuned_r2*100:.2f}% of the variance in car prices")
print(f"4. The Mean Absolute Error is ${tuned_mae:.2f}, indicating the average prediction error")


Conclusion:
1. The best performing model is Random Forest
2. Key factors affecting car prices include:
   1. enginesize: 0.5479
   2. curbweight: 0.2960
   3. highwaympg: 0.0452
   4. horsepower: 0.0320
   5. carwidth: 0.0135
3. The model achieves an R-squared of 0.9591 after tuning, explaining 95.91% of the variance in car prices
4. The Mean Absolute Error is $1230.72, indicating the average prediction error


In [131]:
# Persist the final estimator: the untuned pipeline when Linear Regression won
# (no grid search applies to it), otherwise the tuned model from the grid search
import joblib

if best_model == 'Linear Regression':
    final_model = best_pipeline
else:
    final_model = tuned_model
joblib.dump(final_model, 'car_price_prediction_model.pkl')
print("\nFinal model saved as 'car_price_prediction_model.pkl'")


Final model saved as 'car_price_prediction_model.pkl'
