<a href="https://colab.research.google.com/github/robertherreraaa/Car-Prices-Prediction/blob/main/Car_Prices_Prediction_Robert_Herrera_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
pd.set_option('display.max_columns', None)

In [None]:
# Google Drive locations: the zipped dataset and the folder it is extracted into.
drive_root = '/content/drive/MyDrive'
zip_file_path = f'{drive_root}/Car Prices Prediction Dataset.zip'
file_folder_path = f'{drive_root}/Uplift Datasets/'

In [None]:
# Unpack every member of the dataset archive into the target folder.
with zipfile.ZipFile(zip_file_path) as archive:
  archive.extractall(path=file_folder_path)

In [None]:
# CSV produced by the extraction step; built from the extraction folder so the two stay in sync.
file_path = file_folder_path + 'CarPricesPrediction.csv'

In [None]:
# Read the extracted CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Display the loaded DataFrame as the cell output.
df

# DATA CLEANING AND VISUALIZATIONS

In [None]:
list_col1 = ['Make','Model','Year','Condition']

# Print the frequency table of each listed column, followed by a separator line.
for column_name in list_col1:
    frequencies = df[column_name].value_counts()
    print(
        f'Column Name: {column_name}:',
        frequencies,
        '------------------------------',
        sep='\n',
    )

In [None]:
list_col1 = ['Make','Model','Year','Condition']

# One grid row per column: count plot on the left, pie chart on the right.
plt.figure(figsize=(16,14))
for row, col in enumerate(list_col1, start=1):
    value_counts = df[col].value_counts()

    #(1) Pie plot (right-hand panel of this row)
    plt.subplot(len(list_col1),2,2*row)
    plt.pie(x=value_counts, labels=value_counts.index, autopct='%1.1f%%')
    plt.title(f'{col} distribution', fontweight='bold')

    #(2) Countplot (left-hand panel of this row)
    plt.subplot(len(list_col1),2,2*row-1)
    # value_counts() is sorted by frequency, which is not the order countplot
    # draws its bars in by default. Pin the bar order explicitly and take the
    # labels from that same ordering so every count sits on its own bar.
    counts = value_counts.sort_index()
    sns.countplot(data=df, x=col, order=counts.index.tolist())
    # A separate loop name keeps the row counter from being shadowed.
    for bar_pos, count in enumerate(counts):
        plt.text(x=bar_pos, y=count, s=str(count), ha='center', va='bottom')
    plt.title(f'{col} distribution', fontweight='bold')
    plt.grid(axis='both')

plt.tight_layout()

### Price Distribution

In [None]:
# Central-tendency statistics of the target, drawn as reference lines on the histogram.
mean = df['Price'].mean()
mode = df['Price'].mode()[0]  # mode() returns a Series; take its first entry
median = df['Price'].median()


fig, axs = plt.subplots(1,2,figsize=(10,4))

# Left panel: histogram with KDE and the three reference lines.
sns.histplot(data=df, x='Price', kde=True, palette='magma', ax=axs[0])
axs[0].set_title('Price distribution (Histogram)' , fontweight='bold')
axs[0].set_ylabel('Frequency')
axs[0].axvline(mean, color='red', linestyle='--',label=f'Mean: {mean:.2f}')
axs[0].axvline(mode, color='blue', linestyle='--',label=f'Mode: {mode:.2f}')
# The legend label must report the median itself (it previously formatted the mode).
axs[0].axvline(median, color='green', linestyle='--',label=f'Median: {median:.2f}')
axs[0].legend()


# Right panel: target the axes explicitly instead of relying on the "current axes".
sns.boxplot(data=df, y='Price', palette='magma', ax=axs[1])
axs[1].set_title('Price distribution (BoxPlot)', fontweight='bold')


plt.tight_layout()
plt.show()

In [None]:
col_list_hue = ['Make','Model','Year','Condition']

# 2x2 grid: the Price histogram coloured by each categorical column in turn.
plt.figure(figsize=(10,8))
for i, col in enumerate(col_list_hue):
    plt.subplot(2,2,i+1)
    sns.histplot(data=df, x='Price', hue=col, kde=True)
    plt.title(f'Price distribution hue with {col}', fontweight='bold')

# Lay the grid out once, after every panel exists, rather than on each iteration.
plt.tight_layout()
plt.show()

# Note: Older manufactured cars tend to be more expensive.

### Mileage Distribution

In [None]:
# Scatter of Price against Mileage on its own figure.
scatter_fig, scatter_ax = plt.subplots(figsize=(8,6))
sns.scatterplot(data=df, x='Mileage', y='Price', ax=scatter_ax)
scatter_ax.set_title('Mileage & Car Price', fontweight='bold')
plt.show()

# Pairwise correlation matrix of the two plotted columns.
mileage_price_corr = df[['Mileage','Price']].corr()
print(mileage_price_corr)

# Note: As Mileage increases, the car price tends to decrease.

In [None]:
col_list_hue = ['Make','Model','Year','Condition']

# 2x2 grid: the Mileage/Price scatter coloured by each categorical column in turn.
hue_fig, hue_axes = plt.subplots(2, 2, figsize=(12,8))
for panel, hue_col in zip(hue_axes.flat, col_list_hue):
    sns.scatterplot(data=df, x='Mileage', y='Price', hue=hue_col, ax=panel)
    panel.set_title(f'Mileage & Car Price hue with {hue_col}')

plt.tight_layout()
plt.show()

# LINEAR REGRESSION and DECISION TREE MODEL

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
split_parts = train_test_split(df, test_size=0.2, random_state=42)
train_set, test_set = split_parts

In [None]:
# Train Set: separate the target column from the predictor columns.
y_train = train_set['Price']
x_train = train_set.drop(columns='Price')

In [None]:
# Column groups handled by each preprocessing step.
num_cols = ['Year', 'Mileage']
cat_cols = ['Make', 'Model', 'Condition']

# Preprocessing: median-impute the numeric columns, one-hot encode the categorical ones
# (categories not seen during fitting are ignored rather than raising).
numeric_step = ("num", SimpleImputer(strategy='median'), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
trans_pipeline = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit the preprocessing on the training predictors and transform them in one pass.
x_train_trans = trans_pipeline.fit_transform(x_train)

In [None]:
# Test Set: separate the target column from the predictor columns.
y_test = test_set['Price']
x_test = test_set.drop(columns='Price')

In [None]:
# Apply the preprocessing that was already fitted on the training set to the test set.
#
# A new transformer must NOT be fitted on the test data:
#   * the imputer would learn its medians from the test rows (data leakage), and
#   * the one-hot encoder would learn its categories from the test rows, so the
#     number/order of output columns could differ from the matrix the models
#     were trained on, making the predictions meaningless or raising a shape error.
# Using transform() keeps the test matrix column-for-column aligned with x_train_trans.
x_test_trans = trans_pipeline.transform(x_test)

### MODEL TRAINING

In [None]:
# Decision tree: fit on the training matrix (fit returns the estimator), then predict the test matrix.
decision_model = DecisionTreeRegressor(random_state=42).fit(x_train_trans, y_train)
decision_model_predictions = decision_model.predict(x_test_trans)

In [None]:
# Linear regression: fit on the training matrix (fit returns the estimator), then predict the test matrix.
lin_reg = LinearRegression().fit(x_train_trans, y_train)
lin_model_predictions = lin_reg.predict(x_test_trans)

### GETTING THE PREDICTION (ACCURACY)

In [None]:
# Score both models with the mean absolute percentage error on the test targets.
decision_error = mean_absolute_percentage_error(y_true=y_test, y_pred=decision_model_predictions)
lin_error = mean_absolute_percentage_error(y_true=y_test, y_pred=lin_model_predictions)

In [None]:
# Report both error values, one per line.
error_report = [
    f"Linear Regression Error: {lin_error}",
    f"Decision Tree Error: {decision_error}",
]
print("\n".join(error_report))

In [None]:
# Linear Regression Error is 0.00000282555 or 0.00028%
# Decision Tree Error is 0.006 or 0.63%

In [None]:
# Append test results to test set for comparison.
# assign() builds a new DataFrame instead of writing columns into the frame that
# came out of train_test_split, which avoids pandas' SettingWithCopyWarning and
# keeps the cell safe to re-run.
test_set = test_set.assign(**{
    'Vehicle Price Predictions (Linear Model)': lin_model_predictions,
    'Vehicle Price Predictions (Decision Tree Model)': decision_model_predictions,
})

In [None]:
# Preview the first ten test rows.
test_set.head(n=10)

Source: https://www.geeksforgeeks.org/machine-learning/types-of-regression-techniques/

# RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

In [None]:
# Random forest with 100 trees: fit on the training matrix, then predict the test matrix.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(x_train_trans, y_train)
rf_model_predictions = rf_model.predict(x_test_trans)

# Report the mean absolute percentage error on the test targets.
rf_error = mean_absolute_percentage_error(y_test, rf_model_predictions)
print("Random Forest:")
print("  Absolute Percentage Error:", rf_error)

# Random Forest has 0.48% Error

# SUPPORT VECTOR REGRESSOR (SVR)

In [None]:
from sklearn.svm import SVR

# RBF-kernel support vector regressor: fit on the training matrix, then predict the test matrix.
svr_model = SVR(kernel="rbf", C=100, gamma=0.1).fit(x_train_trans, y_train)
svr_model_predictions = svr_model.predict(x_test_trans)

# Report the mean absolute percentage error on the test targets.
svr_error = mean_absolute_percentage_error(y_test, svr_model_predictions)
print("Support Vector Regressor:")
print("  Absolute Percentage Error:", svr_error)

# SVR has 18.11% Error

# LASSO REGRESSION

In [None]:
from sklearn.linear_model import Lasso

# alpha controls regularization strength. If it's too high, Lasso may zero-out most coefficients.
# max_iter=10000 avoids convergence warnings when dataset is large.
lasso_model = Lasso(alpha=0.001, random_state=42, max_iter=10000).fit(x_train_trans, y_train)
lasso_model_predictions = lasso_model.predict(x_test_trans)

# Report the mean absolute percentage error on the test targets.
lasso_error = mean_absolute_percentage_error(y_test, lasso_model_predictions)
print("Lasso Regression:")
print("  Absolute Percentage Error:", lasso_error)

# Lasso has 0.00028% Error

# EVALUATING ALL MODELS

### Using Table

In [None]:
# Candidate regressors, keyed by the display name used in the comparison table.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(alpha=0.001, random_state=42, max_iter=10000),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "SVR": SVR(kernel="rbf", C=100, gamma=0.1)
}

# Fit every candidate on the training matrix and score it on the test matrix.
results = []
for name, model in models.items():
    model.fit(x_train_trans, y_train)
    predictions = model.predict(x_test_trans)
    # Compute the metric once; the percentage column is derived from it.
    mape = mean_absolute_percentage_error(y_test, predictions)
    percentage = mape * 100
    results.append({'Model': name, 'MAPE': mape, 'Percentage(%)': percentage})

# Show the comparison table

# MAPE is an error metric, so lower is better: an ascending sort lists the
# best-fitting model first and the worst last.
results_df = pd.DataFrame(results).sort_values('MAPE', ascending=True)

print(results_df)

### Using Graph

In [None]:
# Candidate regressors, keyed by the display name used on the chart.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(alpha=0.001, random_state=42, max_iter=10000),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "SVR": SVR(kernel="rbf", C=100, gamma=0.1)
}

# Fit every candidate on the training matrix and score it on the test matrix.
results = []
for name, model in models.items():
    model.fit(x_train_trans, y_train)
    predictions = model.predict(x_test_trans)
    # Compute the metric once; the percentage column is derived from it.
    mape = mean_absolute_percentage_error(y_test, predictions)
    percentage = mape * 100
    results.append({'Model': name, 'MAPE': mape, 'Percentage(%)': percentage})

# Build the comparison table

# MAPE is an error metric, so lower is better: an ascending sort lists the
# best-fitting model first and the worst last.
results_df = pd.DataFrame(results).sort_values('MAPE', ascending=True)

In [None]:
# Bar chart of each model's percentage error, in the order of the results table.
comparison_fig, comparison_ax = plt.subplots(figsize=(10,6))
comparison_ax.bar(results_df["Model"], results_df["Percentage(%)"])
comparison_ax.set_ylabel("Mean Absolute Percentage Error")
comparison_ax.set_title("Model Comparison (Percentage Error)")
# Tilt the model names so the longer labels do not overlap.
plt.xticks(rotation=30, ha="right")
plt.tight_layout()
plt.show()