In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read in Data

In [None]:
cars = pd.read_csv("../input/car-data/CarPrice_Assignment.csv", index_col = 0)

In [None]:
cars.head()

In [None]:
cars.describe()

In [None]:
cars.info()

No missing data

# User Defined Functions

In [None]:
def column_percenteges(data, column):
    """Return the relative frequency of every distinct value in ``column``.

    Parameters
    ----------
    data : table that is indexed with ``data[column]``.
    column : label of the column to summarise.

    Returns
    -------
    The result of ``value_counts(normalize=True)`` on that column, i.e. the
    share of rows taken by each value.
    """
    selected = data[column]
    shares = selected.value_counts(normalize = True)
    return shares

In [None]:
def lm_plot(data, x_column, y_column = "price"):
    """Draw a scatter plot of two columns with a fitted regression line.

    Parameters
    ----------
    data : table passed to seaborn as ``data``.
    x_column : label of the column plotted on the x axis.
    y_column : label of the column plotted on the y axis (default "price").

    The title and both axis labels are the title-cased column labels.
    Nothing is returned; seaborn draws on the current matplotlib axes.
    """
    # Convert to str once so that non-string column labels (e.g. integers)
    # work for the axis labels as well as for the title.
    x_label = str(x_column).title()
    y_label = str(y_column).title()

    ax = sns.regplot(x = x_column, y = y_column, data = data)
    ax.set_title(x_label + " and " + y_label)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

In [None]:
def box_plot(data, x_column, y_column = "price"):
    """Draw a box plot of ``y_column`` grouped by ``x_column``.

    Parameters
    ----------
    data : table passed to seaborn as ``data``.
    x_column : label of the column plotted on the x axis.
    y_column : label of the column plotted on the y axis (default "price").

    The title and both axis labels are the title-cased column labels.
    Nothing is returned; seaborn draws on the current matplotlib axes.
    """
    # Convert to str once so that non-string column labels (e.g. integers)
    # work for the axis labels as well as for the title.
    x_label = str(x_column).title()
    y_label = str(y_column).title()

    ax = sns.boxplot(x = x_column, y = y_column, data = data)
    ax.set_title(x_label + " and " + y_label)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

# Explore Data

## Symboling

In [None]:
cars["symboling"].describe()

In [None]:
cars["symboling"] = cars["symboling"].astype("object")

In [None]:
cars["symboling"].dtype

In [None]:
sns.countplot(x = "symboling", data = cars)
plt.title("Count of symboling")
plt.ylabel("Count")
plt.xlabel("symboling")

In [None]:
box_plot(cars, "symboling")

Symboling describes the assigned risk rating. Negative 3 would represent the safest option, while 3 represents the riskiest option. Prices are higher when the risk rating is at either extreme of the possible values, and cars with a symboling of 1 have the lowest prices.

## Car Name

In [None]:
cars["CarName"].unique()

There are a lot of different categories. Because of this, I will take away the specific car title and create a new variable called brand.

In [None]:
brand_list = pd.Series(cars["CarName"].str.lower().str.split(" "))
print(brand_list)

In [None]:
# The brand is the first token of every split car name.
brands = [name_parts[0] for name_parts in brand_list]
np.unique(brands)

In [None]:
# fix misspellings
# Map every known misspelling/abbreviation to its canonical brand name; brands
# that are not listed are left untouched.
brand_corrections = {
    "maxda": "mazda",
    "porcshce": "porsche",
    "toyouta": "toyota",
    "vw": "volkswagen",
    # NOTE(review): the public copy of this dataset also spells the brand
    # "vokswagen"; this entry is a no-op if that value is absent -- confirm
    # against the np.unique(brands) output of the previous cell.
    "vokswagen": "volkswagen",
}
# Rebuilding the list (instead of patching it index by index) keeps the cell
# idempotent when it is re-run.
brands = [brand_corrections.get(brand, brand) for brand in brands]

np.unique(brands)

In [None]:
# add new column and drop carname
cars["brand"] = brands
cars.drop("CarName", axis = 1, inplace = True)

In [None]:
column_percenteges(cars, "brand")

Since some car brands appear rarely in the data, I will be keeping the top 10 and renaming the rest to other.

In [None]:
# Keep the ten most frequent brands and fold every other brand into "other".
brand_counts = cars["brand"].value_counts()
keep_brand = brand_counts.index[:10]
is_kept_brand = cars["brand"].isin(keep_brand)
cars["brand"] = np.where(is_kept_brand, cars["brand"], "other")
cars["brand"].unique()

In [None]:
plt.figure(figsize = (7, 7))
box_plot(cars, x_column = "price", y_column = "brand")

Several car brands have higher average prices than others. These include volvo, peugeot and car brands in the other category. Now that we have the brand name, it might be interesting to use the symboling column to see whether certain brands are deemed riskier or safer than others, and to compare that to the prices.

In [None]:
# Count how often each symboling value occurs within every brand and flatten
# the result into a brand / symboling / count table.
brand_and_symboling = cars.groupby("brand")["symboling"].value_counts()
brand_and_symboling_df = brand_and_symboling.reset_index(name = "count")

# Brands ordered by how many of their cars have a symboling of 3.
has_symboling_3 = brand_and_symboling_df["symboling"] == 3
brand_and_symboling_df.loc[has_symboling_3].sort_values(by = "count", ascending = False)

In [None]:
brand_and_symboling_df[brand_and_symboling_df["symboling"] == -2].sort_values(by = "count", ascending = False)

Brands in the other category seem to have the least safe cars. Perhaps a lot of these cars are sports cars, which could also explain the higher price. Volvo is the only car brand that has cars with a risk rating of -2, which is the lowest value that appears in the dataset. Since it looks like the safest brand of car, the higher price might reflect the perception people have of the brand as being safe.

## Fuel Type

In [None]:
cars["fueltype"].unique()

In [None]:
# Number of cars per fuel type.
sns.countplot(x = "fueltype", data = cars)
plt.title("Gas vs Diesel")  # title was misspelled "Disel"
plt.ylabel("Count")
plt.xlabel("Fuel Type")

Gas is a much more popular fuel choice for cars than diesel.

In [None]:
box_plot(cars, "fueltype")

Although there are many more gas cars than diesel cars, diesel cars are more expensive on average than gas cars.

## Aspiration

In [None]:
cars["aspiration"].unique()

In [None]:
sns.countplot(x = "aspiration", data = cars)
plt.title("Standard vs Turbo Cars")
plt.ylabel("Count")
plt.xlabel("Aspiration")

In [None]:
sns.boxplot(x = "aspiration", y = "price", hue = "fueltype", data = cars)
plt.title("Aspiration, Price by Fuel Type")
plt.ylabel("Price")
plt.xlabel("Aspiration")

According to our graph, when looking at standard aspiration, gas cars are actually more expensive than diesel cars. With turbo aspiration, diesel cars still cost more than gas cars. This could be due to the low number of cars in the data that have standard aspiration and run on diesel.

In [None]:
cars.groupby("aspiration")["fueltype"].value_counts()

There are only 7 cars that use diesel when using standard aspiration, compared to 161 that use gas. So the lower average price is probably due to a small sample size.

## Door Number

In [None]:
cars["doornumber"].unique()

In [None]:
column_percenteges(cars, "doornumber")

In [None]:
box_plot(cars, "doornumber")

The price of a car seems to be relatively consistent when looking at two or four door cars.

## Car Body

In [None]:
cars["carbody"].unique()

In [None]:
column_percenteges(cars, "carbody")

In [None]:
sns.boxplot(x = "price", y = "carbody", hue = "doornumber", data = cars)
plt.title("Body of Car, Price and Number of Doors")
plt.legend(title = "# of doors")
plt.ylabel("Car Body")
plt.xlabel("Price")

Sedans seem to cost more with four doors instead of two, while hatchbacks seem to cost slightly less with four doors instead of two. For the other values, there is only one category for number of doors. However, the average cost of hardtops and convertibles is much higher compared to the other car bodies.

## Drive Wheel

In [None]:
cars["drivewheel"].unique()

In [None]:
column_percenteges(cars, "drivewheel")

In [None]:
box_plot(cars, "drivewheel")

Rear wheel drive cars seem to cost the most, with front wheel drive and four wheel drive costing about the same.

## Engine Location

In [None]:
cars["enginelocation"].unique()

In [None]:
column_percenteges(cars, "enginelocation")

Since the overwhelming majority of cars have the engine located in the front, I am going to drop this column.

In [None]:
cars.drop("enginelocation", axis = 1, inplace = True)

## Wheel Base

In [None]:
cars["wheelbase"].describe()

In [None]:
lm_plot(cars, "wheelbase")

Wheel Base describes the distance between the centres of the front and rear wheels. It makes sense that a larger car would demand a higher price.

## Car Length, Width and Height

In [None]:
print(cars["carlength"].describe())
print("\n")
print(cars["carwidth"].describe())
print("\n")
print(cars["carheight"].describe())

In [None]:
lm_plot(cars, "carlength")

In [None]:
lm_plot(cars, "carwidth")

In [None]:
lm_plot(cars, "carheight")

We can see here again with these 3 graphs that a larger car leads to a higher price. The effect is not as obvious when looking at car height, as the slope of the regression line is not as steep.

## Curb Weight

In [None]:
cars["curbweight"].describe()

In [None]:
lm_plot(cars, "curbweight")

Curb weight describes the weight of a car with no passengers or baggage. We can see that a heavier car leads to an increase in the price of the car. This might be due to lighter cars being smaller, and therefore costing less. We can check by comparing weight to length and width.

In [None]:
sns.scatterplot(x = "curbweight", y = "price", hue = "carlength", palette = "coolwarm", data = cars)
plt.title("Weight of Car, Price and Length")
plt.ylabel("Price")
plt.xlabel("Weight of Car")

In [None]:
sns.scatterplot(x = "curbweight", y = "price", hue = "carwidth", palette = "coolwarm", data = cars)
plt.title("Weight of Car, Price, and Width")
plt.ylabel("Price")
plt.xlabel("Weight of Car")

We can see here that cars that are heavier also tend to be longer and wider. This might be because smaller cars need fewer resources to be made, which decreases how much the car costs.

## Engine Type

In [None]:
column_percenteges(cars, "enginetype")

There is one value that takes up about 72% of all the observations in this column. Because of this, I will be keeping the majority value and the next most frequently occurring value, and changing all other observations to "Other".

In [None]:
keep_engine = cars["enginetype"].value_counts().index[:2]
cars["enginetype"] = np.where(cars["enginetype"].isin(keep_engine), cars["enginetype"], "Other")

In [None]:
column_percenteges(cars, "enginetype")

In [None]:
box_plot(cars, "enginetype")

Engines that were defined as not ohc or ohcf seem to lead to higher priced cars than ohc and ohcf engines do.

## Cylinder Number

In [None]:
cars["cylindernumber"].unique()

In [None]:
box_plot(cars, "cylindernumber")

It seems like the more cylinders added to a car, the higher the price of the car becomes.

In [None]:
cars.columns

In [None]:
# label encoding
# Ordinal codes follow the cylinder count: two < three < ... < twelve.
cylinder_codes = {
    "two": 0,
    "three": 1,
    "four": 2,
    "five": 3,
    "six": 4,
    "eight": 5,
    "twelve": 6,
}
# Look the column up by name rather than by position: the position of
# "cylindernumber" shifts whenever a column is added or dropped earlier in the
# notebook.  Values missing from the mapping become NaN, as before.
cylinders = cars["cylindernumber"].map(cylinder_codes).tolist()

In [None]:
cars["cylinder_ordinal"] = cylinders
cars.drop("cylindernumber", axis = 1, inplace = True)

In [None]:
cars["cylinder_ordinal"].describe()

## Engine Size

In [None]:
cars["enginesize"].describe()

In [None]:
lm_plot(cars, "enginesize")

It seems that keeping with the theme of bigger = more expensive, a bigger engine size leads to a more expensive car.

## Fuel System

In [None]:
cars["fuelsystem"].unique()

In [None]:
column_percenteges(cars, "fuelsystem")

Again we have a column where a majority of the instances are taken up by 2 values. I will keep the first two values and the rest will become "other".

In [None]:
keep_fuelsystem = cars["fuelsystem"].value_counts().index[:2]
cars["fuelsystem"] = np.where(cars["fuelsystem"].isin(keep_fuelsystem), cars["fuelsystem"], "other")

In [None]:
box_plot(cars, "fuelsystem")

In [None]:
sns.boxplot(x = "fuelsystem", y = "price", hue = "fueltype", data = cars)
plt.title("Fuel System and Price")
plt.legend(title = "Fuel Type")
plt.ylabel("Price")
plt.xlabel("Fuel System")

It looks like the mpfi fuel system leads to higher car prices than other fuel systems. We also see again that diesel is more expensive than gas for the observations coded as other, but there are no diesel fuel type cars with our other two fuel systems.

## Boreratio

In [None]:
cars["boreratio"].describe()

In [None]:
plt.hist(x = "boreratio", data = cars)
plt.title("Histogram of Boreratio")
plt.xlabel("Boreratio")

A majority of the values fall between 3 and 3.6.

## Stroke

In [None]:
cars["stroke"].describe()

In [None]:
plt.hist(x = "stroke", data = cars)
plt.title("Histogram of Stroke")
plt.xlabel("Stroke")

A majority of the values for stroke are also between 3 and 3.6.

## Compression Ratio

In [None]:
cars["compressionratio"].describe()

In [None]:
lm_plot(cars, "compressionratio")

The values for compression ratio seem to be either very high or very low. There also does not seem to be much, if any, difference in price between high and low values. A majority of the values are also around the mean of 9.

## Horse Power

In [None]:
lm_plot(cars, "horsepower")

Car prices increase as the horse power a car has increases. There don't seem to be any obvious outliers when looking at the graph.

## Peak RPM

In [None]:
cars["peakrpm"].describe()

In [None]:
lm_plot(cars, "peakrpm")

There seems to be a small decline in the price of a car as the peak rpm of the car increases.

In [None]:
lm_plot(cars, x_column = "horsepower", y_column = "peakrpm")

There does not seem to be any correlation between peak rpm and horsepower.

## City and High Way MPG

In [None]:
lm_plot(cars, "citympg")

In [None]:
lm_plot(cars, "highwaympg")

In [None]:
lm_plot(cars, x_column = "citympg", y_column = "highwaympg")

It is interesting that as city and high way mpg increase, the price of a car seems to decrease. This could be due to sports cars getting less mpg than other cars, while also costing much more. There is also a strong correlation between city mpg and highway mpg.

## Price (dependent variable)

In [None]:
plt.hist(x = "price", data = cars)
plt.title("Histogram of Price")
plt.xlabel("Price")

## Correlation Matrix

In [None]:
# Correlate the numeric columns only.  The frame still holds object-dtype
# columns, and recent pandas versions raise on those in DataFrame.corr()
# instead of silently skipping them.
corr_matrix = cars.select_dtypes(include = "number").corr()

plt.figure(figsize = (6,5))
sns.heatmap(corr_matrix, cmap = "coolwarm")

# The matrix also contains the dependent variable (price), so the title speaks
# of numeric variables rather than independent ones.
plt.title("Correlation Coefficients of Numeric Variables")

There seems to be some very strong correlation within the wheelbase/carlength/carwidth/carheight group and within the citympg/highwaympg pair of variables. Because of this, I will keep only one column from each of those sets (carlength and highwaympg) and drop the rest in order to try to prevent multicollinearity issues.

In [None]:
cars.drop(["wheelbase", "carwidth", "carheight", "citympg"], axis = 1, inplace = True)

In [None]:
cars.info()

# Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
X = cars.drop("price", axis = 1)
y = cars["price"]

In [None]:
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, train_size = .8, test_size = .2, random_state = 42)

In [None]:
# Split the feature columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical.
numeric_dtypes = ["int64", "float64"]
train_dtypes = X_train_full.dtypes

numeric_cols = [name for name, dtype in train_dtypes.items() if dtype in numeric_dtypes]
categorical_cols = [name for name, dtype in train_dtypes.items() if dtype == "object"]

# Numeric columns first, then categorical ones, for both splits.
my_cols = numeric_cols + categorical_cols
X_train, X_test = X_train_full[my_cols].copy(), X_test_full[my_cols].copy()

In [None]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown = "ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

## Linear Regression

In [None]:
linear_model = LinearRegression()

In [None]:
linear_pipeline = Pipeline(steps = [
    ("preprocessor", preprocessor),
    ("model", linear_model)
])

In [None]:
linear_pipeline.fit(X_train, y_train)

In [None]:
linear_preds = linear_pipeline.predict(X_test)

In [None]:
linear_mae = mean_absolute_error(y_test, linear_preds)
print(linear_mae)

In [None]:
linear_r2 = r2_score(y_test, linear_preds)
print(linear_r2)

## Stochastic Gradient Descent

In [None]:
# Fix the seed: SGD shuffles the training data between epochs, so an unseeded
# model reports different scores on every run.
sgd_model = SGDRegressor(random_state = 42)

In [None]:
sgd_pipeline = Pipeline(steps = [
    ("preprocessor", preprocessor),
    ("model", sgd_model)
])

In [None]:
sgd_params = {
    "model__penalty": ["l2", "l1", "elasticnet"],
    "model__max_iter": [500, 1000, 1500],
    "model__learning_rate": ["constant", "optimal", "invscaling", "adaptive"]
}

In [None]:
clf_sgd = GridSearchCV(sgd_pipeline, param_grid = sgd_params, cv = 5, n_jobs = -1)

In [None]:
clf_sgd.fit(X_train, y_train)

In [None]:
sgd_preds = clf_sgd.predict(X_test)

In [None]:
sgd_mae = mean_absolute_error(y_test, sgd_preds)
print(sgd_mae)

In [None]:
sgd_r2 = r2_score(y_test, sgd_preds)
print(sgd_r2)

## Random Forest

In [None]:
# Fix the seed so the bootstrap samples and feature draws -- and therefore the
# reported scores and the model ranking -- are reproducible across runs.
rf_model = RandomForestRegressor(random_state = 42)

In [None]:
rf_pipeline = Pipeline(steps = [
    ("preprocessor", preprocessor),
    ("model", rf_model)
])

In [None]:
rf_params = {
    "model__n_estimators": [100, 250, 500],
}

In [None]:
clf_rf = GridSearchCV(rf_pipeline, param_grid = rf_params, cv = 5, n_jobs = -1)

In [None]:
clf_rf.fit(X_train, y_train)

In [None]:
rf_preds = clf_rf.predict(X_test)

In [None]:
rf_mae = mean_absolute_error(y_test, rf_preds)
print(rf_mae)

In [None]:
rf_r2 = r2_score(y_test, rf_preds)
print(rf_r2)

## XGBoost

In [None]:
xgb_model = XGBRegressor()

In [None]:
xgb_pipeline = Pipeline(steps = [
    ("preprocessor", preprocessor),
    ("model", xgb_model)
])

In [None]:
xgb_params = {
    "model__n_estimators": [100, 250, 500],
    "model__learning_rate": [0.001, 0.01, 0.1]
}

In [None]:
clf_xgb = GridSearchCV(xgb_pipeline, param_grid = xgb_params, cv = 5)

In [None]:
clf_xgb.fit(X_train, y_train)

In [None]:
xgb_preds = clf_xgb.predict(X_test)

In [None]:
xgb_mae = mean_absolute_error(y_test, xgb_preds)
print(xgb_mae)

In [None]:
xgb_r2 = r2_score(y_test, xgb_preds)
print(xgb_r2)

# Compare Model Performance

In [None]:
fig, axes = plt.subplots(4, 1, figsize = (8, 10))

# One panel per model: predictions on x, actual test prices on y.
model_panels = [
    (linear_preds, "blue", "Linear Model"),
    (sgd_preds, "orange", "SGD Model"),
    (rf_preds, "red", "Random Forest Model"),
    (xgb_preds, "green", "XGBoost Model"),
]
for ax, (preds, color, label) in zip(axes, model_panels):
    sns.regplot(x = preds, y = y_test, ci = None, ax = ax, color = color, label = label)
    ax.legend(loc = "upper left")
    ax.set_ylabel("Actual Values")
    ax.set_xlim(0, 45000)  # same x range in every panel so they can be compared

# Only the bottom panel carries the x label.
axes[-1].set_xlabel("Model Predictions")

# A figure-level title replaces plt.title(..., y = 4.6), which attached the
# title to the last panel and pushed it upwards with a hand-tuned offset.
fig.suptitle("Comparing Model Predictions to Actual Values")
fig.tight_layout(rect = (0, 0, 1, 0.97))  # leave room for the figure title

From just comparing the models' predicted values to the actual values, it's easy to see that the Random Forest and XGBoost models performed better than the Linear and SGD models. We can also look at the mean absolute error and r^2 value to have a better comparison.

In [None]:
model_comparison_df = pd.DataFrame({"Model": ["Linear", "SGD", "Random Forest", "XGBoost"], "MAE": [linear_mae, sgd_mae, rf_mae, xgb_mae], 
                                    "R2": [linear_r2, sgd_r2, rf_r2, xgb_r2]})
model_comparison_df

In [None]:
fig, (ax0, ax1) = plt.subplots(1, 2, figsize = (12, 6))

# Left panel: mean absolute error per model; right panel: R2 per model.
sns.barplot(x = "Model", y = "MAE", data = model_comparison_df, ax = ax0)
sns.barplot(x = "Model", y = "R2", data = model_comparison_df, ax = ax1)

# Figure-level title centred over both panels; the previous plt.title call
# titled only the right-hand panel and shifted it left with a hand-tuned offset.
fig.suptitle("Comparing Model MAE and R2 Values")

We can see more clearly in the graph which models perform better. On the left, we have the mean absolute error, which is the average of the absolute differences between actual values and predicted values; it is much lower for the Random Forest and XGBoost models than for the Linear and SGD models. This means that the predictions of the Random Forest and XGBoost models are, on average, closer to the actual values. On the right, we have the r squared scores for each model, and can see that again, the Random Forest and XGBoost models perform better than the Linear and SGD models.

# Conclusion

Overall, we performed a brief EDA of the variables in the dataset, and ran four different models to try to predict the price of cars. We were able to determine the two best models that were able to predict car price accurately. Out of the two models, the Random Forest model has a lower mean absolute error and a higher r squared. Because of this, we will say that the Random Forest model performed the best.