In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [5]:
# Import data from github

url = "https://raw.githubusercontent.com/rpulipaka-22/Car-Price-Linear-Regression-ML/main/CarPrice_Assignment.csv"
data = pd.read_csv(url)

data

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [6]:
# Compute the correlations of each numeric column to the "price" column

data.corr()["price"].sort_values(ascending=False)

price               1.000000
enginesize          0.874145
curbweight          0.835305
horsepower          0.808139
carwidth            0.759325
carlength           0.682920
wheelbase           0.577816
boreratio           0.553173
carheight           0.119336
stroke              0.079443
compressionratio    0.067984
symboling          -0.079978
peakrpm            -0.085267
car_ID             -0.109093
citympg            -0.685751
highwaympg         -0.697599
Name: price, dtype: float64

In [7]:
# We will only work with columns that have moderate to high correlation with the "price" column
# (i.e. columns with a correlation coefficient of 0.6 and above)

dropped_data = data.drop(["highwaympg", "citympg", "car_ID", "peakrpm", "symboling",
                          "compressionratio", "stroke", "carheight", "boreratio",
                          "wheelbase"], axis=1)

# We will also remove all non numeric columns as those will not be used in creating the Linear Regression model

data = dropped_data._get_numeric_data()

data

Unnamed: 0,carlength,carwidth,curbweight,enginesize,horsepower,price
0,168.8,64.1,2548,130,111,13495.0
1,168.8,64.1,2548,130,111,16500.0
2,171.2,65.5,2823,152,154,16500.0
3,176.6,66.2,2337,109,102,13950.0
4,176.6,66.4,2824,136,115,17450.0
...,...,...,...,...,...,...
200,188.8,68.9,2952,141,114,16845.0
201,188.8,68.8,3049,141,160,19045.0
202,188.8,68.9,3012,173,134,21485.0
203,188.8,68.9,3217,145,106,22470.0


In [11]:
# We will now construct a Linear Regression model

data_np = data.to_numpy()

X = data_np[:, :-1]
y = data_np[:, -1]

# Let's create one model and test its accuracy using data in a train-test split of 80% and 20% respectively

fX_train, fX_test, fy_train, fy_test = train_test_split(X, y, test_size=0.2)

fModel = LinearRegression()
fModel.fit(fX_train, fy_train)
fAccuracy = fModel.score(fX_test, fy_test)
fPredictions = fModel.predict(fX_test)

In [17]:
# Let's print the details of the first model generated

print("Model Coefficients: ", fModel.coef_)
print("Model Intercept: ", fModel.intercept_)
print("Mean Absolute Error of the Model: ", mean_absolute_error(fPredictions, fy_test))
print("Mean Squared Error of the Model: ", mean_squared_error(fPredictions, fy_test))
print("Model Accuracy Score: ", fAccuracy)

Model Coefficients:  [-48.25189228 517.01040513   3.22917037  89.39914851  51.33019547]
Model Intercept:  -37186.589291020166
Mean Absolute Error of the Model:  2742.687354560218
Mean Squared Error of the Model:  13207598.56190566
Model Accuracy Score:  0.4745842354625447


In [19]:
# We can improve this model by repeatedly generating new models with a
# different set of training data and select the highest accuracy score

# Let's initialize the details for the best model

bestModel = fModel
bestAccuracy = fAccuracy
bX_train, bX_test, by_train, by_test = fX_train, fX_test, fy_train, fy_test

for test in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model = LinearRegression()
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)

    if accuracy > bestAccuracy:
        bestAccuracy = accuracy
        bestModel = model
        bX_train, bX_test, by_train, by_test = X_train, X_test, y_train, y_test

bestPredictions = bestModel.predict(bX_test)

In [20]:
# Let's print the details of the best model generated

print("Best Model Coefficients: ", bestModel.coef_)
print("Best Model Intercept: ", bestModel.intercept_)
print("Mean Absolute Error of the Best Model: ", mean_absolute_error(bestPredictions, by_test))
print("Mean Squared Error of the Best Model: ", mean_squared_error(bestPredictions, by_test))
print("Best Model Accuracy Score: ", bestAccuracy)

Best Model Coefficients:  [ -2.44445147 455.87992335   2.28716327  82.17107625  53.18235669]
Best Model Intercept:  -38155.5972619735
Mean Absolute Error of the Best Model:  2237.5994019017867
Mean Squared Error of the Best Model:  9317349.231568217
Best Model Accuracy Score:  0.8793910608540554


In [22]:
# We see that we now have a model and the associated training and test
# data that yields the highest accuracy

# Let's create a dataframe that displays both the actual values of the test
# data as well as the predicted values using the model

predictions_dataX = pd.DataFrame(bX_test, columns=["Car Length", "Car Width", "Curb Weight", "Engine Size", "Horsepower"])
predictions_dataY = pd.DataFrame(by_test, columns=["Actual Price"])
predictions_dataP = pd.DataFrame(bestPredictions, columns=["Predicted Price"])

final_data = pd.concat([predictions_dataX, predictions_dataY, predictions_dataP], axis=1)

final_data.head()

Unnamed: 0,Car Length,Car Width,Curb Weight,Engine Size,Horsepower,Actual Price,Predicted Price
0,208.1,71.7,3900.0,308.0,184.0,40960.0,38036.484775
1,171.7,65.5,2275.0,109.0,85.0,8495.0,9965.269478
2,192.7,71.4,2844.0,136.0,110.0,17710.0,17453.201424
3,188.8,67.2,3045.0,130.0,162.0,18420.0,18280.215014
4,184.6,66.5,3060.0,181.0,152.0,13499.0,21664.574535


In [25]:
# To confirm the accuracy between the "Actual Price" column and the "Predicted Price" column
# we can determine the r2 score. The closer this value is to 1 the more accurate this model is

print("R2 Score: ", r2_score(by_test, bestPredictions))

R2 Score:  0.8793910608540554
