In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge
import statsmodels.formula.api as sm
import matplotlib.pylab as plt

from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score


no display found. Using non-interactive Agg backend


Business Goal: Predict the price the dealership will get for a used Toyota Corolla.

In [2]:
# Load the Toyota Corolla listings from the CSV file in the working directory
corolla_df = pd.read_csv(filepath_or_buffer="ToyotaCorolla.csv")


In [3]:
# Predictor columns (independent variables) and the response column
# (dependent variable) used by the regression.
features = [
    "Age_08_04", "KM", "Fuel_Type", "HP", "Met_Color",
    "Automatic", "cc", "Doors", "Quarterly_Tax", "Weight",
]
target = "Price"

# Restrict the analysis to the first 1000 rows of the dataset
corolla_df = corolla_df.head(1000)

# X: predictors with categorical columns one-hot encoded; drop_first=True
# drops the first level of each encoded column.
predictor_frame = corolla_df[features]
X = pd.get_dummies(predictor_frame, drop_first=True)

# y: the response column
y = corolla_df[target]


In [4]:
# Partition the data into training (60%) and validation (40%) sets.
# random_state pins the shuffle so that Restart Kernel -> Run All reproduces
# the same partition; without it the fitted coefficients and every error
# metric reported below change from run to run.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.4, random_state=1
)


In [5]:
# Multiple linear regression of price on the encoded predictors.
# fit() returns the estimator itself, so construction and fitting are chained.
car_lm = LinearRegression().fit(train_X, train_y)

# Report the fitted intercept
print(f"Intercept: {car_lm.intercept_}")

# Pair each predictor column with its fitted coefficient and report the table
feature_names = X.columns
coef = pd.DataFrame({"Feature": feature_names, "Coefficient": car_lm.coef_})
print(coef)

# Performance measures on the training partition
train_pred = car_lm.predict(train_X)
regressionSummary(train_y, train_pred)


Intercept: -4824.780628268925
             Feature  Coefficient
0          Age_08_04  -130.106008
1                 KM    -0.019325
2                 HP    37.396794
3          Met_Color    80.237538
4          Automatic   537.515864
5                 cc    -0.013806
6              Doors    30.214286
7      Quarterly_Tax    13.200760
8             Weight    15.830310
9   Fuel_Type_Diesel   797.955469
10  Fuel_Type_Petrol  2016.085270

Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 1346.6565
            Mean Absolute Error (MAE) : 1004.2050
          Mean Percentage Error (MPE) : -0.9826
Mean Absolute Percentage Error (MAPE) : 8.9189


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [6]:
# Score the validation partition with the fitted model
car_lm_pred = car_lm.predict(valid_X)

# Tabulate predicted price, actual price, and residual (actual - predicted)
result_df = pd.DataFrame(
    {
        "Predicted Values": car_lm_pred,
        "Actual Values": valid_y,
    }
)
result_df["Residual"] = result_df["Actual Values"] - result_df["Predicted Values"]

print(result_df)

# Performance measures on the validation partition
regressionSummary(valid_y, car_lm_pred)


     Predicted Values  Actual Values     Residual
75       16753.480553          16950   196.519447
170      17611.596340          18245   633.403660
971       8150.844917          10495  2344.155083
960      14590.950771           9390 -5200.950771
577      11709.769644          11950   240.230356
..                ...            ...          ...
185      18652.444403          18245  -407.444403
224      13724.135121          12450 -1274.135121
12       20641.557066          19600 -1041.557066
268      15240.005836          14750  -490.005836
254      12671.739135           9940 -2731.739135

[400 rows x 3 columns]

Regression statistics

                      Mean Error (ME) : 65.5758
       Root Mean Squared Error (RMSE) : 1380.9519
            Mean Absolute Error (MAE) : 1061.2200
          Mean Percentage Error (MPE) : -0.7748
Mean Absolute Percentage Error (MAPE) : 9.5151


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
