# 🚗 Car Sales Price Prediction

This notebook extracts data from a csv file containing sales details of cars and tries to predict the the price of car in dollars.

In [17]:
# Standard imports 
import pandas as pd
import numpy as np

In [18]:
# Importing the dataset
df = pd.read_csv("./data/car-sales.csv")

In [19]:
# Viewing the head of the dataset
df.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [20]:
df.shape

(1000, 5)

In [21]:
df.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [22]:
df.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [23]:
df.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [24]:
# Dropping the rows with no labels
df.dropna(subset=["Price"], inplace = True)

In [25]:
df.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [26]:
# Splitting the data into X and Y
X = df.drop("Price", axis = 1)
y = df["Price"]

In [27]:
# Splitting the data into Training and Test Sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [28]:
X_train.shape, X_test.shape

((760, 4), (190, 4))

In [29]:
X_train.isna().sum()

Make             37
Colour           39
Odometer (KM)    37
Doors            39
dtype: int64

In [30]:
# Filling the missing values in Training set
X_train["Make"].fillna("missing", inplace = True)
X_train["Colour"].fillna("missing", inplace = True)
X_train["Odometer (KM)"].fillna(X_train["Odometer (KM)"].median(), inplace = True)
X_train["Doors"].fillna(4, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [31]:
X_train.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
dtype: int64

In [32]:
# Filling the missing values in Test set
X_test["Make"].fillna("missing", inplace = True)
X_test["Colour"].fillna("missing", inplace = True)
X_test["Odometer (KM)"].fillna(X_train["Odometer (KM)"].median(), inplace = True)
X_test["Doors"].fillna(4, inplace = True)

In [33]:
X_test.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
dtype: int64

In [34]:
y_train.isna().sum()

0

In [35]:
y_test.isna().sum()

0

Great!!! All the missing values have been removed from the dataset. 

It's time to convert the non-numerical data into categories.

In [36]:
df.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [38]:
X_train.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
dtype: object

In [122]:
# Converting the Categories to Numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour"]
onehot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                   onehot,
                                   categorical_features)],
                               remainder = "passthrough")
transformed_X_train = transformer.fit_transform(X_train)
transformed_X_test = transformer.fit_transform(X_test)

In [42]:
transformed_X_train.shape, transformed_X_test.shape

((760, 15), (190, 15))

In [71]:
from sklearn.ensemble import RandomForestRegressor

In [123]:
reg = RandomForestRegressor()
reg.fit(transformed_X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [124]:
# Score on training data
reg.score(transformed_X_train, y_train)

0.8732058789717575

In [125]:
# Score on test data 
reg.score(transformed_X_test, y_test)

0.25387902250054295

In [108]:
y_preds = reg.predict(transformed_X_test)

In [109]:
y_preds[:10], y_test[:10]

(array([13840.78      , 11618.73633333, 10475.26      ,  8079.66410897,
        12557.74      , 19322.76      , 17708.38      , 19261.8       ,
         8734.83      ,  9657.49      ]),
 205    12553.0
 518    12311.0
 375    12434.0
 93      6283.0
 497     4200.0
 298     8460.0
 458    16319.0
 986    26882.0
 472    10196.0
 691     5843.0
 Name: Price, dtype: float64)

In [110]:
from sklearn.metrics import mean_absolute_error as mea

In [111]:
mea(y_test, y_preds)

5930.907456735224