1. Problem Formulation
- We want to understand the factors that affect car prices.
- We want to be able to predict car prices from the variables in our data.

In [40]:
import os # provides functions for interacting with the operating system
import numpy as np # fundamental package for scientific computing with Python
import pandas as pd # provides high-performance, easy-to-use data structures and data analysis tools
import matplotlib.pyplot as plt # provides a MATLAB-like plotting framework
import seaborn as sns # provides a high-level interface for drawing attractive and informative statistical graphics
from sklearn.linear_model import LinearRegression # Linear Regression
from sklearn.model_selection import train_test_split # Split arrays or matrices into random train and test subsets
from sklearn.metrics import mean_squared_error # Mean squared error regression loss
from sklearn.metrics import r2_score # R^2 (coefficient of determination) regression score function
from sklearn.metrics import mean_absolute_error # Mean absolute error regression loss
from sklearn.metrics import explained_variance_score # Explained variance regression score function
from math import sqrt # Return the square root of x

%matplotlib inline

# Change scientific notation to decimal
pd.options.display.float_format = '{:.2f}'.format

# Increase the size of sns plots
sns.set(rc={'figure.figsize':(12,10)})

# View all dataframes
pd.set_option('display.max_columns', None)

# Remove warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

2. Load the raw data

In [41]:
# Read the car-price dataset (path is relative to the notebook's working directory)
raw_data = pd.read_csv('car_price.csv')

# Report the dimensions of the frame as (rows, columns)
print('Shape of the data: ', raw_data.shape)

# Preview the first five records
raw_data.head(5)

Shape of the data:  (205, 26)


Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


3. Data Preprocessing

- 3.1 EDA (Exploratory Data Analysis)

In [42]:
# Report how many distinct values each column holds. The values themselves
# are listed only for low-cardinality columns (10 or fewer distinct entries).

for feature in raw_data.columns:
  distinct = raw_data[feature].unique()
  n_distinct = len(distinct)
  if n_distinct > 10:
    # Too many values to list usefully: print the count only
    print('The feature {} has {} unique values'.format(feature, n_distinct))
    continue
  print('The feature {} has {} unique values: {}'.format(feature, n_distinct, distinct))


The feature car_ID has 205 unique values
The feature symboling has 6 unique values: [ 3  1  2  0 -1 -2]
The feature CarName has 147 unique values
The feature fueltype has 2 unique values: ['gas' 'diesel']
The feature aspiration has 2 unique values: ['std' 'turbo']
The feature doornumber has 2 unique values: ['two' 'four']
The feature carbody has 5 unique values: ['convertible' 'hatchback' 'sedan' 'wagon' 'hardtop']
The feature drivewheel has 3 unique values: ['rwd' 'fwd' '4wd']
The feature enginelocation has 2 unique values: ['front' 'rear']
The feature wheelbase has 53 unique values
The feature carlength has 75 unique values
The feature carwidth has 44 unique values
The feature carheight has 49 unique values
The feature curbweight has 171 unique values
The feature enginetype has 7 unique values: ['dohc' 'ohcv' 'ohc' 'l' 'rotor' 'ohcf' 'dohcv']
The feature cylindernumber has 7 unique values: ['four' 'six' 'five' 'three' 'twelve' 'two' 'eight']
The feature enginesize has 44 unique value

3.2 Check for missing values

In [43]:
# Number of missing entries per column, smallest counts first
(
    raw_data
    .isna()
    .sum()
    .sort_values()
)

car_ID              0
citympg             0
peakrpm             0
horsepower          0
compressionratio    0
stroke              0
boreratio           0
fuelsystem          0
enginesize          0
cylindernumber      0
enginetype          0
curbweight          0
carheight           0
carwidth            0
carlength           0
wheelbase           0
enginelocation      0
drivewheel          0
carbody             0
doornumber          0
aspiration          0
fueltype            0
CarName             0
symboling           0
highwaympg          0
price               0
dtype: int64

Ways of dealing with missing data
- Delete the whole observation
- Replace the null value with another value (e.g., mean, rolling mean, last day's value, closest day's value, mode, median, zero, etc.)

In [57]:
# Illustration only: the missing-value check above reports 0 nulls in every
# column, so the snippets below show how nulls could be introduced and then
# handled. Every statement is commented out; running this cell does nothing.
# NOTE(review): the `raw_data['price2'].iloc[i] = ...` and
# `raw_data['price2'][mask] = ...` forms are chained assignments; if these
# lines are ever re-enabled, prefer a single `.loc[rows, 'price2'] = ...`.

# Manually creating null values
# raw_data.loc[raw_data['price'] == '?', 'price'] = np.nan
# raw_data['price2'] = raw_data['price'].replace('?', np.nan)
#  OR change some elements to null
# raw_data['price2'] = raw_data['price']
# raw_data['price2'].iloc[0] = np.nan
# raw_data['price2'].iloc[1] = np.nan
# raw_data['price2'].iloc[2] = np.nan

# Drop the rows with null values
# raw_data = raw_data.dropna(subset=['price2'])

# Or update with the mean value
# raw_data['price2'] = raw_data['price2'].fillna(raw_data['price2'].mean())
# raw_data['price2'][raw_data['price2'].isna()] = raw_data['price2'].mean()

# Drop the price2 column
# raw_data = raw_data.drop('price2', axis=1)
# del raw_data['price2']
# raw_data

RangeIndex(start=203, stop=205, step=1)