<a href="https://colab.research.google.com/github/polock11/bmw_used_cars_analysis/blob/main/bmw_used_car_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Libraries for loading dataset and machine learning

In [35]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

#Loading dataset

In [69]:
df = pd.read_csv('bmw.csv')
df.sample(10)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize
852,X2,2019,23000,Semi-Auto,929,Petrol,40.4,2.0
796,5 Series,2016,18385,Semi-Auto,45356,Diesel,51.4,3.0
9575,X3,2014,15159,Automatic,77129,Diesel,54.3,2.0
9297,3 Series,2015,7995,Manual,101215,Diesel,65.7,2.0
2212,1 Series,2016,12475,Manual,17550,Petrol,56.5,1.5
458,X2,2019,27000,Automatic,49,Diesel,50.4,2.0
5073,1 Series,2016,11000,Manual,38810,Petrol,53.3,1.5
1601,5 Series,2016,15991,Semi-Auto,21476,Diesel,62.8,2.0
5671,3 Series,2019,32990,Automatic,9892,Diesel,48.7,2.0
9632,X3,2016,14299,Automatic,83354,Diesel,54.3,2.0


#Analysis of the dataset

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10781 entries, 0 to 10780
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         10781 non-null  object 
 1   year          10781 non-null  int64  
 2   price         10781 non-null  int64  
 3   transmission  10781 non-null  object 
 4   mileage       10781 non-null  int64  
 5   fuelType      10781 non-null  object 
 6   mpg           10781 non-null  float64
 7   engineSize    10781 non-null  float64
dtypes: float64(2), int64(3), object(3)
memory usage: 673.9+ KB


In [71]:
df.describe()

Unnamed: 0,year,price,mileage,mpg,engineSize
count,10781.0,10781.0,10781.0,10781.0,10781.0
mean,2017.078935,22733.408867,25496.98655,56.399035,2.167767
std,2.349038,11415.528189,25143.192559,31.336958,0.552054
min,1996.0,1200.0,1.0,5.5,0.0
25%,2016.0,14950.0,5529.0,45.6,2.0
50%,2017.0,20462.0,18347.0,53.3,2.0
75%,2019.0,27940.0,38206.0,62.8,2.0
max,2020.0,123456.0,214000.0,470.8,6.6


In [72]:
#getting all categorical features 
object_feature = []

for col in df.columns:
    if df[col].dtypes == 'O':
        object_feature.append(col)

print(object_feature)

['model', 'transmission', 'fuelType']


#Separating numeric and target variables

In [73]:
#droping object variables with the target variable
X = df.drop(['model', 'transmission', 'fuelType', 'price'], axis = 1)
#sroting the target variable
y = df['price']

display(X.head())
display(y.head())

Unnamed: 0,year,mileage,mpg,engineSize
0,2014,67068,57.6,2.0
1,2018,14827,42.8,2.0
2,2016,62794,51.4,3.0
3,2017,26676,72.4,1.5
4,2014,39554,50.4,3.0


0    11200
1    27000
2    16000
3    12750
4    14500
Name: price, dtype: int64

#Splitting data for training and testing

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(8624, 4) (2157, 4) (8624,) (2157,)


#Applying Liniear Regression

In [75]:
model = LinearRegression()
model = model.fit(X_train, y_train)

#Predicting for the test set

In [76]:
y_pred = model.predict(X_test)

temp_df = pd.DataFrame({'actual price': y_test,
                        'predicted price': y_pred})

temp_df.head(10)

Unnamed: 0,actual price,predicted price
8728,15300,16192.687561
761,15495,11347.810745
7209,39875,36923.407157
6685,21730,22092.396087
8548,13799,19530.333962
9385,24499,14604.318002
6085,42202,34702.918794
1885,32400,27280.804653
4921,28990,27190.786533
1121,15790,22688.966163


#Different KPI's

In [77]:
#mean_absolute_error
mae = np.round(mean_absolute_error(y_test, y_pred), 2)
print('Mean Absolute Error: '+ str(mae))

#mean_absolute_percentage_error
mape = np.round(mean_absolute_percentage_error(y_test, y_pred), 2)*100
print('Mean Absolute Percentage Error: '+ str(mape))

#mean_squared_error
mse = np.round(mean_squared_error(y_test, y_pred), 2)
print('Mean Squared Error: '+ str(mse))

#root_mean_squared_error
rmse = np.round(np.sqrt(mse),2)
print('Root Mean Squared Error: '+ str(rmse))

#r2_score
r2_error = np.round(r2_score(y_test, y_pred), 2)
print('R2_Score: '+ str(r2_error))

Mean Absolute Error: 4644.49
Mean Absolute Percentage Error: 23.0
Mean Squared Error: 46236600.68
Root Mean Squared Error: 6799.75
R2_Score: 0.64
