In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


In [3]:
df = pd.read_csv('CarPrice_Assignment.csv')
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [4]:
df.shape

(205, 26)

In [5]:
df.isnull().sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [7]:
df.drop(['car_ID','CarName'], axis=1, inplace=True) # axis=1 means column

In [8]:
df.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [9]:
df['fueltype'].value_counts()

fueltype
gas       185
diesel     20
Name: count, dtype: int64

In [10]:
df['fuelsystem'].value_counts()

fuelsystem
mpfi    94
2bbl    66
idi     20
1bbl    11
spdi     9
4bbl     3
mfi      1
spfi     1
Name: count, dtype: int64

In [11]:
object_list = []
for col in df.columns:
    if df[col].dtype == 'object':
        print(df[col].value_counts())
        object_list.append(col)

fueltype
gas       185
diesel     20
Name: count, dtype: int64
aspiration
std      168
turbo     37
Name: count, dtype: int64
doornumber
four    115
two      90
Name: count, dtype: int64
carbody
sedan          96
hatchback      70
wagon          25
hardtop         8
convertible     6
Name: count, dtype: int64
drivewheel
fwd    120
rwd     76
4wd      9
Name: count, dtype: int64
enginelocation
front    202
rear       3
Name: count, dtype: int64
enginetype
ohc      148
ohcf      15
ohcv      13
dohc      12
l         12
rotor      4
dohcv      1
Name: count, dtype: int64
cylindernumber
four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: count, dtype: int64
fuelsystem
mpfi    94
2bbl    66
idi     20
1bbl    11
spdi     9
4bbl     3
mfi      1
spfi     1
Name: count, dtype: int64


In [12]:
object_list

['fueltype',
 'aspiration',
 'doornumber',
 'carbody',
 'drivewheel',
 'enginelocation',
 'enginetype',
 'cylindernumber',
 'fuelsystem']

In [13]:
# data encoding
# mannual method of encoding data.
# df.replace(
#     'fueltype': {'gas':0, }
# )

from sklearn.preprocessing import LabelEncoder

labelenc = LabelEncoder()

In [14]:
# now encode the label using labelEncoder
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = labelenc.fit_transform(df[col])

In [15]:
df.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,1,0,1,0,2,0,88.6,168.8,64.1,...,130,5,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,1,0,1,0,2,0,88.6,168.8,64.1,...,130,5,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,1,0,1,2,2,0,94.5,171.2,65.5,...,152,5,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,1,0,0,3,1,0,99.8,176.6,66.2,...,109,5,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,1,0,0,3,0,0,99.4,176.6,66.4,...,136,5,3.19,3.4,8.0,115,5500,18,22,17450.0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   symboling         205 non-null    int64  
 1   fueltype          205 non-null    int64  
 2   aspiration        205 non-null    int64  
 3   doornumber        205 non-null    int64  
 4   carbody           205 non-null    int64  
 5   drivewheel        205 non-null    int64  
 6   enginelocation    205 non-null    int64  
 7   wheelbase         205 non-null    float64
 8   carlength         205 non-null    float64
 9   carwidth          205 non-null    float64
 10  carheight         205 non-null    float64
 11  curbweight        205 non-null    int64  
 12  enginetype        205 non-null    int64  
 13  cylindernumber    205 non-null    int64  
 14  enginesize        205 non-null    int64  
 15  fuelsystem        205 non-null    int64  
 16  boreratio         205 non-null    float64
 1

In [17]:
x = df.drop('price', axis=1)
y = df['price']

In [18]:
# split train and test data before scalling:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2)

In [19]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [20]:
model_LR = LinearRegression()

model_LR.fit(x_train_scaled, y_train)

In [21]:
print('Train R2 Score Linear Regression:(0-1)')
y_train_pred = model_LR.predict(x_train_scaled)
print(r2_score(y_train, y_train_pred))

print('Test R2 Score Linear Regression:(0-1))')
y_pred = model_LR.predict(x_test_scaled)
print(r2_score(y_test, y_pred))

Train R2 Score Linear Regression:(0-1)
0.8904263507014593
Test R2 Score Linear Regression:(0-1))
0.733909415358835


In [22]:
model_rfr = RandomForestRegressor()
model_rfr.fit(x_train_scaled, y_train)

In [28]:
y_train_pred_rfr = model_rfr.predict(x_train_scaled)
y_pred_rfr = model_rfr.predict(x_test_scaled)
print(f'Train R2 score for Random Forest: {r2_score(y_train, y_train_pred_rfr)}')
print(f'Test R2 score for Random Forest: {r2_score(y_test, y_pred_rfr)}')

Train R2 score for Random Forest: 0.9894520087924383
Test R2 score for Random Forest: 0.852251239869314


In [25]:
y_pred_rfr

array([10066.395     ,  5857.59      , 10201.92666667,  8997.01      ,
        7509.175     ,  9004.805     , 14591.81      , 35303.07357143,
        6625.47      ,  8535.06666667, 18571.74      ,  7387.42      ,
       17326.085     , 11543.61      ,  6866.73      , 16332.49      ,
       17533.84      , 16114.32      ,  8175.26      ,  8206.75      ,
       18701.78      , 16987.53      ,  9297.6       ,  6680.63      ,
       10167.49666667, 15974.35      ,  6347.22      ,  6416.3       ,
        7743.415     ,  8238.55      , 35505.12857143, 11206.135     ,
       16789.76      ,  6619.06      , 13111.44083333,  9264.77      ,
       18166.91      ,  9418.48      , 14609.30083333,  8218.99      ,
        9290.79166667])

In [26]:
y_test

3      13950.000
76      5389.000
10     16430.000
144     9233.000
140     7603.000
63     10795.000
171    11549.000
72     35056.000
26      7609.000
38      9095.000
9      17859.167
97      7999.000
14     24565.000
57     13645.000
96      7499.000
4      17450.000
199    18950.000
181    15750.000
79      7689.000
163     8058.000
7      18920.000
106    18399.000
88      9279.000
94      7299.000
11     16925.000
109    12440.000
24      6229.000
77      6189.000
159     7788.000
185     8195.000
49     36000.000
41     12945.000
155     8778.000
78      6669.000
149    11694.000
190     9980.000
204    22625.000
174    10698.000
65     18280.000
23      7957.000
62     10245.000
Name: price, dtype: float64