# Import Libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# 1. Problem Statement

# 2. Data Gathering

In [2]:
df = pd.read_csv('autos.csv')
df

Unnamed: 0,symbol,loss,make,fuel,aspir,doors,style,drive,eng_loc,wb,...,eng_cc,fuel.sys,bore,stroke,comp.ratio,hp,rpm,city_mpg,hw_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
201,-1,95.0,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
202,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
203,-1,95.0,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
symbol,3,3,1,2,2
loss,,,,164.0,164.0
make,alfa-romero,alfa-romero,alfa-romero,audi,audi
fuel,gas,gas,gas,gas,gas
aspir,std,std,std,std,std
doors,two,two,two,four,four
style,convertible,convertible,hatchback,sedan,sedan
drive,rwd,rwd,rwd,fwd,4wd
eng_loc,front,front,front,front,front
wb,88.6,88.6,94.5,99.8,99.4


In [4]:
df.shape

(205, 26)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   symbol      205 non-null    int64  
 1   loss        164 non-null    float64
 2   make        205 non-null    object 
 3   fuel        205 non-null    object 
 4   aspir       205 non-null    object 
 5   doors       203 non-null    object 
 6   style       205 non-null    object 
 7   drive       205 non-null    object 
 8   eng_loc     205 non-null    object 
 9   wb          205 non-null    float64
 10  length      205 non-null    float64
 11  width       205 non-null    float64
 12  height      205 non-null    float64
 13  weight      205 non-null    int64  
 14  eng_type    205 non-null    object 
 15  cylinders   205 non-null    object 
 16  eng_cc      205 non-null    int64  
 17  fuel.sys    205 non-null    object 
 18  bore        201 non-null    float64
 19  stroke      201 non-null    f

In [6]:
df['cylinders'].value_counts()

four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: cylinders, dtype: int64

# Encoding

# 1. Label Encoding

In [7]:
df['cylinders'].unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [8]:
df['cylinders'].replace({'four':4,'six':6,'five':5,'three':3,'twelve':12,'two':2,
                         'eight':8}, inplace = True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   symbol      205 non-null    int64  
 1   loss        164 non-null    float64
 2   make        205 non-null    object 
 3   fuel        205 non-null    object 
 4   aspir       205 non-null    object 
 5   doors       203 non-null    object 
 6   style       205 non-null    object 
 7   drive       205 non-null    object 
 8   eng_loc     205 non-null    object 
 9   wb          205 non-null    float64
 10  length      205 non-null    float64
 11  width       205 non-null    float64
 12  height      205 non-null    float64
 13  weight      205 non-null    int64  
 14  eng_type    205 non-null    object 
 15  cylinders   205 non-null    int64  
 16  eng_cc      205 non-null    int64  
 17  fuel.sys    205 non-null    object 
 18  bore        201 non-null    float64
 19  stroke      201 non-null    f

# Label Encoding using LabelEncoder

In [10]:
df['cylinders']

0      4
1      4
2      6
3      4
4      5
      ..
200    4
201    4
202    6
203    6
204    4
Name: cylinders, Length: 205, dtype: int64

In [11]:
l1 = ['four', 'six', 'five', 'three', 'twelve', 'two', 'eight']
sorted(l1)

['eight', 'five', 'four', 'six', 'three', 'twelve', 'two']

In [12]:
labelEnc = LabelEncoder()
labelEnc.fit_transform(df['cylinders'])

array([2, 2, 4, 2, 3, 3, 3, 3, 3, 3, 2, 2, 4, 4, 4, 4, 4, 4, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 4, 4, 6, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2,
       2, 3, 3, 3, 3, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 5, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 4, 4, 2], dtype=int64)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   symbol      205 non-null    int64  
 1   loss        164 non-null    float64
 2   make        205 non-null    object 
 3   fuel        205 non-null    object 
 4   aspir       205 non-null    object 
 5   doors       203 non-null    object 
 6   style       205 non-null    object 
 7   drive       205 non-null    object 
 8   eng_loc     205 non-null    object 
 9   wb          205 non-null    float64
 10  length      205 non-null    float64
 11  width       205 non-null    float64
 12  height      205 non-null    float64
 13  weight      205 non-null    int64  
 14  eng_type    205 non-null    object 
 15  cylinders   205 non-null    int64  
 16  eng_cc      205 non-null    int64  
 17  fuel.sys    205 non-null    object 
 18  bore        201 non-null    float64
 19  stroke      201 non-null    f

In [13]:
Test = ['Low','Medium','Medium','High','High','Low']
sorted(Test)

['High', 'High', 'Low', 'Low', 'Medium', 'Medium']

In [14]:
['Low','Medium','Medium','High','High','Low']
   1      2         2       0     0      1
   

IndentationError: unexpected indent (Temp/ipykernel_3688/2454249855.py, line 2)

In [18]:
df_test = pd.DataFrame({'Test':['Low','Medium','Medium','High','High','Low']})
labelEnc.fit_transform(df_test['Test'])

#1   2   2   0   0   1

array([1, 2, 2, 0, 0, 1])

In [16]:
df_test = pd.DataFrame({'Test':['Low','Medium','Medium','High','High','Low']})
labelEnc.fit_transform('Test')

ValueError: y should be a 1d array, got an array of shape () instead.

# One Hot Encoding

In [None]:
Low 
Medium
High

           Low     Medium   High
    
Low         1        0       0
Medium      0        1       0
Medium      0        1       0
High        0        0       1
High        0        0       1
Low         1        0       0

In [19]:
df_test = pd.DataFrame({'Test':['Low','Medium','Medium','High','High','Low']})

df_new = pd.get_dummies(df_test, columns = ['Test'])
df_new

Unnamed: 0,Test_High,Test_Low,Test_Medium
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0
4,1,0,0
5,0,1,0


In [20]:
df_test = pd.DataFrame({'Test':['Low','Medium','Medium','High','High','Low']})

df_new = pd.get_dummies(df_test, columns = ['Test'],drop_first=True)
df_new

Unnamed: 0,Test_Low,Test_Medium
0,1,0
1,0,1
2,0,1
3,0,0
4,0,0
5,1,0


In [21]:
df_test = pd.DataFrame({'Test':['Low','Medium','Medium','High','High','Low']})

df_new = pd.get_dummies(df_test, columns = ['Test'],drop_first=True,prefix = 'test')
df_new

Unnamed: 0,test_Low,test_Medium
0,1,0
1,0,1
2,0,1
3,0,0
4,0,0
5,1,0


In [22]:
df_test

Unnamed: 0,Test
0,Low
1,Medium
2,Medium
3,High
4,High
5,Low


In [23]:
df_new

Unnamed: 0,test_Low,test_Medium
0,1,0
1,0,1
2,0,1
3,0,0
4,0,0
5,1,0


In [24]:
onehotenc = OneHotEncoder()
df_onehot = pd.DataFrame(onehotenc.fit_transform(df_test[['Test']]).toarray(),dtype = int)
df_onehot

Unnamed: 0,0,1,2
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0
4,1,0,0
5,0,1,0


In [31]:
onehotenc = OneHotEncoder()
df_onehot = pd.DataFrame(onehotenc.fit_transform(df_test[['Test']]).toarray())
df_onehot

Unnamed: 0,0,1,2
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [29]:
type(onehotenc.fit_transform(df_test[['Test']]))

scipy.sparse.csr.csr_matrix

In [28]:
df_test

Unnamed: 0,Test
0,Low
1,Medium
2,Medium
3,High
4,High
5,Low
