# OneHotEncoding

## Part 1 - Data Preprocessing

### Importing the dataset

In [519]:
import numpy as np
import pandas as pd

In [520]:
# Location of the raw data, kept in one place so it is easy to change.
DATA_PATH = "OneHot.csv"
df = pd.read_csv(DATA_PATH)

In [521]:
# Preview the first rows to see which columns were read from the CSV.
df.head()

Unnamed: 0.1,Unnamed: 0,brand,owner,fuel,selling_price
0,0,Ferrari,third,diesel,8130000
1,1,Porsche,first,cng,9980000
2,2,Ferrari,first,lpg,4960000
3,3,Audi,first,lpg,8930000
4,4,Chevrolet,third,petrol,9340000


## Feature Engineering

### Drop Column

In [522]:
# Remove the leftover index column read from the CSV (it appears as an
# "Unnamed: ..." column in the preview above). Selecting it by name instead of
# by position keeps this cell safe to re-run: a second run finds nothing to
# drop, whereas dropping `df.columns[0]` again would delete `brand`.
leftover_index_cols = [col for col in df.columns if str(col).startswith("Unnamed")]
# Bind a new frame rather than mutating in place so re-execution is harmless.
df = df.drop(columns=leftover_index_cols)

In [523]:
# Confirm the extra column is gone and `brand` is now the first column.
df.head()

Unnamed: 0,brand,owner,fuel,selling_price
0,Ferrari,third,diesel,8130000
1,Porsche,first,cng,9980000
2,Ferrari,first,lpg,4960000
3,Audi,first,lpg,8930000
4,Chevrolet,third,petrol,9340000


### Value Count

In [524]:
# How many rows each brand has; the long tail of rare brands motivates the
# grouping done in the third encoding approach.
df['brand'].value_counts()

brand
Toyota           584
Audi             563
Chevrolet        562
Renault          561
Tesla            557
Ferrari          547
Porsche          546
Jaguar           545
Peugeot          536
Ford             529
Volkswagen       523
Volvo            505
BMW               95
Honda             85
Kia               85
Mitsubishi        83
Lamborghini       41
Nissan            22
Mercedes-Benz     10
Jeep               9
Land Rover         7
Hyundai            5
Name: count, dtype: int64

In [525]:
# Row count per owner category.
df['owner'].value_counts()

owner
third     2424
first     2292
second    2284
Name: count, dtype: int64

In [526]:
# Row count per fuel category.
df['fuel'].value_counts()

fuel
petrol    1754
diesel    1753
lpg       1751
cng       1742
Name: count, dtype: int64

### NUNIQUE

In [527]:
# Number of distinct brands, i.e. how many indicator columns a plain one-hot
# encoding of `brand` would create.
df['brand'].nunique()

22

In [528]:
# Number of distinct owner categories.
df['owner'].nunique()

3

In [529]:
# Number of distinct fuel categories.
df['fuel'].nunique()

4

## One Hot Encoding

### One Way to do One Hot Encoding


In [530]:
# One-hot encode the two low-cardinality columns in a single pandas call.
# The result is only displayed here; `df` itself is left unchanged.
categorical_cols = ['fuel', 'owner']
pd.get_dummies(df, columns=categorical_cols)

Unnamed: 0,brand,selling_price,fuel_cng,fuel_diesel,fuel_lpg,fuel_petrol,owner_first,owner_second,owner_third
0,Ferrari,8130000,False,True,False,False,False,False,True
1,Porsche,9980000,True,False,False,False,True,False,False
2,Ferrari,4960000,False,False,True,False,True,False,False
3,Audi,8930000,False,False,True,False,True,False,False
4,Chevrolet,9340000,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...
6995,Ferrari,3930000,False,False,True,False,False,False,True
6996,Ferrari,3510000,True,False,False,False,False,True,False
6997,Ford,7420000,False,True,False,False,True,False,False
6998,Jaguar,2490000,False,True,False,False,True,False,False


### Getting the inputs and output

In [531]:
# Feature matrix: the three categorical predictors. They are selected by name
# so the result does not depend on how many leading columns were dropped
# earlier; a positional slice silently picks the wrong columns if the drop
# cell is skipped or executed twice.
x = df[['brand', 'owner', 'fuel']]


In [532]:
# Sanity check: (rows, number of feature columns).
x.shape

(7000, 3)

In [533]:
# Target vector: the selling price, selected by name rather than "last column"
# so it stays correct if columns are ever added to or reordered in `df`.
y = df['selling_price']

In [534]:
# Sanity check: the target is one-dimensional with one value per row of `x`.
y.shape

(7000,)

### Creating the Training Set and the Test Set

In [535]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=2,
)

In [536]:
# Sanity check: size of the training features after the split.
x_train.shape

(5250, 3)

In [537]:
# Sanity check: the training target has the same number of rows as x_train.
y_train.shape

(5250,)

## One Hot Encoding

## Second way of doing One Hot Encoding

In [538]:
from sklearn.preprocessing import OneHotEncoder

# Encoder configuration:
#   drop='first'        - omit one indicator per feature to avoid redundant columns
#   sparse_output=False - return a dense array instead of a sparse matrix
#   dtype=np.int32      - emit 0/1 integers
onehot = OneHotEncoder(
    dtype=np.int32,
    sparse_output=False,
    drop='first',
)

In [539]:
# Learn the category levels of the columns to encode from the training split
# only. The columns are named explicitly instead of being taken from
# `df.columns[1:3]`: that slice relies on the column positions of a different
# frame (`df`), and the names must match the ['owner', 'fuel'] list used later
# when labelling the encoder output.
onehot.fit(x_train[['owner', 'fuel']])

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.int32'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [540]:
# Training features before encoding; the row labels are the original row
# labels from `df`, in shuffled order.
x_train

Unnamed: 0,brand,owner,fuel
4689,Renault,third,cng
5908,Tesla,third,diesel
6544,Audi,first,diesel
688,Porsche,second,diesel
6517,Volkswagen,second,cng
...,...,...,...
6443,Volkswagen,third,cng
3606,Ferrari,second,cng
5704,Chevrolet,third,petrol
6637,Volkswagen,first,petrol


In [541]:
# Encode the same two columns the encoder was fitted on. They are selected by
# name rather than through `df.columns[1:3]`, which depends on the column
# order of `df` instead of the frame actually being transformed.
oneHotTransformedColumns = onehot.transform(x_train[['owner', 'fuel']])

In [542]:
# Expect 5 indicator columns: (3 owner levels - 1) + (4 fuel levels - 1),
# because drop='first' removes one level per feature.
oneHotTransformedColumns.shape

(5250, 5)

In [543]:
# Raw text columns that `onehot` encoded; they are replaced below by the
# encoder's 0/1 indicator columns.
encoded_source_cols = ['owner', 'fuel']

# Wrap the encoder output in a DataFrame with readable column names such as
# owner_second / fuel_diesel.
encoded_df = pd.DataFrame(
    oneHotTransformedColumns,
    columns=onehot.get_feature_names_out(encoded_source_cols),
)

# Give x_train the same 0..n-1 index as encoded_df so the column-wise concat
# pairs rows by position. A new frame is bound instead of resetting in place.
# NOTE(review): y_train still carries the original shuffled row labels; reset
# it too before any index-aligned operation between x_train and y_train.
x_train = x_train.reset_index(drop=True)

# Drop the raw categorical columns and, when this cell is re-run, any
# indicator columns appended by a previous run. Without this the second run
# raises KeyError because 'owner' and 'fuel' are already gone.
stale_cols = [*encoded_source_cols, *encoded_df.columns]
x_train = pd.concat(
    [x_train.drop(columns=stale_cols, errors='ignore'), encoded_df],
    axis=1,
)

In [544]:
# Inspect the indicator columns produced by the encoder.
encoded_df

Unnamed: 0,owner_second,owner_third,fuel_diesel,fuel_lpg,fuel_petrol
0,0,1,0,0,0
1,0,1,1,0,0
2,0,0,1,0,0
3,1,0,1,0,0
4,1,0,0,0,0
...,...,...,...,...,...
5245,0,1,0,0,0
5246,1,0,0,0,0
5247,0,1,0,0,1
5248,0,0,0,0,1


In [545]:
# Training features after encoding: `brand` is still text, while owner and
# fuel have been replaced by their indicator columns.
x_train

Unnamed: 0,brand,owner_second,owner_third,fuel_diesel,fuel_lpg,fuel_petrol
0,Renault,0,1,0,0,0
1,Tesla,0,1,1,0,0
2,Audi,0,0,1,0,0
3,Porsche,1,0,1,0,0
4,Volkswagen,1,0,0,0,0
...,...,...,...,...,...,...
5245,Volkswagen,0,1,0,0,0
5246,Ferrari,1,0,0,0,0
5247,Chevrolet,0,1,0,0,1
5248,Volkswagen,0,0,0,0,1


## Third way of doing One Hot Encoding


In [546]:
# Frequency of each brand, computed on the full dataset (`df`), not only on
# the training split.
brand_counts = df['brand'].value_counts()

In [547]:
# Display the counts to choose a cut-off between common and rare brands.
brand_counts

brand
Toyota           584
Audi             563
Chevrolet        562
Renault          561
Tesla            557
Ferrari          547
Porsche          546
Jaguar           545
Peugeot          536
Ford             529
Volkswagen       523
Volvo            505
BMW               95
Honda             85
Kia               85
Mitsubishi        83
Lamborghini       41
Nissan            22
Mercedes-Benz     10
Jeep               9
Land Rover         7
Hyundai            5
Name: count, dtype: int64

In [548]:
# Distinct-value count per column; `brand` has far more levels than the other
# categorical columns.
df.nunique()

brand             22
owner              3
fuel               4
selling_price    900
dtype: int64

In [549]:
# Brands that occur at most `threshold` times are treated as rare.
threshold = 100
is_rare_brand = brand_counts.le(threshold)
# Labels of the rare brands, to be merged into a single bucket.
brands_to_remove = brand_counts[is_rare_brand].index

In [550]:
# Collapse every rare brand into one 'Mix Brand' bucket, then one-hot encode
# the reduced set of labels. Only the first rows are displayed.
brand_grouped = df['brand'].replace(brands_to_remove, 'Mix Brand')
pd.get_dummies(brand_grouped).head()

Unnamed: 0,Audi,Chevrolet,Ferrari,Ford,Jaguar,Mix Brand,Peugeot,Porsche,Renault,Tesla,Toyota,Volkswagen,Volvo
0,False,False,True,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,True,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,False,False,False
4,False,True,False,False,False,False,False,False,False,False,False,False,False


## Part 2 - Building and training the model

### Building the model

### Training the model

### Data Normalization

### Inference

Making the predictions of the data points in the test set

Making the prediction of a single data point, e.g. brand = Audi, owner = first, fuel = lpg

## Part 3: Evaluating the model

### intercept (c)

### coef(M)

### R-Score

### R-Squared

### Adjusted R-Squared