In [1]:
# Importing Libraries
import numpy as np
import pandas as pd

In [2]:
# Load the customer dataset from CSV into a DataFrame
df = pd.read_csv('/content/customer.csv')
# Preview five randomly selected rows
df.sample(n=5)

Unnamed: 0,age,gender,review,education,purchased
46,64,Female,Poor,PG,No
34,86,Male,Average,School,No
12,51,Male,Poor,School,No
25,57,Female,Good,School,No
6,18,Male,Good,School,No


In [3]:
# Keep only the two categorical features and the target column
df = df.loc[:, ["review", "education", "purchased"]]
# Preview five randomly selected rows of the reduced frame
df.sample(n=5)

Unnamed: 0,review,education,purchased
12,Poor,School,No
48,Good,UG,Yes
45,Poor,PG,Yes
41,Good,PG,Yes
10,Good,UG,Yes


In [4]:
# Shape of the DataFrame as a (rows, columns) tuple
df.shape

(50, 3)

In [5]:
# Number of rows per category in the review column
df["review"].value_counts()

Unnamed: 0_level_0,count
review,Unnamed: 1_level_1
Poor,18
Good,18
Average,14


In [6]:
# Number of rows per category in the education column
df["education"].value_counts()

Unnamed: 0_level_0,count
education,Unnamed: 1_level_1
PG,18
School,16
UG,16


In [7]:
# Number of rows per class in the purchased (target) column
df["purchased"].value_counts()

Unnamed: 0_level_0,count
purchased,Unnamed: 1_level_1
No,26
Yes,24


In [8]:
# Importing train_test_split
from sklearn.model_selection import train_test_split

In [9]:
# Separate the categorical input features from the target label
feature_cols = ["review", "education"]
X = df[feature_cols]
y = df["purchased"]

In [10]:
# Splitting the Data (70% train / 30% test)
# random_state pins the shuffle so the split, and everything derived from it,
# is identical on every Restart & Run All
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [11]:
# Report the (rows, columns) shape of the training and testing feature sets
train_shape, test_shape = X_train.shape, X_test.shape
print(train_shape, test_shape)

(35, 2) (15, 2)


In [12]:
# Importing OrdinalEncoder
from sklearn.preprocessing import OrdinalEncoder

In [13]:
# Build the OrdinalEncoder with an explicit low-to-high order per feature.
# The category lists are given in the same order as the columns of X
# (review first, then education).
review_order = ["Poor", "Average", "Good"]
education_order = ["School", "UG", "PG"]
oe = OrdinalEncoder(categories=[review_order, education_order])

In [14]:
# Fit the encoder on the training data only, then encode that same data
oe.fit(X_train)
X_train_transformed = oe.transform(X_train)

In [15]:
# Transformation on Testing Data
# Only transform() is called here: the encoder was fitted on X_train and is
# not refitted on the test set
X_test_transformed = oe.transform(X_test)

In [16]:
# Fitted attribute holding the category order used for each encoded column
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [17]:
# Importing LabelEncoder
from sklearn.preprocessing import LabelEncoder

In [18]:
# Creating LabelEncoder Object (applied to the target column y below)
le = LabelEncoder()

In [19]:
# Learn the label mapping from the training target only, then encode it
le.fit(y_train)
y_train_transformed = le.transform(y_train)

In [20]:
# Transformation on Testing Data
# Reuses the mapping fitted on y_train; the encoder is not refitted here
y_test_transformed = le.transform(y_test)

# Nominal Data

In [21]:
# Load the cars dataset from CSV into a DataFrame
df = pd.read_csv('/content/cars.csv')
# Preview five randomly selected rows
df.sample(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
2507,Maruti,150000,Petrol,Second Owner,85000
2048,Chevrolet,80000,Diesel,Second Owner,600000
6995,Maruti,50000,Petrol,First Owner,220000
252,Tata,110000,Diesel,Second Owner,75000
2065,BMW,140000,Diesel,Second Owner,1300000


In [22]:
# Shape of the DataFrame as a (rows, columns) tuple
df.shape

(8128, 5)

In [23]:
# Number of rows per category in the fuel column
df["fuel"].value_counts()

Unnamed: 0_level_0,count
fuel,Unnamed: 1_level_1
Diesel,4402
Petrol,3631
CNG,57
LPG,38


In [24]:
# Number of rows per category in the owner column
df["owner"].value_counts()

Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,5289
Second Owner,2105
Third Owner,555
Fourth & Above Owner,174
Test Drive Car,5


In [25]:
# Distinct values present in the brand column
df["brand"].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Fiat', 'Datsun', 'Jeep',
       'Mercedes-Benz', 'Mitsubishi', 'Audi', 'Volkswagen', 'BMW',
       'Nissan', 'Lexus', 'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo',
       'Kia', 'Force', 'Ambassador', 'Ashok', 'Isuzu', 'Opel', 'Peugeot'],
      dtype=object)

In [26]:
# One-hot encode the nominal columns with pandas' pd.get_dummies()
# Every category gets its own 0/1 column; the other columns pass through
nominal_cols = ["fuel", "owner"]
pd.get_dummies(df, columns=nominal_cols, dtype=np.int32)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


In [27]:
# One-hot encode the nominal columns with pandas' pd.get_dummies()
# drop_first=True omits the first category of each column, which avoids the
# dummy variable trap and the multicollinearity it causes
nominal_cols = ["fuel", "owner"]
pd.get_dummies(df, columns=nominal_cols, drop_first=True, dtype=np.int32)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


In [28]:
# Importing train_test_split
from sklearn.model_selection import train_test_split

In [29]:
# Dividing Features and Target Variables
# Features are every column except the target. Dropping the target by name
# (rather than slicing the first four columns by position) keeps
# selling_price out of X even if the column order of the CSV changes.
X = df.drop(columns="selling_price")
y = df["selling_price"]

In [30]:
# Splitting the Data (70% train / 30% test)
# random_state pins the shuffle so the split, and everything derived from it,
# is identical on every Restart & Run All
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [31]:
# Report the (rows, columns) shape of the training and testing feature sets
train_shape, test_shape = X_train.shape, X_test.shape
print(train_shape, test_shape)

(5689, 4) (2439, 4)


In [32]:
# Importing OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

In [33]:
# Creating OneHotEncoder Object
# drop="first" removes one dummy column per feature (avoids the dummy
# variable trap); sparse_output=False returns a dense NumPy array.
# handle_unknown="ignore" encodes a category that never appeared in the
# training split as all zeros (scikit-learn emits a warning) instead of
# raising in transform(). Rare levels such as "Test Drive Car" (5 rows) can
# fall entirely into the test split.
ohe = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")

In [34]:
# Fit the encoder on the nominal training columns only, then encode them
nominal_train = X_train[["fuel", "owner"]]
ohe.fit(nominal_train)
X_train_transformed = ohe.transform(nominal_train)

In [35]:
# Transformation on Testing Data
# Only transform() is called: the encoder fitted on the training columns is
# reused and not refitted on the test set
X_test_transformed = ohe.transform(X_test[["fuel","owner"]])

In [36]:
# Number of rows per category in the brand column
df["brand"].value_counts()

Unnamed: 0_level_0,count
brand,Unnamed: 1_level_1
Maruti,2448
Hyundai,1415
Mahindra,772
Tata,734
Toyota,488
Honda,467
Ford,397
Chevrolet,230
Renault,228
Volkswagen,186


In [37]:
# Number of distinct categories in the brand column
df["brand"].nunique()

32

In [38]:
# Saving the per-brand row counts from value_counts() in the counts variable
counts = df["brand"].value_counts()

In [39]:
# Frequency threshold: brands whose row count is at or below this value
# are selected as rare by the `counts <= limit` filter
limit = 100

In [40]:
# Collect the names of all car brands whose frequency is at or below the
# limit (the comparison is inclusive: a brand with exactly `limit` rows is
# selected too)
rare_mask = counts.le(limit)
repl = counts.loc[rare_mask].index

In [41]:
# Collapse every brand listed in repl (the low-frequency brands) into a
# single "uncommon" category
brand_grouped = df['brand'].replace(repl, 'uncommon')
# One-hot encode the reduced set of categories and preview five random rows
pd.get_dummies(brand_grouped, dtype=np.int32).sample(n=5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
4921,0,0,0,0,0,1,0,0,0,0,0,0,0
542,0,0,0,0,0,0,1,0,0,0,0,0,0
7708,0,0,0,1,0,0,0,0,0,0,0,0,0
1759,0,0,0,0,0,0,1,0,0,0,0,0,0
5823,0,0,0,0,0,0,0,0,0,0,0,0,1
