In [5]:
########################################
# Categorical variables
########################################
import pandas as pd 

used_cars = pd.read_csv("datasets/cars.csv")
display(used_cars.info())

#The used cars dataset contains information on over 38,000 used cars 
#including the manufacturer, model, and sale price. 
#This dataset is commonly used to practice building predictive models.

#Using categorical variables can save a lot of memory!
#But not always the case
print(used_cars["manufacturer_name"].describe())
#55 entries and stored as an object

print("As object: ",used_cars["manufacturer_name"].nbytes)
print("As category: ",used_cars["manufacturer_name"].astype("category").nbytes)
#reduction of memory usage by almost 90%

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38531 entries, 0 to 38530
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   manufacturer_name  38531 non-null  object 
 1   model_name         38531 non-null  object 
 2   transmission       38531 non-null  object 
 3   color              38531 non-null  object 
 4   odometer_value     38531 non-null  int64  
 5   year_produced      38531 non-null  int64  
 6   engine_fuel        38531 non-null  object 
 7   engine_has_gas     38531 non-null  bool   
 8   engine_type        38531 non-null  object 
 9   engine_capacity    38521 non-null  float64
 10  body_type          38531 non-null  object 
 11  has_warranty       38531 non-null  bool   
 12  state              38531 non-null  object 
 13  drivetrain         38531 non-null  object 
 14  price_usd          38531 non-null  float64
 15  is_exchangeable    38531 non-null  bool   
 16  location_region    385

None

count          38531
unique            55
top       Volkswagen
freq            4243
Name: manufacturer_name, dtype: object
As object:  308248
As category:  38971


In [6]:
used_cars["odometer_value"].astype("object").describe()

print(f"As float: {used_cars['odometer_value'].nbytes}")
print(f"As category: {used_cars['odometer_value'].astype('category').nbytes}")
#reduction of memory usage by almost 60%

As float: 308248
As category: 125566


In [7]:
#check and convert category type

#check
used_cars["color"] = used_cars["color"].astype("category")
used_cars["color"] = used_cars["color"].str.upper()
print(used_cars["color"].dtype)

#convert
used_cars["color"] = used_cars["color"].astype("category")
print(used_cars["color"].dtype)


object
category


In [10]:
#look for missing values
used_cars = pd.read_csv("datasets/cars.csv")
used_cars["color"] = used_cars["color"].astype("category")

#set categories
used_cars["color"] = used_cars["color"].cat.set_categories(["black","silver","blue"])
used_cars["color"].value_counts(dropna=False)
#18172 missing values 
#maybe use a different method for updating the categories

NaN       18172
black      7705
silver     6852
blue       5802
Name: color, dtype: int64

In [11]:
#using numpy array
#a categorical variable is not a NumPy array
used_cars["number_of_photos"] = used_cars["number_of_photos"].astype("category")
## used_cars["number_of_photos"].sum() #Gives an error
used_cars["number_of_photos"].astype("int").sum()



371788

In [16]:
used_cars = pd.read_csv("datasets/cars.csv")
import numpy as np
used_cars.loc[2:5,"body_type"]=np.nan

# Print the frequency table of body_type and include NaN values
print(used_cars["body_type"].value_counts(dropna=False))

# Update NaN values
used_cars.loc[used_cars["body_type"].isna(), "body_type"] = "other"
print(used_cars["body_type"].value_counts(dropna=False))

# Convert body_type to title case
used_cars["body_type"] = used_cars["body_type"].str.title()
print(used_cars["body_type"].value_counts(dropna=False))

# Check the dtype
print(used_cars["body_type"].dtype)

sedan        13010
hatchback     7644
universal     5505
suv           5163
minivan       3608
minibus       1369
van            808
coupe          652
liftback       552
pickup         129
cabriolet       75
limousine       12
NaN              4
Name: body_type, dtype: int64
sedan        13010
hatchback     7644
universal     5505
suv           5163
minivan       3608
minibus       1369
van            808
coupe          652
liftback       552
pickup         129
cabriolet       75
limousine       12
other            4
Name: body_type, dtype: int64
Sedan        13010
Hatchback     7644
Universal     5505
Suv           5163
Minivan       3608
Minibus       1369
Van            808
Coupe          652
Liftback       552
Pickup         129
Cabriolet       75
Limousine       12
Other            4
Name: body_type, dtype: int64
object


In [19]:
#encoding label
used_cars["manufacturer_name"] = used_cars["manufacturer_name"].astype("category")
used_cars["manufacturer_code"] = used_cars["manufacturer_name"].cat.codes
print(used_cars[["manufacturer_name","manufacturer_code"]])

#Subaru is the first manufacturer name in the dataset, 
#but is the 46th name in alphabetical order. 
#It has been assigned a code of 45. 
#Chrysler is the 9th in order, and has been given a code of 8


      manufacturer_name  manufacturer_code
0                Subaru                 45
1                Subaru                 45
2                Subaru                 45
3                Subaru                 45
4                Subaru                 45
...                 ...                ...
38526          Chrysler                  8
38527          Chrysler                  8
38528          Chrysler                  8
38529          Chrysler                  8
38530          Chrysler                  8

[38531 rows x 2 columns]


In [21]:
#Creating a code book

#If you do create a label encoding 
#you will want to create a map from the new codes to the old values

#This can be done by creating an object for the codes and an object for the categories

codes = used_cars["manufacturer_name"].cat.codes
categories = used_cars["manufacturer_name"]
name_map = dict(zip(codes,categories))
print(name_map)

#revert to the previous values
print(used_cars["manufacturer_code"].map(name_map))


{45: 'Subaru', 24: 'LADA', 12: 'Dodge', 54: 'УАЗ', 23: 'Kia', 35: 'Opel', 53: 'Москвич', 1: 'Alfa Romeo', 0: 'Acura', 10: 'Dacia', 27: 'Lexus', 33: 'Mitsubishi', 25: 'Lancia', 9: 'Citroen', 32: 'Mini', 21: 'Jaguar', 38: 'Porsche', 44: 'SsangYong', 11: 'Daewoo', 15: 'Geely', 50: 'ВАЗ', 13: 'Fiat', 14: 'Ford', 39: 'Renault', 42: 'Seat', 40: 'Rover', 48: 'Volkswagen', 28: 'Lifan', 22: 'Jeep', 5: 'Cadillac', 2: 'Audi', 52: 'ЗАЗ', 47: 'Toyota', 51: 'ГАЗ', 49: 'Volvo', 7: 'Chevrolet', 16: 'Great Wall', 4: 'Buick', 37: 'Pontiac', 29: 'Lincoln', 18: 'Hyundai', 34: 'Nissan', 46: 'Suzuki', 3: 'BMW', 30: 'Mazda', 26: 'Land Rover', 20: 'Iveco', 43: 'Skoda', 41: 'Saab', 19: 'Infiniti', 6: 'Chery', 17: 'Honda', 31: 'Mercedes-Benz', 36: 'Peugeot', 8: 'Chrysler'}
0          Subaru
1          Subaru
2          Subaru
3          Subaru
4          Subaru
           ...   
38526    Chrysler
38527    Chrysler
38528    Chrysler
38529    Chrysler
38530    Chrysler
Name: manufacturer_code, Length: 38531, dtyp

In [22]:
#Boolean encoding
used_cars["van_code"] = np.where(used_cars["body_type"].str.contains("van",regex=False),1 ,0)
used_cars["van_code"].value_counts()

0    34923
1     3608
Name: van_code, dtype: int64

In [23]:
#Create a label encoding and map
#"color", needs to be converted to codesThe company believes that a car's color will be important when predicting sales price.
#'color', needs to be converted to codes

# Convert to categorical and print the frequency table
used_cars["color"] = used_cars["color"].astype("category")
print(used_cars["color"].value_counts())

# Create a label encoding
used_cars["color_code"] = used_cars["color"].cat.codes

# Create codes and categories objects
codes = used_cars["color"].cat.codes
categories = used_cars["color"]
color_map = dict(zip(codes, categories))

# Print the map
print(color_map)

#the label encoding is created for the color column. 
#Creating an encoding like this can save on memory and improve performance


black     7705
silver    6852
blue      5802
white     4212
grey      3751
red       2925
green     2760
other     2688
brown      886
violet     463
yellow     303
orange     184
Name: color, dtype: int64
{8: 'silver', 1: 'blue', 7: 'red', 0: 'black', 4: 'grey', 6: 'other', 2: 'brown', 10: 'white', 3: 'green', 9: 'violet', 5: 'orange', 11: 'yellow'}


In [24]:
# Print the "manufacturer_name" frequency table.
print(used_cars["manufacturer_name"].value_counts())

# Create a Boolean column for the most common manufacturer name
used_cars["is_volkswagen"] = np.where(
  used_cars["manufacturer_name"].str.contains("Volkswagen", regex=False), 1, 0
)
  
# Check the final frequency table
print(used_cars["is_volkswagen"].value_counts())


Volkswagen       4243
Opel             2759
BMW              2610
Ford             2566
Renault          2493
Audi             2468
Mercedes-Benz    2237
Peugeot          1909
Citroen          1562
Nissan           1361
Mazda            1328
Toyota           1246
Hyundai          1116
Skoda            1089
Kia               912
Mitsubishi        887
Fiat              824
Honda             797
Volvo             721
ВАЗ               481
Chevrolet         436
Chrysler          410
Seat              303
Dodge             297
Subaru            291
Rover             235
Suzuki            234
Daewoo            221
Lexus             213
Alfa Romeo        207
ГАЗ               200
Land Rover        184
Infiniti          162
LADA              146
Iveco             139
Saab              108
Jeep              107
Lancia             92
SsangYong          79
УАЗ                74
Geely              71
Mini               68
Acura              66
Porsche            61
Dacia              59
Chery     

In [26]:
#why not just label encoding?

used_cars["engine_fuel"] = used_cars["engine_fuel"].astype("category")
#print(used_cars["color"].value_counts())

# Create a label encoding
# Create codes and categories objects
codes = used_cars["engine_fuel"].cat.codes
categories = used_cars["engine_fuel"]
fuel_map = dict(zip(codes, categories))
print(fuel_map)

#keys are assigned in alphabetical order, or the order of the category if the column is ordinal

#If you try to use a column of these codes in a machine learning model, 
#the algorithm might misinterpret their meaning. 
#algorithms train on numbers! 
#diesel with a value of 0 might be given less weight than gasoline with a value of 3. 
#We need a better approach.


{3: 'gasoline', 2: 'gas', 0: 'diesel', 5: 'hybrid-petrol', 4: 'hybrid-diesel', 1: 'electric'}


In [28]:
#one-hot encoding on a subset DataFrame
used_cars_onehot = pd.get_dummies(used_cars[["odometer_value", "color"]])
display(used_cars_onehot.head())
print(used_cars_onehot.shape)

Unnamed: 0,odometer_value,color_black,color_blue,color_brown,color_green,color_grey,color_orange,color_other,color_red,color_silver,color_violet,color_white,color_yellow
0,190000,0,0,0,0,0,0,0,0,1,0,0,0
1,290000,0,1,0,0,0,0,0,0,0,0,0,0
2,402000,0,0,0,0,0,0,0,1,0,0,0,0
3,10000,0,1,0,0,0,0,0,0,0,0,0,0
4,280000,1,0,0,0,0,0,0,0,0,0,0,0


(38531, 13)


In [29]:
#one-hot encoding on the entire DataFrame but a subset of columns
used_cars_onehot = pd.get_dummies(used_cars, columns=["color"], prefix="")
display(used_cars_onehot.head())
print(used_cars_onehot.shape)

Unnamed: 0,manufacturer_name,model_name,transmission,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,body_type,...,_brown,_green,_grey,_orange,_other,_red,_silver,_violet,_white,_yellow
0,Subaru,Outback,automatic,190000,2010,gasoline,False,gasoline,2.5,Universal,...,0,0,0,0,0,0,1,0,0,0
1,Subaru,Outback,automatic,290000,2002,gasoline,False,gasoline,3.0,Universal,...,0,0,0,0,0,0,0,0,0,0
2,Subaru,Forester,automatic,402000,2001,gasoline,False,gasoline,2.5,Other,...,0,0,0,0,0,1,0,0,0,0
3,Subaru,Impreza,mechanical,10000,1999,gasoline,False,gasoline,3.0,Other,...,0,0,0,0,0,0,0,0,0,0
4,Subaru,Legacy,automatic,280000,2001,gasoline,False,gasoline,2.5,Other,...,0,0,0,0,0,0,0,0,0,0


(38531, 45)


In [30]:
#one-hot encoding on the entire DataFrame all columns
#Overfitting
used_cars_onehot = pd.get_dummies(used_cars)
print(used_cars_onehot.shape)

(38531, 1245)


In [32]:
# Create one-hot encoding for just two columns
used_cars_simple = pd.get_dummies(
  used_cars,
  # Specify the columns from the instructions
  columns = ["manufacturer_name", "transmission"],
  # Set the prefix
  prefix="dummy"
)

# Print the shape of the new dataset
print(used_cars_simple.shape)
print(used_cars_simple.columns)

#You won't be able to use categorical columns that have not been encoded, 
#but limiting which categorical columns are included in your algorithm may prevent overfitting 



(38531, 89)
Index(['model_name', 'color', 'odometer_value', 'year_produced', 'engine_fuel',
       'engine_has_gas', 'engine_type', 'engine_capacity', 'body_type',
       'has_warranty', 'state', 'drivetrain', 'price_usd', 'is_exchangeable',
       'location_region', 'number_of_photos', 'up_counter', 'feature_0',
       'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
       'feature_6', 'feature_7', 'feature_8', 'feature_9', 'duration_listed',
       'manufacturer_code', 'van_code', 'color_code', 'is_volkswagen',
       'dummy_Acura', 'dummy_Alfa Romeo', 'dummy_Audi', 'dummy_BMW',
       'dummy_Buick', 'dummy_Cadillac', 'dummy_Chery', 'dummy_Chevrolet',
       'dummy_Chrysler', 'dummy_Citroen', 'dummy_Dacia', 'dummy_Daewoo',
       'dummy_Dodge', 'dummy_Fiat', 'dummy_Ford', 'dummy_Geely',
       'dummy_Great Wall', 'dummy_Honda', 'dummy_Hyundai', 'dummy_Infiniti',
       'dummy_Iveco', 'dummy_Jaguar', 'dummy_Jeep', 'dummy_Kia', 'dummy_LADA',
       'dummy_Lancia', 'dum