In [45]:
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [46]:
# ! DO NOT MODIFY
# Sample data
data = {
    "City": ["New York", "Los Angeles", "Chicago", "New York", "Chicago"], # categorical
    "Transport_Mode": ["Bus", "Train", "Bike", "Bike", "Train"], # categorical
    "Travel_Time": [30, 45, 15, 25, 40]  # numeric
}

In [47]:
# Store `data` in a DataFrame `df`
df = pd.DataFrame(data)

# Show `df`
df

Unnamed: 0,City,Transport_Mode,Travel_Time
0,New York,Bus,30
1,Los Angeles,Train,45
2,Chicago,Bike,15
3,New York,Bike,25
4,Chicago,Train,40


In [48]:
# Store categorical columns in a list `categorical_columns`
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_columns

['City', 'Transport_Mode']

In [49]:
# Apply `OneHotEncoder` and the output should be sparse
encoder_sparse = OneHotEncoder(sparse_output=True)

# Fit the categorical columns to `encoder_sparse`
encoded_features_sparse = encoder_sparse.fit_transform(df[categorical_columns])

In [50]:
# Apply `OneHotEncoder` and the output should be dense 
encoder_dense = OneHotEncoder(sparse_output=False)

# Fit the categorical columns to `encoder_dense`
encoded_features_dense = encoder_dense.fit_transform(df[categorical_columns])

In [51]:
# Print `encoded_features_sparse`
print(encoded_features_sparse)

# Print `encoded_features_dense`
print(encoded_features_dense)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10 stored elements and shape (5, 6)>
  Coords	Values
  (0, 2)	1.0
  (0, 4)	1.0
  (1, 1)	1.0
  (1, 5)	1.0
  (2, 0)	1.0
  (2, 3)	1.0
  (3, 2)	1.0
  (3, 3)	1.0
  (4, 0)	1.0
  (4, 5)	1.0
[[0. 0. 1. 0. 1. 0.]
 [0. 1. 0. 0. 0. 1.]
 [1. 0. 0. 1. 0. 0.]
 [0. 0. 1. 1. 0. 0.]
 [1. 0. 0. 0. 0. 1.]]


In [52]:
# Check memory usage using sys module:
print("Sparse matrix size:", sys.getsizeof(encoded_features_sparse), 'bytes')
print("Dense array size:", sys.getsizeof(encoded_features_dense), 'bytes')

Sparse matrix size: 48 bytes
Dense array size: 368 bytes


## Dummy Variable Redundancy Problem

The Dummy Variable Redundancy Problem occurs when using excessive dummy variables to represent a categorical feature, leading to perfect multicollinearity. This makes it difficult for models to estimate individual category effects, resulting in unstable results. For example, if a "City" variable has categories New York, Los Angeles, and Chicago, using three dummy variables is redundant as one can be predicted from the others.

This can be solved by dropping one dummy variable for each categorical feature, ensuring accurate estimations.

In [53]:
# Apply `OneHotEncoder`, the output should be sparse, and drop the first column
encoder_sparse_better = OneHotEncoder(sparse_output=True, drop='first')

# Fit the categorical columns to `encoder_sparse_better`
encoded_features_sparse_better = encoder_sparse_better.fit_transform(df[categorical_columns])

In [54]:
# Convert to DataFrame with proper column names

# Convert the sparse matrix to a DataFrame
df_encoded = pd.DataFrame(encoded_features_sparse.toarray(), columns=encoder_sparse.get_feature_names_out(categorical_columns))

# Convert the BETTER sparse matrix to a DataFrame
df_encoded_better = pd.DataFrame(encoded_features_sparse_better.toarray(), columns=encoder_sparse_better.get_feature_names_out(categorical_columns))

In [55]:
print('Normal encoding:\n')
df_encoded

Normal encoding:



Unnamed: 0,City_Chicago,City_Los Angeles,City_New York,Transport_Mode_Bike,Transport_Mode_Bus,Transport_Mode_Train
0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0


In [56]:
print('Dropped first category encoding:\n')
df_encoded_better

Dropped first category encoding:



Unnamed: 0,City_Los Angeles,City_New York,Transport_Mode_Bus,Transport_Mode_Train
0,0.0,1.0,1.0,0.0
1,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0
