In [1]:
# Columns grouped by measurement scale, chosen from domain knowledge:
# ordinal columns carry a ranking between their categories, nominal columns do not.
ordinal_columns = [
    'Education',
    'AgeGroup',
    'PurchaseIntent',
]
nominal_columns = [
    'Gender',
    'City',
    'MaritalStatus',
]

# Echo both groupings so the reader can see which columns fall where.
print("Ordinal columns:", ordinal_columns)
print("Nominal columns:", nominal_columns)


Ordinal columns: ['Education', 'AgeGroup', 'PurchaseIntent']
Nominal columns: ['Gender', 'City', 'MaritalStatus']


In [7]:
import pandas as pd

# Read the raw dataset from disk.
data = pd.read_csv("categorical_dataset_for_encodings.csv")

# One-hot encode 'Gender' and 'MaritalStatus' with pandas. drop_first=False keeps an
# indicator column for every category; all other columns pass through unchanged.
data_pandas_encoded = data.pipe(
    pd.get_dummies,
    columns=['Gender', 'MaritalStatus'],
    drop_first=False,
)

# Preview the first rows of the encoded frame.
data_pandas_encoded.head()


Unnamed: 0,CustomerID,Education,City,AgeGroup,PurchaseIntent,House_Price,Gender_Female,Gender_Male,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,1,Bachelors,Chicago,Senior,Low,184811,False,True,False,True,False
1,2,Masters,Houston,Senior,Medium,290121,True,False,False,True,False
2,3,High School,Los Angeles,Teen,Medium,332892,False,True,False,False,True
3,4,High School,Houston,Adult,High,480172,False,True,False,False,True
4,5,High School,Houston,Teen,Low,139178,False,True,False,True,False


In [9]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Load the raw dataset fresh from disk.
data = pd.read_csv("categorical_dataset_for_encodings.csv")

# Columns to expand into one indicator column per category.
one_hot_columns = ['Gender', 'MaritalStatus']

# drop=None keeps every category; sparse_output=False returns a dense array
# instead of a SciPy sparse matrix.
one_hot_encoder = OneHotEncoder(drop=None, sparse_output=False)

# Learn the categories and encode the selected columns in one step.
encoded_array = one_hot_encoder.fit_transform(data[one_hot_columns])

# Output column names of the form "<column>_<category>", in the array's column order.
encoded_columns = one_hot_encoder.get_feature_names_out(one_hot_columns)

# Wrap the array in a DataFrame that reuses data's index. pd.concat(axis=1) aligns on
# index labels, so a default RangeIndex here would misalign rows (and pad with NaN)
# whenever `data` carries a non-default index, e.g. after filtering or sorting.
encoded_frame = pd.DataFrame(encoded_array, columns=encoded_columns, index=data.index)

# Replace the original categorical columns with their one-hot encoded counterparts.
data_sklearn_encoded = pd.concat(
    [data.drop(columns=one_hot_columns), encoded_frame],
    axis=1,
)

# Display the encoded DataFrame
data_sklearn_encoded.head()


Unnamed: 0,CustomerID,Education,City,AgeGroup,PurchaseIntent,House_Price,Gender_Female,Gender_Male,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,1,Bachelors,Chicago,Senior,Low,184811,0.0,1.0,0.0,1.0,0.0
1,2,Masters,Houston,Senior,Medium,290121,1.0,0.0,0.0,1.0,0.0
2,3,High School,Los Angeles,Teen,Medium,332892,0.0,1.0,0.0,0.0,1.0
3,4,High School,Houston,Adult,High,480172,0.0,1.0,0.0,0.0,1.0
4,5,High School,Houston,Teen,Low,139178,0.0,1.0,0.0,1.0,0.0


In [15]:
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
# Reload the raw dataset. NOTE(review): `pd` is not imported in this cell; it relies on
# an earlier cell having run `import pandas as pd`.
data=pd.read_csv("categorical_dataset_for_encodings.csv")
# NOTE(review): `education_order` and `agegroup_order` are not defined in this cell or in
# any cell visible here, so this cell raises NameError on a fresh kernel unless another
# cell defines them first. Judging by the displayed output they presumably rank
# High School < Bachelors < Masters and Teen < Adult < Senior -- confirm the full lists
# against the defining cell.
# Create OrdinalEncoder with the strategy to handle unknown categories
ordinal_encoder = OrdinalEncoder(categories=[education_order, agegroup_order], handle_unknown='use_encoded_value', unknown_value=np.nan)

# Apply ordinal encoding (unknown categories will be encoded as NaN)
# The two columns are overwritten in place with their float codes.
data[['Education', 'AgeGroup']] = ordinal_encoder.fit_transform(data[['Education', 'AgeGroup']])

# Replace any NaN left by the encoder (values not listed in the category orders) with the
# sentinel -1, which sits below every valid code (valid codes start at 0).
data[['Education', 'AgeGroup']] = data[['Education', 'AgeGroup']].fillna(-1)

# Display the encoded DataFrame
data.head()



Unnamed: 0,CustomerID,Gender,Education,City,MaritalStatus,AgeGroup,PurchaseIntent,House_Price
0,1,Male,1.0,Chicago,Married,2.0,Low,184811
1,2,Female,2.0,Houston,Married,2.0,Medium,290121
2,3,Male,0.0,Los Angeles,Single,0.0,Medium,332892
3,4,Male,0.0,Houston,Single,1.0,High,480172
4,5,Male,0.0,Houston,Married,0.0,Low,139178


In [17]:
import numpy as np

# Load the raw dataset.
data = pd.read_csv("categorical_dataset_for_encodings.csv")

# Relative frequency of each city: normalize=True yields proportions rather than raw counts.
city_frequency = data['City'].value_counts(normalize=True)

# Frequency-encode 'City' by looking up each row's city in the proportion table.
data = data.assign(City_Frequency_Encoded=data['City'].map(city_frequency))

# Show the original city next to its frequency encoding.
data.loc[:, ['City', 'City_Frequency_Encoded']].head()


Unnamed: 0,City,City_Frequency_Encoded
0,Chicago,0.24
1,Houston,0.241
2,Los Angeles,0.258
3,Houston,0.241
4,Houston,0.241


In [19]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the raw dataset.
data = pd.read_csv("categorical_dataset_for_encodings.csv")

# Learn the distinct 'PurchaseIntent' labels.
# Gotcha: LabelEncoder numbers classes in sorted order (see the printed classes below:
# High=0, Low=1, Medium=2), so the integer codes do NOT follow Low < Medium < High.
label_encoder = LabelEncoder()
label_encoder.fit(data['PurchaseIntent'])

# Store the integer code of each row's label in a new column.
data['PurchaseIntent_LabelEncoded'] = label_encoder.transform(data['PurchaseIntent'])

# Show which label each integer code stands for (position in the array == code).
print("Label classes for PurchaseIntent:", label_encoder.classes_)

# Show the original label next to its integer code.
data.loc[:, ['PurchaseIntent', 'PurchaseIntent_LabelEncoded']].head()


Label classes for PurchaseIntent: ['High' 'Low' 'Medium']


Unnamed: 0,PurchaseIntent,PurchaseIntent_LabelEncoded
0,Low,1
1,Medium,2
2,Medium,2
3,High,0
4,Low,1


In [21]:
import numpy as np

# Load the raw dataset.
data = pd.read_csv("categorical_dataset_for_encodings.csv")

# Mean House_Price per city, indexed by city name.
# Caution: the means are taken over every row in the file, so each row's own target
# value contributes to its encoding.
city_target_mean = data['House_Price'].groupby(data['City']).mean()

# Target-encode 'City' by looking up each row's city in the per-city mean table.
data = data.assign(City_TargetEncoded=data['City'].map(city_target_mean))

# Show the original city next to its target encoding.
data.loc[:, ['City', 'City_TargetEncoded']].head()


Unnamed: 0,City,City_TargetEncoded
0,Chicago,299091.125
1,Houston,307577.493776
2,Los Angeles,303874.895349
3,Houston,307577.493776
4,Houston,307577.493776
