In [7]:
# This notebook demonstrates label encoding and one-hot encoding techniques.
# Import pandas for data manipulation and numpy for numerical operations.
import pandas as pd
import numpy as np

In [8]:
# Read employees.csv into a DataFrame; row 0 of the file supplies the column names.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which is how pandas labels
# a column with an empty header -- presumably a saved index; confirm before relying on it.
df = pd.read_csv("employees.csv", header=0)
# Preview the first ten records.
df.head(n=10)

Unnamed: 0,First Name,Gender,Last Login Time,Salary,Bonus,age,height,Senior Management,Team
0,Douglas,Male,12:42 PM,97308,6.945,36,5.38,True,Marketing
1,Thomas,Male,6:53 AM,61933,4.17,43,5.52,True,Finance
2,Maria,Female,11:17 AM,130590,11.858,51,5.29,False,Finance
3,Jerry,Male,1:00 PM,138705,9.34,40,5.12,True,Finance
4,Larry,Male,4:47 PM,101004,1.389,46,5.95,True,Client Services
5,Dennis,Male,1:35 AM,115163,10.125,40,5.26,False,Legal
6,Ruby,Female,4:20 PM,65476,10.012,51,5.23,True,Product
7,Angela,Female,6:29 AM,95570,18.523,39,5.4,True,Engineering
8,Frances,Female,6:51 AM,139852,7.524,45,5.3,True,Business Development
9,Louise,Female,9:01 AM,63241,15.132,38,5.5,True,Finance


In [9]:
# scikit-learn's preprocessing module provides LabelEncoder.
from sklearn import preprocessing

# LabelEncoder maps each distinct value to an integer code.
label_encoder = preprocessing.LabelEncoder()

# Learn the set of team names and translate every row to its integer code
# in a single pass, then store the codes next to the original 'Team' column.
team_codes = label_encoder.fit_transform(df['Team'])
df['Team_encoded'] = team_codes
# Preview ten rows so 'Team' and 'Team_encoded' can be compared side by side.
df.head(n=10)

Unnamed: 0,First Name,Gender,Last Login Time,Salary,Bonus,age,height,Senior Management,Team,Team_encoded
0,Douglas,Male,12:42 PM,97308,6.945,36,5.38,True,Marketing,7
1,Thomas,Male,6:53 AM,61933,4.17,43,5.52,True,Finance,4
2,Maria,Female,11:17 AM,130590,11.858,51,5.29,False,Finance,4
3,Jerry,Male,1:00 PM,138705,9.34,40,5.12,True,Finance,4
4,Larry,Male,4:47 PM,101004,1.389,46,5.95,True,Client Services,1
5,Dennis,Male,1:35 AM,115163,10.125,40,5.26,False,Legal,6
6,Ruby,Female,4:20 PM,65476,10.012,51,5.23,True,Product,8
7,Angela,Female,6:29 AM,95570,18.523,39,5.4,True,Engineering,3
8,Frances,Female,6:51 AM,139852,7.524,45,5.3,True,Business Development,0
9,Louise,Female,9:01 AM,63241,15.132,38,5.5,True,Finance,4


In [10]:
# Perform one-hot encoding on the 'Gender' column using pandas' get_dummies function.
# The 'Gender' column is replaced by one 0/1 integer column per category.
# The result is bound to a new name rather than overwriting `df`: reassigning `df`
# would remove 'Gender' from it, so running this cell a second time would fail
# because the column to encode no longer exists.
df_dummies = pd.get_dummies(df, columns=['Gender'], dtype='int')
# Display the first 5 rows to see the new one-hot encoded columns.
df_dummies.head()

Unnamed: 0,First Name,Last Login Time,Salary,Bonus,age,height,Senior Management,Team,Team_encoded,Gender_Female,Gender_Male
0,Douglas,12:42 PM,97308,6.945,36,5.38,True,Marketing,7,0,1
1,Thomas,6:53 AM,61933,4.17,43,5.52,True,Finance,4,0,1
2,Maria,11:17 AM,130590,11.858,51,5.29,False,Finance,4,1,0
3,Jerry,1:00 PM,138705,9.34,40,5.12,True,Finance,4,0,1
4,Larry,4:47 PM,101004,1.389,46,5.95,True,Client Services,1,0,1


In [11]:
# Hand-rolled one-hot encoding of 'Gender': one indicator column per category,
# built from a plain equality comparison.
# Reload the data so this cell starts from the untouched CSV contents.
df = pd.read_csv("employees.csv", header=0)
gender = df['Gender']
# Each comparison yields a boolean Series; casting to int turns True/False into 1/0.
df['Gender_Male_Manual'] = gender.eq('Male').astype(int)
df['Gender_Female_Manual'] = gender.eq('Female').astype(int)

# Preview the first 5 rows, including the two new indicator columns.
df.head()

Unnamed: 0,First Name,Gender,Last Login Time,Salary,Bonus,age,height,Senior Management,Team,Gender_Male_Manual,Gender_Female_Manual
0,Douglas,Male,12:42 PM,97308,6.945,36,5.38,True,Marketing,1,0
1,Thomas,Male,6:53 AM,61933,4.17,43,5.52,True,Finance,1,0
2,Maria,Female,11:17 AM,130590,11.858,51,5.29,False,Finance,0,1
3,Jerry,Male,1:00 PM,138705,9.34,40,5.12,True,Finance,1,0
4,Larry,Male,4:47 PM,101004,1.389,46,5.95,True,Client Services,1,0


In [12]:
# One-hot encode 'Gender' with scikit-learn's OneHotEncoder.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Start from a fresh read of the CSV.
df = pd.read_csv('employees.csv')

# Request a dense integer array (not a sparse matrix) so the result can be
# wrapped in a DataFrame directly.
encoder = OneHotEncoder(dtype=int, sparse_output=False)
# The double brackets pass a one-column DataFrame (2-D) rather than a Series.
gender_encoded = encoder.fit_transform(df[['Gender']])

# Name each output column after the category it represents, following the
# order in which the fitted encoder stores its categories.
gender_col_names = [f'Gender_{category}' for category in encoder.categories_[0]]
gender_df = pd.DataFrame(data=gender_encoded, columns=gender_col_names)

# Append the indicator columns to the right of the original columns.
df_encoded = pd.concat([df, gender_df], axis='columns')

# Show the source column alongside its indicator columns.
preview_cols = ['Gender', *gender_col_names]
print(df_encoded[preview_cols].head())

   Gender  Gender_Female  Gender_Male
0    Male              0            1
1    Male              0            1
2  Female              1            0
3    Male              0            1
4    Male              0            1


In [13]:
# Perform one-hot encoding using TensorFlow and Keras.
import numpy as np
import pandas as pd
import tensorflow as tf

# Load the dataset.
df = pd.read_csv('employees.csv')

# Extract the 'Gender' column as a numpy array.
genders = df['Gender'].values

# Use StringLookup to convert the string labels to integer indices.
# With the layer's default settings, index 0 is reserved for out-of-vocabulary
# values, so "Female" maps to 1 and "Male" maps to 2.
gender_vocab = ["Female", "Male"]
lookup = tf.keras.layers.StringLookup(vocabulary=gender_vocab, output_mode='int')
gender_int = lookup(genders)

# Use CategoryEncoding to perform one-hot encoding on the integer indices.
# The width is taken from the lookup layer (vocabulary plus the out-of-vocabulary
# slot) instead of a hard-coded 3, so it stays correct if the vocabulary changes.
one_hot = tf.keras.layers.CategoryEncoding(num_tokens=lookup.vocabulary_size(), output_mode='one_hot')
gender_onehot = one_hot(gender_int)

# Convert the one-hot encoded tensor back to a pandas DataFrame.
# The first column is the out-of-vocabulary slot; the rest follow the vocabulary
# order. This assumes the default single out-of-vocabulary index.
gender_col_names = ["Gender_Vocab_0"] + [f"Gender_{token}" for token in gender_vocab]
gender_df = pd.DataFrame(np.array(gender_onehot), columns=gender_col_names)

# Concatenate the new DataFrame with the original one.
# Both indexes are reset so rows are paired by position.
df_encoded = pd.concat([df.reset_index(drop=True), gender_df.reset_index(drop=True)], axis=1)

# Display the original and one-hot encoded 'Gender' columns.
print(df_encoded[['Gender', 'Gender_Female', 'Gender_Male']].head())

   Gender  Gender_Female  Gender_Male
0    Male            0.0          1.0
1    Male            0.0          1.0
2  Female            1.0          0.0
3    Male            0.0          1.0
4    Male            0.0          1.0
