In [2]:
# What is One-hot Encoding?
# First, one-hot encoding does NOT capture the meaning of the words. 
# The computer does not know what blue looks like, but it can still find relationships between the color and other variables in the context
# of a dataset. In order to represent non-ordered, or 'nominal' features, we do the following:
# 1. Create a new column for every category present in the feature.
# 2. Set the value of each new column to 1 if that row belongs to the column's category.
# 3. Set the value of each new column to 0 if it does not.
# 4. Remove the original column.

In [27]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

In [40]:
# Read the raw medical data CSV into a dataframe and preview the first five rows
path = 'C:/Users/User/Desktop/Raw_Medical_Data_for_day1.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head(5)

Unnamed: 0.1,Unnamed: 0,State,Lat,Lng,Area,Children,Age,Income,Marital,Gender,...,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Services,Initial_days,TotalCharge,Additional_charges
0,1,AL,34.3496,-86.72508,Suburban,1.0,53,86575.93,Divorced,Male,...,0.0,1.0,1.0,1.0,0,1,Blood Work,10.58577,3726.70286,17939.40342
1,2,FL,30.84513,-85.22907,Urban,3.0,51,46805.99,Married,Female,...,0.0,0.0,0.0,0.0,1,0,Intravenous,15.129562,4193.190458,17612.99812
2,3,SD,43.54321,-96.63772,Suburban,3.0,53,14370.14,Widowed,Female,...,0.0,0.0,0.0,0.0,0,0,Blood Work,4.772177,2434.234222,17505.19246
3,4,MN,43.89744,-93.51479,Suburban,0.0,78,39741.49,Married,Male,...,0.0,0.0,0.0,0.0,1,1,Blood Work,1.714879,2127.830423,12993.43735
4,5,VA,37.59894,-76.88958,Rural,1.0,22,1209.56,Widowed,Female,...,1.0,0.0,0.0,1.0,0,0,CT Scan,1.254807,2113.073274,3716.525786


In [39]:
# Read the CSV into `df` again (this replaces any `df` left over from an earlier cell)
path = 'C:/Users/User/Desktop/Raw_Medical_Data_for_day1.csv'
df = pd.read_csv(filepath_or_buffer=path)

# df.info() lists every column with its dtype; the columns reported as 'object' are the ones that will need to be encoded.
df.info()

# The column 'Complication_risk' is ordered (Low < Medium < High), so we can use ordinal encoding for it.
# The .replace() method works well for this: every label is swapped for its rank.
# print() is used because only the LAST expression of a cell is displayed automatically.
print(df['Complication_risk'].value_counts())

# 'Med' and 'Medium' are two spellings of the same level, so both map to 1.
risk_order = {'Low': 0, 'Med': 1, 'Medium': 1, 'High': 2}

# Assign the result back to the column instead of calling .replace(..., inplace=True) on df['Complication_risk'].
# The in-place form operates on a selected column; recent pandas versions warn about it, and with
# Copy-on-Write enabled it no longer changes `df` at all.
df['Complication_risk'] = df['Complication_risk'].replace(risk_order)
print(df['Complication_risk'].value_counts())

# The ordinal encoding above is a fixed label-to-number mapping that uses no information from the test data,
# so it cannot cause data leakage and is safe to apply before splitting.

# Goal of this task: predict "Additional_charges" from the other features in the dataset.
# - y (target): the "Additional_charges" column
# - X (features): every other column, except "Unnamed: 0", which is dropped because it is not a relevant feature

target_column = 'Additional_charges'
X = df.drop(columns=['Unnamed: 0', target_column])
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Column Selector
# We only want to one-hot encode the categorical features, NOT the numeric ones, so we need a way to separate the two groups.
# We could list the columns by hand, but make_column_selector lets us do this more efficiently.
# The 'dtype_include' argument tells the selector which kinds of columns to pick:
#   - dtype_include='object' selects the columns with the 'object' datatype
#   - dtype_include='number' would select both integer and float columns, since both are numbers
# Here we only want the columns of type 'object'.

# build the categorical selector
cat_selector = make_column_selector(dtype_include='object')

# Calling the selector on a dataframe returns a list with the names of the columns that match the pattern we gave it.
train_cat_columns = cat_selector(X_train)
test_cat_columns = cat_selector(X_test)

# Use those lists to subset the original data: 'train_cat_data' and 'test_cat_data' contain
# only the object features picked by cat_selector.
train_cat_data = X_train[train_cat_columns]
test_cat_data = X_test[test_cat_columns]
train_cat_data

In [34]:
# OneHotEncoder
# Scikit-Learn's OneHotEncoder class will do all of the one-hot encoding work for us, but we want to change two default settings when we instantiate it.
#
# sparse_output=False  (this argument was called 'sparse' before scikit-learn 1.2)
# By default, the encoder returns a 'sparse matrix'. This is a form of data compression used when arrays are mostly filled with 0s. Instead of keeping track of so many 0s, the compressed version just
# stores where the data is NOT 0. This is great for saving memory, but not great if we want to use the data in a new dataframe. Setting sparse_output=False makes OneHotEncoder return a normal ('dense')
# array that is compatible with pandas.
#
# handle_unknown='ignore'
# OneHotEncoder is a transformer like StandardScaler. When we use it we will:
# 1. Fit the OneHotEncoder ONLY on the categorical training data
# 2. Transform the categorical training data to a one-hot encoded form.
# 3. Transform the categorical testing data to a one-hot encoded form.
# By default, OneHotEncoder will throw an error if we try to transform data containing categories that were not present in the data it was fitted on (and that it therefore has no columns for).
# Passing handle_unknown='ignore' tells it to ignore such categories (values it encounters in the test set that were not present in the train set). In this case, all columns
# corresponding to that feature will have the value 0 for that row.

# instantiate the one-hot encoder
try:
    # current argument name (scikit-learn >= 1.2)
    ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # older scikit-learn releases only accept the previous name, 'sparse'
    ohe_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# fit the OneHotEncoder on the training data only
ohe_encoder.fit(train_cat_data)

# transform both the training and the testing data with the fitted encoder
train_ohe = ohe_encoder.transform(train_cat_data)
test_ohe = ohe_encoder.transform(test_cat_data)
train_ohe




array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
# The encoder returned NumPy arrays, not dataframes. That is fine for modeling, but arrays are hard to inspect and cannot be manipulated with pandas.
# Converting them to dataframes is optional for modeling; we do it here to see the result clearly and to recombine the one-hot encoded columns with the numeric columns.
# get_feature_names_out() returns the names of the new encoded columns. The encoder was fitted on a dataframe, so it already knows the original column names
# and we do not have to pass them in.

# encoded column names are prefixed with the original column name (e.g. 'State_AK')
ohe_column_names = ohe_encoder.get_feature_names_out()
train_ohe = pd.DataFrame(data=train_ohe, columns=ohe_column_names)
test_ohe = pd.DataFrame(data=test_ohe, columns=ohe_column_names)
train_ohe

Unnamed: 0,State_AK,State_AL,State_AR,State_AZ,State_CA,State_CO,State_CT,State_DC,State_FL,State_GA,...,Gender_f,Gender_m,Gender_male,Initial_admin_Elective Admission,Initial_admin_Emergency Admission,Initial_admin_Observation Admission,Services_Blood Work,Services_CT Scan,Services_Intravenous,Services_MRI
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
748,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [38]:
# build a selector that picks the numeric (integer and float) columns
num_selector = make_column_selector(dtype_include='number')
train_num_columns = num_selector(X_train)
test_num_columns = num_selector(X_test)

# isolate the numeric columns; reset_index(drop=True) gives them a fresh 0..n-1 index so that
# their rows line up with the one-hot encoded dataframes when the two are concatenated
train_nums = X_train[train_num_columns].reset_index(drop=True)
test_nums = X_test[test_num_columns].reset_index(drop=True)

# within each split, join the numeric columns and the one-hot encoded columns side by side (axis 1 = columns)
X_train_processed = pd.concat(objs=[train_nums, train_ohe], axis='columns')
X_test_processed = pd.concat(objs=[test_nums, test_ohe], axis='columns')
X_train_processed

Unnamed: 0,Lat,Lng,Children,Age,Income,ReAdmis,VitD_levels,Doc_visits,Full_meals_eaten,vitD_supp,...,Gender_f,Gender_m,Gender_male,Initial_admin_Elective Admission,Initial_admin_Emergency Admission,Initial_admin_Observation Admission,Services_Blood Work,Services_CT Scan,Services_Intravenous,Services_MRI
0,36.16307,-86.66510,2.0,60,8459.99,0,19.034162,5,1,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,34.96594,-87.12179,5.0,78,22669.31,0,15.903388,7,1,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,36.24648,-83.51232,1.0,60,25536.25,0,18.225040,4,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,45.42189,-97.91165,7.0,82,94863.57,0,15.809932,5,0,2,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,42.33661,-83.28292,0.0,37,30898.36,0,20.640410,5,1,0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,42.05701,-77.43901,1.0,32,4788.93,0,19.029312,6,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
746,40.47773,-86.38658,4.0,27,29461.62,0,15.293840,5,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
747,40.56510,-81.07429,0.0,57,79094.04,0,19.459084,5,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
748,33.97472,-118.35549,0.0,56,25697.12,0,15.871725,5,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [None]:
# All columns in our dataframe are now numeric with no data lost. 
# This dataframe can be used in a model, however, we may want to scale the numeric columns as well if we are using certain kinds of models. 

# Why can't we one-hot encode data before we split?
# The short answer is because it causes data leakage. Let's examine how.

# Sometimes there are very few, or even only 1 sample with a particular category in a feature. When we split the data, that sample could possibly end up in the testing set. 
# This would be analogous to a deployed production model encountering a category it had never seen before. 
# Should a column already exist for that category? We have no way of knowing (in theory) what these unseen categories might be, so we cannot create new columns for them.
# OneHotEncoder should only be used to create columns for categories that appear in the training data.

# Can't the encoded testing dataframes have a column for the new category, even if the training dataframes do not? NO!
# Later on we will be fitting models on our training data. Those models will only be able to make predictions on data with the SAME number of features as the data they were trained on. 
# If we try to use them to make a prediction on testing data that has a different number of columns, the models will throw an error. 
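# This is why we pass the argument handle_unknown='ignore' when instantiating the OneHotEncoder: a category that only appears in the testing data is encoded as all zeros instead of creating a new column or raising an error.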