In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [18]:
# This dataset shows various patient information including their socioeconomic information along with their medical conditions that they are diagnosed with.
# Goal: The goal is to prepare this dataset with the preprocessing steps for machine learning. 
# The target will be the additional charges. 
# You will one hot encode the categorical columns and scale the numeric columns for the preprocessing steps. 
# Oncethe columns have been transformed, you will view the data as a pandas dataframe so that you can see how the columns were transformed.

df = pd.read_csv('C:/Users/User/Desktop/medical.csv')

y = df['Additional_charges']
X = df.drop(columns='Additional_charges')

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

In [19]:
# Make column selectors
# We are making two selectors: one for object data and the other for number data.

cat_selector = make_column_selector(dtype_include='object')

num_selector = make_column_selector(dtype_include='number')

In [20]:
# Instantiate transformers
# Now that you have made the selectors, you need to instantiate each of the column transformers you want to use. These are two
# common preprocessors you will often use in preprocessing, StandardScaler to scale the numeric columns and a OneHotEncoder to
# encode the categorical columns.

# The modeling classes you will be using later can process sparse matrices just fine, and sparse matrices are often necessary to conserve memory when the
# resulting dataframe would be very large. In previous examples we set 'sparse=False' because sparse arrays are very difficult to
# interpret visually and cannot be easily transformed in to Pandas dataframes. The ColumnTransformer used later returns the sparse
# matrix to a dense array internally when it concatenates it with the scaled numeric data during transformation.

scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore')


# The next step involves matching the columns with the transformation we want to apply. We are going to scale the numeric values for our
# numbers using the Standard Scaler. We are going one hot and code any categorical values for our categorical columns using the One
# Hot Encoder. We match these by creating tuples where the first element is the transformer and the second elements is either a list of
# columns or a ColumnSelector object.

num_tuple = (scaler, num_selector)

cat_tuple = (ohe, cat_selector)

In [15]:
# Instantiate the ColumnTransformer
# We now need to instantiate the column transformer and add each of the tuples we created above. Note that we have a new import to obtain the make_column_transformer.

from sklearn.compose import make_column_transformer

col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder='passthrough')


# Fit the transformer
# WE ONLY FIT ON THE TRAINING DATA

col_transformer.fit(X_train)



# Transform
# Nw we can transform botht the training and the test sets
X_train_processed = col_transformer.transform(X_train)
X_test_processed = col_transformer.transform(X_test)


# View the transformations
# While not necessary for modeling, it can be helpful to see what your code has accomplished. The output is an array which is difficult for
# us to read, but we can change it into a pandas dataframe to make it easier to view and interpret!
X_train_df = pd.DataFrame(X_train_processed)
X_train_df.head()


# Now all of our transformations are complete and we are ready for modeling! We can see that the numeric columns have been scaled
# and the categorical columns have been encoded as numbers. All of the columns are now listed as numeric columns and are ready for
# machine learning. 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.508205,0.281935,-0.060153,0.272586,-1.123467,0.0,0.509399,-0.008943,0.014639,-0.620174,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-0.720642,0.252836,1.241233,1.119125,-0.619881,0.0,-0.999823,1.907372,0.014639,-0.620174,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,-0.493403,0.482823,-0.493948,0.272586,-0.518276,0.0,0.119354,-0.9671,0.014639,-0.620174,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.134821,-0.434666,2.108824,1.307245,1.93872,0.0,-1.044875,-0.008943,-0.983474,2.762592,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.587322,0.497439,-0.927744,-0.809103,-0.32824,0.0,1.283708,-0.008943,0.014639,-0.620174,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
