# Modeling

In [1]:
# Optional environment setup: uncomment the line below to install/upgrade the
# packages this notebook imports.
# NOTE(review): versions are not pinned, so results may vary between environments.
#!pip install -U pandas pandas-profiling scikit-learn

## Load the train and test data

In [1]:
import pandas as pd

# Read both pre-split CSV files (train first, then test).
train_df, test_df = (
    pd.read_csv("../data/train.csv"),
    pd.read_csv("../data/test.csv"),
)

# Print the (rows, columns) of the training frame, then preview its first rows
# as the cell's displayed output.
print(train_df.shape)
train_df.head()

(267, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52.0,1.0,1.0,118.0,186.0,0.0,2.0,190.0,0.0,0.0,2.0,0.0,6.0,0
1,39.0,0.0,3.0,94.0,199.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0
2,60.0,1.0,4.0,130.0,206.0,0.0,2.0,132.0,1.0,2.4,2.0,2.0,7.0,1
3,39.0,1.0,3.0,140.0,321.0,0.0,2.0,182.0,0.0,0.0,1.0,0.0,3.0,0
4,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0


## Exploratory Data Analysis

In [2]:
from pandas_profiling import ProfileReport

In [3]:
# Build a pandas-profiling report over the training frame only
# (the test frame is not profiled here).
profile = ProfileReport(train_df)
# Render the report as interactive notebook widgets; this is the cell's output.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [4]:
# Feature groups consumed by the preprocessing step further down.
# Columns treated as categorical:
cat_cols = [
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exang",
    "slope",
    "ca",
    "thal",
]
# Columns treated as continuous:
cont_cols = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
]

## Split Features and Response

In [5]:
# For each split, separate the predictors (every column except "target")
# from the response (the "target" column itself).
X_train, y_train = train_df.drop(columns="target"), train_df["target"]
X_test, y_test = test_df.drop(columns="target"), test_df["target"]

## Data Transformations

In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [7]:
# Encoder for the categorical columns; drop="first" removes one indicator
# column per feature.
# NOTE(review): handle_unknown is left at its default, so categories not seen
# during fit are handled by that default — confirm this is acceptable, since
# the pipeline is first fit on only a 50-row subset.
ohe = OneHotEncoder(drop="first")

# Scaler for the continuous columns.
sc = StandardScaler()

# Route each column group to its transformer. Columns listed in neither group
# fall under ColumnTransformer's default `remainder` setting.
ct = ColumnTransformer(
    transformers=[
        ("One Hot Encoding", ohe, cat_cols),
        ("Scaling", sc, cont_cols),
    ]
)

## ML Model

In [8]:
# KNN Model
from sklearn.neighbors import KNeighborsClassifier
# No arguments are passed, so the classifier runs with the library's default
# hyperparameters.
knn = KNeighborsClassifier()

## Pipeline to combine feature engineering and ML model

In [9]:
# Sklearn pipeline
from sklearn.pipeline import Pipeline

# Chain the column transformations and the KNN classifier into one estimator,
# so preprocessing and the model are fit and applied together.
pipeline_model = Pipeline(
    steps=[
        ("Data Transformations", ct),
        ("KNN Model", knn),
    ]
)

## Fit the Pipeline Model locally
- We run it locally to ensure there are no bugs in the code!
- For this smoke test, we can just fit it on a small subset of the data

In [10]:
# To view the Pipeline model as a diagram
from sklearn import set_config
# Switch scikit-learn's estimator display mode to "diagram".
set_config(display="diagram")

In [11]:
# Local smoke test: fit the pipeline on the first 50 training rows only.
# The fitted pipeline is the cell's last expression, so it is displayed as output.
pipeline_model.fit(X_train.iloc[:50], y_train.iloc[:50])

In [13]:
# Score the fitted pipeline on both splits.
# Note: the pipeline above was fit on the first 50 training rows only, so the
# "training" score here also covers training rows the model was not fit on.
train_accuracy = pipeline_model.score(X_train, y_train)
test_accuracy = pipeline_model.score(X_test, y_test)

# Report both accuracies to four decimal places.
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

Training Accuracy: 0.8577
Testing Accuracy: 0.8000


## Fit the Pipeline Model on Sagemaker!
- Since the pipeline ran end-to-end locally without errors, we can train it on the full dataset!
- Sagemaker training allows us to scale training to large datasets!

## Deploy as Real Time Inference