### RW ML Kits

https://github.com/ravindutw

© Ravindu Wijesundara

# Random Forest Classification Template

#### Version 1.0

In [None]:
#import warnings
#warnings.filterwarnings("ignore")

In [None]:
# Run following on Amazon SageMaker
#!conda install -c conda-forge py-xgboost-gpu -y

In [None]:
# Run following on Google Colab
#!pip install s3fs

In [None]:
import pandas as pd

### Import data

In [None]:
# Load the dataset into a DataFrame.
# Replace "LOCATION" with the data location, local or S3
df = pd.read_parquet("LOCATION")
# Alternative loader for CSV data:
#df = pd.read_csv("LOCATION")

In [None]:
# Remove the column display limit so the preview below is not truncated
pd.set_option('display.max_columns', None)
df.head()

In [None]:
# Print a summary of the DataFrame to check dtypes and missing values
df.info()

## Preprocessing

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import make_column_selector
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Name of the label column; it is used below to split df into features (X) and labels (y)
target_column = '' # Replace with the target column

In [None]:
# Separate the label series from the feature matrix
y = df[target_column]
X = df.drop(columns=target_column)

In [None]:
# LabelEncoder numbers the classes in sorted order, whereas y.unique() returns them
# in order of first appearance. Sort the names so that index i here is the class
# encoded as i; otherwise report rows and confusion-matrix labels can be mislabeled.
# Names are cast to str because classification_report's target_names must be strings.
class_names = [str(label) for label in sorted(y.unique())]

In [None]:
column_names = X.columns.tolist()  # All feature column names
numerical_cols = X.select_dtypes(include='number').columns.tolist()  # Numerical feature columns
# Categorical feature columns. pandas 'category' columns are included as well:
# with include='object' alone they match neither list and are silently removed
# by the preprocessor's remainder='drop'.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [None]:
# --- Feature routing: edit the lists below to suit the dataset ---

drop_features = []  # Features to drop

ordinal_encode_column_features = []  # Features to ordinal encode

# Target encoding: list the features explicitly, or keep the automatic selection of
# every categorical column that is not ordinal encoded
target_encode_features = [
    col for col in cat_cols
    if col not in ordinal_encode_column_features
]
#target_encode_features = []

# One-hot encoding: automatically takes the categorical columns claimed by neither
# list above (empty while the automatic target-encode selection is in use)
oh_encode_features = [
    col for col in cat_cols
    if col not in (ordinal_encode_column_features + target_encode_features)
]
#oh_encode_features = []

pass_through_features = []  # Columns to pass through unchanged (if any)

In [None]:
# Columns to ordinal encode, minus anything listed in drop_features
ordinal_encode_column_names = [
    feature
    for feature in ordinal_encode_column_features
    if feature not in drop_features
]
ordinal_encoder = OrdinalEncoder()

In [None]:
# Columns to target encode, minus anything listed in drop_features
target_encode_column_names = [
    feature
    for feature in target_encode_features
    if feature not in drop_features
]
# Change params accordingly
target_encoder = TargetEncoder(
    target_type='binary',
    cv=5,
    smooth='auto',
    random_state=42,
)

In [None]:
# Columns to one-hot encode, minus anything listed in drop_features
oh_encode_column_names = [
    feature
    for feature in oh_encode_features
    if feature not in drop_features
]
# Change params accordingly
oh_encoder = OneHotEncoder(
    sparse_output=False,
    drop=None,
    handle_unknown='ignore',
)

In [None]:
# Variance filter applied to the numerical columns in the preprocessor;
# with threshold=0.0 it removes zero-variance (constant) features
drop_low_var_cols = VarianceThreshold(threshold=0.0)

#### Preprocessor

In [None]:
# Apply drop_features to the numerical columns too, as is already done for the
# encoder column lists; without this, a numerical feature listed in drop_features
# would still be routed through the variance filter and reach the model.
numerical_column_names = [c for c in numerical_cols if c not in drop_features]

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal_enc', ordinal_encoder, ordinal_encode_column_names),
        ('target_enc', target_encoder, target_encode_column_names),
        ('oh_enc', oh_encoder, oh_encode_column_names),
        ('drop_low_var', drop_low_var_cols, numerical_column_names),
        ('passthrough', 'passthrough', pass_through_features)
    ],
    remainder='drop',  # Drop every column that is not listed in a transformer above
    verbose_feature_names_out=True
)

# Comment out unnecessary transformers

In [None]:
# Encode y: replace the class labels with integer codes.
# The fitted encoder is kept so the codes can be mapped back to the original labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

## Train-test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split into train and test sets using the default split sizes; random_state is fixed
# so the split is reproducible.
# NOTE(review): the split is not stratified — consider stratify=y for imbalanced targets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Model - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default hyperparameters; random_state is fixed for reproducibility
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Chain the preprocessing and the classifier so they are fitted and applied together
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rfc', rfc),
])

In [None]:
# Fit the preprocessing steps and the classifier on the training split
pipeline.fit(X_train, y_train)

### Evaluation

In [None]:
from sklearn.metrics import (ConfusionMatrixDisplay, classification_report, confusion_matrix)
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Predict class codes for the held-out test features
preds = pipeline.predict(X_test)

In [None]:
def test_model(y_test, y_pred, class_names):
    """Print a classification report and plot the confusion matrix.

    Args:
        y_test: True labels of the evaluation set.
        y_pred: Predicted labels for the same samples.
        class_names: Class display names, passed as ``target_names`` to the
            report and as ``display_labels`` to the confusion-matrix plot.
    """
    print("Classification Report:")
    report_text = classification_report(y_test, y_pred, target_names=class_names)
    print(report_text)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")

    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=class_names,
    )
    matrix_display.plot(cmap=plt.cm.plasma)
    plt.title("Confusion matrix")
    plt.show()

In [None]:
# Report test-set metrics and plot the confusion matrix for the pipeline's predictions
test_model(y_test, preds, class_names)

### SHAP

In [None]:
import shap

In [None]:
# Retrieve the classifier step from the pipeline for use with SHAP
rfc = pipeline.named_steps['rfc']

In [None]:
# Explain a reproducible sample of at most 200 test rows. Capping the size at
# len(X_test) keeps sample() from raising ValueError on small test sets.
X_shap = X_test.sample(n=min(200, len(X_test)), random_state=42)
# Run the sample through the pipeline's preprocessor so it matches the model's input
X_shap_transformed = pipeline.named_steps["preprocessor"].transform(X_shap)

In [None]:
# Output feature names of the preprocessor, used to label the transformed columns
feature_names = pipeline.named_steps["preprocessor"].get_feature_names_out()

In [None]:
# Wrap the transformed sample in a DataFrame so the SHAP plots show feature names
X_shap_exp = pd.DataFrame(data=X_shap_transformed, columns=feature_names)

In [None]:
# Build a SHAP tree explainer around the random forest taken from the pipeline
explainer = shap.TreeExplainer(rfc)

In [None]:
# Compute SHAP values for the preprocessed sample
shap_values = explainer.shap_values(X_shap_transformed)

# Depending on the SHAP version, a classifier yields either a list with one
# (samples, features) array per class or a single (samples, features, classes)
# array. Stack the list form so the [:, :, class] indexing used by the plots works
# with both.
if isinstance(shap_values, list):
    shap_values = np.stack(shap_values, axis=-1)

In [None]:
# Summary plot of the SHAP values at index 1 of the last axis, showing up to 50 features.
# NOTE(review): the last axis is presumably the class axis, so this explains the class
# encoded as 1 — confirm, and change the index for other classes.
shap.summary_plot(shap_values[:, :, 1], X_shap_exp, max_display=50)

In [None]:
# Bar-style summary plot for the same slice of SHAP values, showing up to 50 features.
# NOTE(review): index 1 of the last axis is presumably the class encoded as 1 — confirm.
shap.summary_plot(shap_values[:, :, 1], X_shap_exp, plot_type="bar", max_display=50)