# Random Forest Model

## Imports and Setup

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import joblib

In [None]:
# Raise pandas' display limits so wide/long frames are shown without truncation
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

## Read Data

In [None]:
# Load the training set from the data directory (path is relative to this notebook)
train_df = pd.read_csv(filepath_or_buffer="../data/train.csv")

In [None]:
# Preview the first five rows of the loaded data
train_df.head(n=5)

## Feature Engineering

In [None]:
# Model inputs: every column except "ID_code" and "target", in original order.
# errors="ignore" keeps this a pure filter even if one of the two is absent.
features = list(train_df.columns.drop(["ID_code", "target"], errors="ignore"))

In [None]:
# Row-wise summary statistics over the feature columns, one new column per
# statistic. The feature sub-frame is sliced once up front instead of once
# per statistic, so the (wide) column subset is not re-materialised eight times.
# NOTE: this must run while `features` still holds only the input columns,
# i.e. before the engineered names are appended to it.
raw_values = train_df[features]
for stat_name in ("sum", "min", "max", "mean", "std", "skew", "kurtosis", "median"):
    # Each name is both the DataFrame reduction method and the new column label
    train_df[stat_name] = getattr(raw_values, stat_name)(axis=1)

In [None]:
# Names of the engineered row-statistic columns
new_features = ["sum", "min", "max", "mean", "std", "skew", "kurtosis", "median"]
# Append the engineered columns to the model inputs. The list is rebuilt
# (rather than extended in place with `+=`) so that re-running this cell
# cannot add the same names twice and produce duplicate columns downstream.
features = [column for column in features if column not in new_features] + new_features

In [None]:
# Preview the engineered statistic columns for the first five rows
train_df.loc[:, new_features].head(n=5)

## Train Test Split

In [None]:
# Design matrix: the selected feature columns
X = train_df[features]
# Labels: the "target" column
y = train_df["target"]

In [None]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Train Random Forest Classifier

In [None]:
# Random forest: 200 trees capped at depth 10, with class weights set to
# "balanced"; seeded for reproducibility and run with n_jobs=-1
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Fit the forest on the training part of the split
rf.fit(X=X_train, y=y_train)

In [None]:
# Score each validation row with the probability in the last predict_proba
# column (presumably the positive class, target == 1 — confirm via rf.classes_)
val_scores = rf.predict_proba(X_val)[:, -1]
# ROC AUC on the validation set (shown as the cell output)
roc_auc_score(y_val, val_scores)

In [None]:
# Classification report for the hard class predictions on the validation set
val_predictions = rf.predict(X_val)
print(classification_report(y_val, val_predictions))

In [None]:
# Persist the fitted forest to the models directory (path is relative to this notebook)
joblib.dump(value=rf, filename="../ml/models/rf_model.joblib")