In [2]:
import pandas as pd
import seaborn as sns

from utils import GetTrainDF, GetTestDF

In [3]:
train_df = GetTrainDF()
test_df = GetTestDF()

In [4]:
train_df.isna().sum().sum(), test_df.isna().sum().sum()

(0, 0)

In [5]:
train_df.shape

(14838, 33)

In [6]:
from sklearn.model_selection import train_test_split

def get_Xy(target: str):

  if not target in ("EC1", "EC2"):
    raise ValueError("target must be EC1 or EC2")

  return train_test_split(
      train_df.drop(columns=["EC1", "EC2"], axis=1),
      train_df[target],
      test_size=0.2,
      random_state=42
  )

X_train1, X_test1, y_train1, y_test1 = get_Xy("EC1")
X_train2, X_test2, y_train2, y_test2 = get_Xy("EC2")

In [7]:
# build a 3 step pipeline:
# 1. normalize the data
# 2. select the best features
# 3. train a random forest classifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("selector", SelectKBest(f_classif)),
    ("rf", RandomForestClassifier())
])

In [8]:
# we will use grid search to find the best parameters for the pipeline
from sklearn.model_selection import GridSearchCV

# we will use the same parameters for both targets
params = {
    "selector__k": [20, 25, 30, "all"],
    "rf__n_estimators": [100, 120],
    "rf__max_depth": [None, 5, 7, 9],
    "rf__min_samples_split": [2, 4, 5],
    "rf__min_samples_leaf": [2, 3, 4, 5],
}

# we will use the same grid search for both targets
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="accuracy",
    n_jobs=-1,
    cv=4,
    verbose=1
)

from copy import deepcopy

# we will use the same grid search for both targets
ec1_grid = deepcopy(grid)
ec2_grid = deepcopy(grid)

In [9]:
# target: EC1

ec1_grid.fit(X_train1, y_train1)
ec1_params = ec1_grid.best_params_
print(f"EC1 best params: {ec1_params}")


Fitting 4 folds for each of 384 candidates, totalling 1536 fits


EC1 best params: {'rf__max_depth': 7, 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 2, 'rf__n_estimators': 100, 'selector__k': 20}


In [12]:
# create a new pipeline with the best parametersn for EC1
ec1_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("selector", RFE(estimator=RandomForestClassifier(), n_features_to_select=ec1_params["selector__k"])),
    ("rf", RandomForestClassifier(
        n_estimators=ec1_params["rf__n_estimators"],
        max_depth=ec1_params["rf__max_depth"],
        min_samples_split=ec1_params["rf__min_samples_split"],
        min_samples_leaf=ec1_params["rf__min_samples_leaf"]
    ))
])

# create a classification report
from sklearn.metrics import classification_report

ec1_pipe.fit(X_train1, y_train1)
print(classification_report(y_test1, ec1_pipe.predict(X_test1)))

              precision    recall  f1-score   support

       False       0.57      0.36      0.44       976
        True       0.73      0.87      0.79      1992

    accuracy                           0.70      2968
   macro avg       0.65      0.61      0.62      2968
weighted avg       0.68      0.70      0.68      2968



In [13]:
# target: EC2

ec2_grid.fit(X_train2, y_train2)
ec2_params = ec2_grid.best_params_
print(f"EC2 best params: {ec2_params}")

Fitting 4 folds for each of 384 candidates, totalling 1536 fits
EC2 best params: {'rf__max_depth': 9, 'rf__min_samples_leaf': 3, 'rf__min_samples_split': 5, 'rf__n_estimators': 120, 'selector__k': 20}


In [None]:
# create a new pipeline with the best parametersn for EC2
ec2_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("selector", RFE(estimator=RandomForestClassifier(), n_features_to_select=ec2_params["selector__k"])),
    ("rf", RandomForestClassifier(
        n_estimators=ec2_params["rf__n_estimators"],
        max_depth=ec2_params["rf__max_depth"],
        min_samples_split=ec2_params["rf__min_samples_split"],
        min_samples_leaf=ec2_params["rf__min_samples_leaf"]
    ))
])

ec2_pipe.fit(X_train2, y_train2)
print(classification_report(y_test2, ec2_pipe.predict(X_test2)))