# Random Forest Algorithm

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV


from creditcard_preparation import create_creditcard_pipeline, prepare_creditcard_data

In [4]:
# Step 3: baseline evaluation helpers, written with help from assignment 4

# Function to evaluate an algorithm


def evaluate_algo(algo, X_train, y_train, X_dev, y_dev):
    """Fit ``algo`` behind the credit-card preprocessing pipeline and score it.

    The estimator is wrapped in a two-step ``Pipeline`` (``'preprocessor'``
    followed by ``'algo'``), fitted on the training split, and then used to
    predict the development split.

    Parameters
    ----------
    algo : estimator
        Estimator placed as the final pipeline step; it is fitted by this call.
    X_train, y_train
        Features and labels used for fitting.
    X_dev, y_dev
        Features and labels used for scoring.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1]`` computed on the development split.
    """
    model = Pipeline(steps=[
        ('preprocessor', create_creditcard_pipeline()),
        ('algo', algo),
    ])
    model.fit(X_train, y_train)
    predictions = model.predict(X_dev)

    # The order is part of the contract: callers label the resulting columns
    # Accuracy, Precision, Recall, F1.
    metrics = (accuracy_score, precision_score, recall_score, f1_score)
    return [metric(y_dev, predictions) for metric in metrics]

# Function for LogisticRegression


def evaluate_lr(X_train, y_train, X_dev, y_dev):
    """Score a ``LogisticRegression`` (``max_iter=1000``, ``random_state=42``).

    Prints a progress line, then returns the ``[accuracy, precision, recall,
    f1]`` list produced by ``evaluate_algo`` for the given splits.
    """
    print("Evaluating LogisticRegression...")
    classifier = LogisticRegression(max_iter=1000, random_state=42)
    return evaluate_algo(classifier, X_train, y_train, X_dev, y_dev)

# Function for SVC


def evaluate_svc(X_train, y_train, X_dev, y_dev):
    """Score an ``SVC`` (``random_state=42``, other settings left at defaults).

    Prints a progress line, then returns the ``[accuracy, precision, recall,
    f1]`` list produced by ``evaluate_algo`` for the given splits.
    """
    print("Evaluating SVC...")
    classifier = SVC(random_state=42)
    return evaluate_algo(classifier, X_train, y_train, X_dev, y_dev)

# Function for KNeighborsClassifier


def evaluate_knn(X_train, y_train, X_dev, y_dev):
    """Score a ``KNeighborsClassifier`` constructed with default settings.

    Prints a progress line, then returns the ``[accuracy, precision, recall,
    f1]`` list produced by ``evaluate_algo`` for the given splits.
    """
    print("Evaluating KNeighborsClassifier...")
    classifier = KNeighborsClassifier()
    return evaluate_algo(classifier, X_train, y_train, X_dev, y_dev)

# Function for DecisionTreeClassifier


def evaluate_dt(X_train, y_train, X_dev, y_dev):
    """Score a ``DecisionTreeClassifier`` (``random_state=42``).

    Prints a progress line, then returns the ``[accuracy, precision, recall,
    f1]`` list produced by ``evaluate_algo`` for the given splits.
    """
    print("Evaluating DecisionTreeClassifier...")
    classifier = DecisionTreeClassifier(random_state=42)
    return evaluate_algo(classifier, X_train, y_train, X_dev, y_dev)

# Function for RandomForestClassifier


def evaluate_rf(X_train, y_train, X_dev, y_dev):
    """Score an untuned ``RandomForestClassifier`` (``random_state=42``).

    Prints a progress line, then returns the ``[accuracy, precision, recall,
    f1]`` list produced by ``evaluate_algo`` for the given splits.
    """
    print("Evaluating RandomForestClassifier...")
    classifier = RandomForestClassifier(random_state=42)
    return evaluate_algo(classifier, X_train, y_train, X_dev, y_dev)




# Split the credit card data into train / dev / test sets
# NOTE(review): `ratios` presumably holds the dev and test fractions; confirm
# against creditcard_preparation.prepare_creditcard_data.
X_train, X_dev, X_test, y_train, y_dev, y_test = prepare_creditcard_data(
    ratios=(1 / 10, 1 / 10),
)

# Score every baseline classifier on the development split
lr_scores = evaluate_lr(X_train, y_train, X_dev, y_dev)
svc_scores = evaluate_svc(X_train, y_train, X_dev, y_dev)
knn_scores = evaluate_knn(X_train, y_train, X_dev, y_dev)
dt_scores = evaluate_dt(X_train, y_train, X_dev, y_dev)
rf_scores = evaluate_rf(X_train, y_train, X_dev, y_dev)

# Gather the results in one table: one row per algorithm, one column per metric
scores_df = pd.DataFrame.from_dict(
    {
        'LogisticRegression': lr_scores,
        'SVC': svc_scores,
        'KNeighborsClassifier': knn_scores,
        'DecisionTreeClassifier': dt_scores,
        'RandomForestClassifier': rf_scores,
    },
    orient='index',
    columns=['Accuracy', 'Precision', 'Recall', 'F1'],
)

print(scores_df)

Evaluating LogisticRegression...
Evaluating SVC...
Evaluating KNeighborsClassifier...
Evaluating DecisionTreeClassifier...
Evaluating RandomForestClassifier...
                        Accuracy  Precision    Recall        F1
LogisticRegression      0.965320   0.976911  0.953117  0.964867
SVC                     0.997010   0.996275  0.997747  0.997010
KNeighborsClassifier    0.997731   0.995480  1.000000  0.997735
DecisionTreeClassifier  0.997978   0.997154  0.998803  0.997978
RandomForestClassifier  0.999877   0.999754  1.000000  0.999877


## Grid search for best hyperparameters

In [9]:

def evaluate_rf(X_train, y_train, X_dev, y_dev, param_grid=None, cv=5,
                scoring='accuracy', n_jobs=None):
    """Tune a ``RandomForestClassifier`` with grid search and score it on the dev split.

    NOTE: this definition shadows the untuned baseline ``evaluate_rf`` defined
    earlier in the notebook. Once this cell has run, the name refers to the
    grid-search version.

    The forest is placed behind the credit-card preprocessing pipeline, so each
    cross-validation fold fits the preprocessing on that fold's training part
    only. The best candidate is refitted on all of ``X_train`` (the
    ``GridSearchCV`` default) and then evaluated on the development split.

    Parameters
    ----------
    X_train, y_train
        Features and labels used for the cross-validated search and the refit.
    X_dev, y_dev
        Features and labels used for the final scoring.
    param_grid : dict, optional
        Grid passed to ``GridSearchCV``. Keys must carry the ``algo__`` prefix
        so that they address the ``'algo'`` pipeline step. When omitted, the
        notebook's original 8-candidate grid is used.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, default 'accuracy'
        Single metric used to rank candidates during the search.
    n_jobs : int, optional
        Parallelism forwarded to ``GridSearchCV``.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1]`` of the best estimator on the
        development split.
    """
    print("Evaluating RandomForestClassifier...")

    if param_grid is None:
        # Deliberately small grid (2 * 2 * 2 = 8 candidates) to keep the
        # search affordable.
        param_grid = {
            'algo__n_estimators': [50],
            'algo__max_depth': [None, 10],
            'algo__min_samples_split': [2, 5],
            'algo__min_samples_leaf': [1],
            # 'sqrt' rather than 'auto', which recent scikit-learn rejects
            'algo__max_features': ['sqrt'],
            'algo__criterion': ['gini', 'entropy'],
        }

    print("Training ...")

    # Preprocessing and classifier are tuned as a single estimator
    pipeline_with_algo = Pipeline(steps=[
        ('preprocessor', create_creditcard_pipeline()),
        ('algo', RandomForestClassifier(random_state=42)),
    ])

    grid_search = GridSearchCV(
        pipeline_with_algo,
        param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # Score the refitted best candidate on the held-out development split.
    # The metric order is part of the return contract.
    y_pred = grid_search.best_estimator_.predict(X_dev)
    metrics = (accuracy_score, precision_score, recall_score, f1_score)
    scores = [metric(y_dev, y_pred) for metric in metrics]

    # Report the search outcome; best_score_ is the cross-validated score of
    # the winning candidate, not the dev-split score returned below.
    print("Grid searching is done!")
    print("The best score: ", grid_search.best_score_)
    print("The best hyperparameters:")
    print(grid_search.best_params_)

    return scores

# Run the grid-searched Random Forest evaluation on the train / dev splits
rf_scores_tuned = evaluate_rf(
    X_train=X_train,
    y_train=y_train,
    X_dev=X_dev,
    y_dev=y_dev,
)
    


Evaluating RandomForestClassifier...
Training ...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Grid searching is done!
The best score:  0.9998593111156395
The best hyperparameters:
{'algo__criterion': 'gini', 'algo__max_depth': None, 'algo__max_features': 'sqrt', 'algo__min_samples_leaf': 1, 'algo__min_samples_split': 2, 'algo__n_estimators': 50}
