# Random Forest Algorithm

In [3]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV


from creditcard_preparation import create_creditcard_pipeline, prepare_creditcard_data

In [4]:
# step 3 using help from assignment 4

# Function to evaluate an algorithm


def evaluate_algo(algo, X_train, y_train, X_dev, y_dev):
    """Fit ``algo`` behind the credit card preprocessor and score it on the dev set.

    Args:
        algo: Unfitted estimator placed as the final ``'algo'`` step.
        X_train, y_train: Data the combined pipeline is fitted on.
        X_dev, y_dev: Held-out data used for prediction and scoring.

    Returns:
        list: ``[accuracy, precision, recall, f1]`` for the dev-set predictions,
        each computed with the scikit-learn metric's default arguments.
    """
    # Preprocessing and the estimator are chained so both are fitted on the
    # training data only.
    model = Pipeline(steps=[
        ('preprocessor', create_creditcard_pipeline()),
        ('algo', algo),
    ])

    dev_predictions = model.fit(X_train, y_train).predict(X_dev)

    # Order of the scorers defines the order of the returned list.
    return [
        scorer(y_dev, dev_predictions)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    ]



# Function for RandomForestClassifier


def evaluate_rf(X_train, y_train, X_dev, y_dev):
    """Evaluate a default RandomForestClassifier (seeded with 42) on the dev set.

    Prints a progress message, then delegates to ``evaluate_algo`` and returns
    its ``[accuracy, precision, recall, f1]`` list unchanged.
    """
    print("Evaluating RandomForestClassifier...")
    forest = RandomForestClassifier(random_state=42)
    return evaluate_algo(forest, X_train, y_train, X_dev, y_dev)




# Load the credit card data as train / dev / test splits

X_train, X_dev, X_test, y_train, y_dev, y_test = prepare_creditcard_data(ratios=(0.1, 0.1))




# Score the Random Forest on the dev set
rf_scores = evaluate_rf(X_train, y_train, X_dev, y_dev)

# One-row table: the metrics are the columns, the model name is the row label
scores_df = pd.DataFrame(
    data=[rf_scores],
    index=['RandomForestClassifier'],
    columns=['Accuracy', 'Precision', 'Recall', 'F1'],
)

print(scores_df)

Evaluating RandomForestClassifier...


KeyboardInterrupt: 

## Grid search for best hyperparameters

In [9]:

def evaluate_rf(X_train, y_train, X_dev, y_dev, param_grid=None, cv=5,
                scoring='accuracy', n_jobs=None):
    """Tune a RandomForestClassifier with grid search and score it on the dev set.

    NOTE: running this cell replaces the simpler ``evaluate_rf`` defined earlier
    in the notebook, because both definitions share the same name.

    Args:
        X_train, y_train: Data used for the cross-validated grid search and
            for refitting the best candidate.
        X_dev, y_dev: Held-out data used to score the best estimator.
        param_grid: Optional grid for ``GridSearchCV``. Keys must use the
            ``algo__`` prefix because the classifier is the ``'algo'`` step of
            the pipeline. Defaults to the small 8-candidate grid below.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        scoring: Single metric used to rank candidates.
        n_jobs: Parallelism forwarded to ``GridSearchCV`` (``None`` keeps the
            scikit-learn default).

    Returns:
        list: ``[accuracy, precision, recall, f1]`` of the best estimator on
        the dev set.
    """
    print("Evaluating RandomForestClassifier...")

    if param_grid is None:
        # Deliberately small default grid to keep the search affordable
        param_grid = {
            'algo__n_estimators': [50],
            'algo__max_depth': [None, 10],
            'algo__min_samples_split': [2, 5],
            'algo__min_samples_leaf': [1],
            'algo__max_features': ['sqrt'],  # Use 'sqrt' instead of 'auto'
            'algo__criterion': ['gini', 'entropy']
        }

    print("Training ...")

    # Preprocessing lives inside the searched pipeline, so it is refitted on
    # the training part of every fold.
    pipeline_with_algo = Pipeline(steps=[
        ('preprocessor', create_creditcard_pipeline()),
        ('algo', RandomForestClassifier(random_state=42))
    ])

    # Perform Grid Search
    grid_search = GridSearchCV(
        pipeline_with_algo,
        param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # Best candidate (GridSearchCV refits it by default after the search)
    best_estimator = grid_search.best_estimator_

    # Evaluate on development set
    y_pred = best_estimator.predict(X_dev)
    accuracy = accuracy_score(y_dev, y_pred)
    precision = precision_score(y_dev, y_pred)
    recall = recall_score(y_dev, y_pred)
    f1 = f1_score(y_dev, y_pred)

    # Report the cross-validated score and the winning hyperparameters
    print("Grid searching is done!")
    print("The best score: ", grid_search.best_score_)
    print("The best hyperparameters:")
    print(grid_search.best_params_)

    return [accuracy, precision, recall, f1]

# Run the grid-search evaluation and keep the dev-set scores of the best model
rf_scores_tuned = evaluate_rf(X_train=X_train, y_train=y_train, X_dev=X_dev, y_dev=y_dev)
    


Evaluating RandomForestClassifier...
Training ...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Grid searching is done!
The best score:  0.9998593111156395
The best hyperparameters:
{'algo__criterion': 'gini', 'algo__max_depth': None, 'algo__max_features': 'sqrt', 'algo__min_samples_leaf': 1, 'algo__min_samples_split': 2, 'algo__n_estimators': 50}


### Test model on test set using chosen hyperparameters

In [5]:


# Load the credit card data as train / dev / test splits.
# NOTE(review): this repeats the prepare_creditcard_data call made earlier in the
# notebook. If that helper does not split deterministically, the test set used
# here may overlap with data seen during the grid search — confirm.
X_train, X_dev, X_test, y_train, y_dev, y_test = prepare_creditcard_data(ratios=((1/10), (1/10)))

# Create the preprocessing pipeline
pipeline = create_creditcard_pipeline()

# Random Forest with the hyperparameters found during grid search.
# random_state=42 matches the estimator that was tuned, so the final model is
# the same configuration that was selected and the test metrics are reproducible.
RF_model = RandomForestClassifier(
    n_estimators=50,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42
)

# Combine the pipeline and the algorithm
pipeline_with_algo = Pipeline(steps=[
    ('preprocessor', pipeline),
    ('algo', RF_model)
])

# Fit the pipeline on the training data
pipeline_with_algo.fit(X_train, y_train)

# Make predictions on the test data
y_test_pred = pipeline_with_algo.predict(X_test)

In [6]:
# Score the test-set predictions with each metric, in reporting order
accuracy, precision, recall, f1 = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)


# One line per metric, same text as printing each label/value pair separately
print(
    "Accuracy: " + str(accuracy),
    "Precision: " + str(precision),
    "Recall: " + str(recall),
    "F1 Score: " + str(f1),
    sep="\n",
)

Accuracy: 0.9998417248474404
Precision: 0.9996841772818191
Recall: 1.0
F1 Score: 0.999842063700974
