In [1]:
import os
import warnings
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    AdaBoostClassifier, AdaBoostRegressor,
    GradientBoostingClassifier, GradientBoostingRegressor
)
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR

from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler,
    LabelEncoder, OneHotEncoder,
    QuantileTransformer, PowerTransformer
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    classification_report, accuracy_score, f1_score, precision_score
)

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Echo an inventory of the names imported above: one output line per
# category, emitted through a single print call separated by newlines.
print(
    "Core Libraries: pandas, numpy, matplotlib.pyplot, seaborn, warnings",
    "Train/Test Split: train_test_split, GridSearchCV, RandomizedSearchCV",
    "Models: "
    "GaussianNB, BernoulliNB, MultinomialNB, "
    "DecisionTreeClassifier, DecisionTreeRegressor, "
    "RandomForestClassifier, RandomForestRegressor, "
    "AdaBoostClassifier, AdaBoostRegressor, "
    "GradientBoostingClassifier, GradientBoostingRegressor, "
    "XGBClassifier, XGBRegressor, "
    "LogisticRegression, LinearRegression, "
    "KNeighborsClassifier, KNeighborsRegressor, "
    "SVC, SVR",
    "Preprocessing: "
    "StandardScaler, MinMaxScaler, "
    "LabelEncoder, OneHotEncoder, "
    "QuantileTransformer, PowerTransformer, "
    "ColumnTransformer, Pipeline",
    "Metrics: "
    "mean_squared_error, mean_absolute_error, r2_score, "
    "classification_report, accuracy_score, f1_score, precision_score",
    "Other: pickle",
    sep="\n",
)


Train/Test Split: train_test_split, GridSearchCV, RandomizedSearchCV
Models: GaussianNB, BernoulliNB, MultinomialNB, DecisionTreeClassifier, DecisionTreeRegressor, RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor, XGBClassifier, XGBRegressor, LogisticRegression, LinearRegression, KNeighborsClassifier, KNeighborsRegressor, SVC, SVR
Preprocessing: StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, QuantileTransformer, PowerTransformer, ColumnTransformer, Pipeline
Metrics: mean_squared_error, mean_absolute_error, r2_score, classification_report, accuracy_score, f1_score, precision_score
Other: pickle


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

def train_random_forest(
    data: pd.DataFrame,
    target: str,
    *,
    param_grid=None,
    test_size: float = 0.3,
    n_splits: int = 5,
    scoring: str = "accuracy",
    random_state: int = 42,
    verbose: int = 2,
):
    """Tune a random-forest classifier with grid search and score it on a hold-out set.

    The frame is split into a stratified train/test partition. A pipeline
    (one-hot encoding of object/category columns, passthrough of every other
    column, then a class-weight-balanced ``RandomForestClassifier``) is tuned
    with ``GridSearchCV`` on the training part only, and the refit best
    pipeline is evaluated on the test part. Results are also printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column.
    target : str
        Name of the column in ``data`` to predict.
    param_grid : dict, optional
        Grid passed to ``GridSearchCV``; keys must use the ``model__`` prefix
        because the forest is the pipeline step named ``"model"``. When
        omitted, the built-in grid is used (3 * 4 * 3 * 3 * 2 = 216
        combinations, each fitted ``n_splits`` times).
    test_size : float, default 0.3
        Hold-out share forwarded to ``train_test_split``.
    n_splits : int, default 5
        Number of stratified, shuffled cross-validation folds.
    scoring : str, default "accuracy"
        Metric ``GridSearchCV`` uses to select the best combination.
    random_state : int, default 42
        Seed shared by the forest, the fold shuffling and the train/test split.
    verbose : int, default 2
        Verbosity forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_model, best_params, metrics)`` where ``best_model`` is the
        grid search's ``best_estimator_`` pipeline, ``best_params`` its
        ``best_params_`` dict, and ``metrics`` is
        ``{"accuracy": ..., "f1": ...}`` measured on the test split (F1 is
        weighted by class support).

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    ValueError
        If ``data`` has no columns other than ``target``.
    """
    # Fail fast with a clear message instead of an opaque error from deep
    # inside pandas / scikit-learn.
    if target not in data.columns:
        raise KeyError(f"Target column {target!r} not found in data")

    # Split data into features and target
    X = data.drop(columns=[target])
    y = data[target]
    if X.shape[1] == 0:
        raise ValueError(
            "data must contain at least one feature column besides the target"
        )

    # Identify categorical and numerical features
    categorical_dtypes = ["object", "category"]
    categorical_features = X.select_dtypes(include=categorical_dtypes).columns.tolist()
    numeric_features = X.select_dtypes(exclude=categorical_dtypes).columns.tolist()

    # Preprocessor: OneHotEncode categorical, passthrough everything else.
    # handle_unknown="ignore" keeps prediction from failing on categories
    # that were absent from the training folds.
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ("num", "passthrough", numeric_features),
        ]
    )

    # Random Forest with balanced class weights.
    # NOTE(review): both the forest and the grid search below request
    # n_jobs=-1; nested parallelism may oversubscribe CPUs on large runs --
    # consider parallelising only one level. Behaviour left unchanged here.
    rf = RandomForestClassifier(
        random_state=random_state, class_weight="balanced", n_jobs=-1
    )

    # Pipeline: preprocessing + model, so encoding is re-fit inside every CV
    # fold and never sees validation rows.
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", rf)])

    # Built lazily so callers never share (or mutate) a default dict.
    if param_grid is None:
        param_grid = {
            "model__n_estimators": [100, 200, 500],
            "model__max_depth": [None, 10, 20, 30],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 2, 4],
            "model__max_features": ["sqrt", "log2"],
        }

    # Stratified CV keeps class proportions in every fold
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=verbose,
    )

    # Stratified train/test split; the test part is never seen by the search
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Fit the grid search (the best combination is refit on all of X_train)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Evaluate on the held-out test set
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    print("Best Hyperparameters:", best_params)
    print(f"Test Accuracy: {acc:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    # Concatenate rather than pass two print arguments: the default " "
    # separator would shift the report's header row out of column alignment.
    print("\nClassification Report:\n" + classification_report(y_test, y_pred))

    return best_model, best_params, {"accuracy": acc, "f1": f1}
