In [1]:
# Build the random-forest model (training, evaluation, tuning and prediction).

In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

class BirdStrikeModelBuilder:
    """Train, evaluate, tune and query a RandomForestClassifier.

    Columns of ``X_train`` with ``object`` dtype are treated as categorical
    and one-hot encoded. Every method that hands data to the model uses the
    encoded frames, so training, tuning, evaluation and feature-importance
    plotting all see the same feature layout.

    Attributes:
        X_train, X_test: Raw feature frames as passed to the constructor.
        y_train, y_test: Target values matching the feature frames.
        model: The current estimator; replaced by the best estimator after
            ``tune_hyperparameters``.
        X_train_encoded, X_test_encoded: One-hot encoded feature frames, or
            ``None`` until the first encoding pass.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        """Store the train/test split and create an unfitted forest."""
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        # Filled in by _encode_features(); None means "not encoded yet".
        self.X_train_encoded = None
        self.X_test_encoded = None

    def _categorical_columns(self):
        """Return the names of the ``object``-dtype columns of ``X_train``."""
        return self.X_train.select_dtypes(include=['object']).columns

    def _encode_like_training(self, frame):
        """One-hot encode *frame* into the training feature layout.

        The frame is encoded without ``drop_first`` and then reindexed to the
        training columns. Dropping the first level independently would remove
        whichever category happens to sort first in *frame*, which need not be
        the training baseline; reindexing instead removes the baseline and
        unseen categories and zero-fills categories absent from *frame*.
        """
        encoded = pd.get_dummies(frame, columns=self._categorical_columns())
        return encoded.reindex(columns=self.X_train_encoded.columns, fill_value=0)

    def _encode_features(self):
        """(Re)build ``X_train_encoded`` and ``X_test_encoded``."""
        self.X_train_encoded = pd.get_dummies(
            self.X_train, columns=self._categorical_columns(), drop_first=True
        )
        self.X_test_encoded = self._encode_like_training(self.X_test)

    def _ensure_encoded(self):
        """Encode the features if no encoding pass has happened yet."""
        if (getattr(self, 'X_train_encoded', None) is None
                or getattr(self, 'X_test_encoded', None) is None):
            self._encode_features()

    def train_model(self):
        """Train the RandomForestClassifier on the data, handling categorical encoding inline."""
        # Always re-encode so changes to X_train / X_test are picked up.
        self._encode_features()
        self.model.fit(self.X_train_encoded, self.y_train)

    def evaluate_model(self):
        """Score the model on the encoded test set and print the metrics.

        Returns:
            A tuple ``(accuracy, confusion_matrix, classification_report)``.
        """
        print("Evaluating model...")
        self._ensure_encoded()
        # Predict on the encoded frame: the model was fitted on encoded
        # columns, so the raw X_test would not match its feature layout.
        y_pred = self.model.predict(self.X_test_encoded)
        accuracy = accuracy_score(self.y_test, y_pred)
        confusion = confusion_matrix(self.y_test, y_pred)
        report = classification_report(self.y_test, y_pred)

        print(f"Accuracy: {accuracy:.2%}")
        print("Confusion Matrix:")
        print(confusion)
        print("Classification Report:")
        print(report)

        return accuracy, confusion, report

    def plot_feature_importance(self):
        """Show a bar plot of the model's feature importances, largest first."""
        if not hasattr(self.model, 'feature_importances_'):
            print("⚠️ Model does not support feature importance.")
            return

        importance = self.model.feature_importances_

        # Prefer the names the model recorded at fit time; otherwise fall back
        # to the encoded training columns (not the raw ones, whose count
        # differs from the number of importances once dummies are created).
        feature_names = getattr(self.model, 'feature_names_in_', None)
        if feature_names is None:
            self._ensure_encoded()
            feature_names = self.X_train_encoded.columns
        if len(feature_names) != len(importance):
            feature_names = [f"feature_{i}" for i in range(len(importance))]

        importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
        importance_df = importance_df.sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(12, 6))
        sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')
        plt.title('Feature Importance')
        plt.show()

    def tune_hyperparameters(self):
        """Grid-search forest hyperparameters and keep the best estimator.

        Uses 5-fold cross-validation with accuracy scoring on the encoded
        training data, then replaces ``self.model`` with the best estimator.
        """
        param_grid = {
            'n_estimators': [50, 100, 150],
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2'],
        }

        grid_search = GridSearchCV(estimator=self.model, param_grid=param_grid,
                                   cv=5, n_jobs=-1, verbose=1, scoring='accuracy')

        # Fit on the encoded frame so the tuned model shares the feature
        # layout used by train_model / evaluate_model.
        self._ensure_encoded()
        grid_search.fit(self.X_train_encoded, self.y_train)

        self.model = grid_search.best_estimator_
        print(f"✅ Best Params: {grid_search.best_params_}")
        print(f"✅ Best Score: {grid_search.best_score_:.2f}")

    def predict(self, input_data):
        """Predict labels for one sample or a batch of samples.

        Args:
            input_data: A 1-D or 2-D array-like of already-encoded features,
                a ``pandas.Series`` holding one sample, or a
                ``pandas.DataFrame``. Frames that do not already contain every
                encoded training column are one-hot encoded first.

        Returns:
            The array returned by the model's ``predict``.

        Raises:
            ValueError: If no model is available.
        """
        if self.model is None:
            raise ValueError("❌ Model not trained yet. Train the model first.")

        # A Series is a single sample: turn it into a one-row frame and let
        # pandas recover the per-column dtypes lost by the transpose.
        if isinstance(input_data, pd.Series):
            input_data = input_data.to_frame().T.infer_objects()

        if isinstance(input_data, pd.DataFrame):
            encoded_columns = getattr(self, 'X_train_encoded', None)
            if encoded_columns is not None:
                encoded_columns = encoded_columns.columns
                if set(encoded_columns).issubset(input_data.columns):
                    # Already encoded: only enforce the training column order.
                    input_data = input_data[encoded_columns]
                else:
                    input_data = self._encode_like_training(input_data)
            return self.model.predict(input_data)

        # Plain array-likes: a single sample is reshaped to one row.
        input_data = np.asarray(input_data)
        if input_data.ndim == 1:
            input_data = input_data.reshape(1, -1)

        return self.model.predict(input_data)

# print(dir(BirdStrikeModelBuilder))
