In [9]:
# Handling categorical values
# Handling missing values
# Train/test split

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

class BirdStrikeModelPreprocessing:
    """Prepare a DataFrame with a 'Damage' target column for modelling.

    The steps are exposed as separate methods and are meant to be called by
    the user in order: ``encode_categorical`` turns object and datetime
    columns into numbers, ``split_data`` separates the target from the
    features and creates a train/test split, and ``scale_features`` scales
    the numeric features with a ``StandardScaler``.

    Attributes:
        data: Working copy of the input frame; modified in place by the steps.
        target_column: Name of the target column (``'Damage'``).
        scaler: The ``StandardScaler`` fitted by ``scale_features``.
        label_encoders: Fitted ``LabelEncoder`` per encoded column name, kept
            so that integer codes can be mapped back to the original labels.
    """

    def __init__(self, data):
        """Copy *data* and strip whitespace from its column names.

        Args:
            data: Input ``pandas.DataFrame``. It is copied, so the caller's
                frame is never modified.
        """
        self.data = data.copy()
        self.target_column = 'Damage'
        self.scaler = StandardScaler()
        self.label_encoders = {}

        # Remove leading/trailing spaces from column names
        self.data.columns = self.data.columns.str.strip()

    def encode_categorical(self):
        """Replace object and datetime columns of ``self.data`` with numbers.

        Object columns are label-encoded on their string representation, so a
        missing value in a *feature* column becomes its own category. Missing
        values in the *target* column are kept as NaN instead (the column is
        then float), so that ``split_data`` can still drop those rows rather
        than training on a spurious "nan" class.

        Datetime columns are converted to whole seconds since 1970-01-01;
        NaT becomes NaN.

        Encoding is best-effort: a column that fails is reported on stdout
        and left unchanged, and processing continues with the next column.
        """
        categorical_columns = self.data.select_dtypes(include=['object']).columns
        datetime_columns = self.data.select_dtypes(include=['datetime64']).columns

        # Encode categorical columns
        for col in categorical_columns:
            try:
                self.data[col] = self._label_encode(col)
            except Exception as e:
                print(f"⚠️ Error encoding column '{col}': {e}")

        # Convert datetime columns to epoch timestamps (if applicable)
        for col in datetime_columns:
            try:
                self.data[col] = self._to_epoch_seconds(self.data[col])
            except Exception as e:
                print(f"⚠️ Error converting datetime column '{col}': {e}")

        print("✅ Categorical and datetime features processed.")

    def _label_encode(self, col):
        """Return label codes for column *col*, remembering the fitted encoder."""
        column = self.data[col]
        present = column.notna().to_numpy()
        encoder = LabelEncoder()

        if col != self.target_column or present.all():
            codes = encoder.fit_transform(column.astype(str))
            self.label_encoders[col] = encoder
            return codes

        # Target column with gaps: encode only the known labels and leave the
        # rest as NaN so the rows can be dropped before splitting.
        codes = np.full(len(column), np.nan)
        if present.any():
            codes[present] = encoder.fit_transform(column[present].astype(str))
            self.label_encoders[col] = encoder
        return codes

    @staticmethod
    def _to_epoch_seconds(column):
        """Return whole seconds since the Unix epoch for a datetime Series."""
        # Timedelta arithmetic is independent of the column's time resolution
        # and of the platform's default integer width, and maps NaT to NaN.
        return (column - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")

    def scale_features(self, X_train, X_test):
        """Scale the numeric columns of the train and test feature frames.

        Non-numeric columns are dropped and remaining NaNs are replaced by 0
        before scaling. The scaler is fitted on the training data only and
        then applied to the test data.

        Args:
            X_train: Training features as a DataFrame.
            X_test: Test features as a DataFrame.

        Returns:
            Tuple ``(X_train_scaled, X_test_scaled)`` as returned by the
            scaler's ``fit_transform`` / ``transform``.

        Raises:
            ValueError: If scaling fails; the original error is chained.
        """
        # Drop any non-numeric columns after encoding
        X_train = X_train.select_dtypes(include=[np.number])
        X_test = X_test.select_dtypes(include=[np.number])

        # Handle NaNs after type conversion
        X_train = X_train.fillna(0)
        X_test = X_test.fillna(0)

        # Scale numeric data
        try:
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)
        except Exception as e:
            raise ValueError(f"❌ Error scaling features: {e}") from e

        print("✅ Features scaled.")
        return X_train_scaled, X_test_scaled

    def split_data(self, test_size=0.1, random_state=42):
        """Split ``self.data`` into train and test features and targets.

        Rows whose target value is missing are removed from ``self.data``
        first.

        Args:
            test_size: Passed to ``train_test_split`` (default 0.1).
            random_state: Seed passed to ``train_test_split`` (default 42).

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.

        Raises:
            ValueError: If the target column is absent or the split fails;
                in the latter case the original error is chained.
        """
        # Ensure target column is present
        if self.target_column not in self.data.columns:
            raise ValueError(f"❌ Target column '{self.target_column}' not found in data.")

        # Remove rows with missing target values
        self.data = self.data.dropna(subset=[self.target_column])

        # Separate features and target
        X = self.data.drop(columns=[self.target_column])
        y = self.data[self.target_column]

        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=random_state
            )
        except Exception as e:
            raise ValueError(f"❌ Error splitting data: {e}") from e

        print("✅ Data split into train and test sets.")
        return X_train, X_test, y_train, y_test

