In [None]:
import pandas as pd

# Input CSV and the directory that receives the generated text file.
file_path = r'D:\GUVI_Projects\My_Projects\Crimes_-_2001_to_Present.csv'
save_path = r'D:\GUVI_Projects\My_Projects'

# Read the full CSV into memory (low_memory=True is passed explicitly).
print("Loading dataset...")
df = pd.read_csv(file_path, low_memory=True)
print("\nDataset loaded successfully!")

# Schema overview; DataFrame.info() prints its report itself.
print("\nBasic Information:")
df.info()

# Peek at the leading rows.
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Per-column count of missing cells.
print("\nMissing values in each column:")
print(df.isna().sum())

# (rows, columns)
print("\nDataset dimensions:", df.shape)

# Write one column name per line to columns_list.txt inside save_path.
columns_file = f"{save_path}\\columns_list.txt"
print("\nSaving column names to:", columns_file)
with open(columns_file, 'w') as out:
    out.writelines(column + '\n' for column in df.columns)

print("Column names saved successfully!")

# describe() summarises the numeric columns by default.
print("\nSummary statistics:")
print(df.describe())


In [None]:
import pandas as pd

class DataProcessor:
    """Clean the raw crime CSV and export the cleaned table plus a value listing.

    ``process_data`` runs the whole sequence: load the CSV, downcast dtypes,
    fill missing values, parse the date columns, drop the coordinate columns
    and write two CSV files into the target directory.
    """

    def __init__(self, file_path):
        """Store the CSV path; the file is only read by ``load_data``."""
        self.file_path = file_path
        self.df = None  # populated by load_data()

    def load_data(self):
        """Read the CSV at ``self.file_path`` into ``self.df``."""
        print("Loading dataset...")
        self.df = pd.read_csv(self.file_path, low_memory=True)
        print("Dataset loaded successfully!")

    def inspect_data(self):
        """Print schema info, the first rows, missing-value counts and the shape."""
        print("\nBasic Information:")
        self.df.info()
        print("\nFirst 5 rows of the dataset:")
        print(self.df.head())
        print("\nMissing values in each column:")
        print(self.df.isnull().sum())
        print("\nDataset dimensions:", self.df.shape)

    @staticmethod
    def reduce_memory(df):
        """Downcast numeric columns and turn object columns into categoricals.

        The frame is modified in place and also returned.
        """
        for col in df.select_dtypes(include=['float64']):
            df[col] = pd.to_numeric(df[col], downcast='float')
        for col in df.select_dtypes(include=['int64']):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        for col in df.select_dtypes(include=['object']):
            df[col] = df[col].astype('category')
        return df

    def handle_missing_values(self):
        """Fill missing cells: 0 for numbers, 'Unknown' for text, False for bools."""
        # 'number' matches every numeric width.  The narrower selection
        # ['float', 'int'] skipped the float32 / int8 / int16 columns produced
        # by reduce_memory(), leaving their NaNs in place.
        numeric_cols = self.df.select_dtypes(include=['number']).columns
        self.df[numeric_cols] = self.df[numeric_cols].fillna(0)

        # Fill missing object / category columns with 'Unknown'
        object_cols = self.df.select_dtypes(include=['object', 'category']).columns
        for col in object_cols:
            values = self.df[col]
            if values.dtype.name == 'category':
                # add_categories raises ValueError for an existing category,
                # so only register 'Unknown' when it is not there yet.
                if 'Unknown' not in values.cat.categories:
                    values = values.cat.add_categories(['Unknown'])
                self.df[col] = values.fillna('Unknown')
            else:
                self.df[col] = values.fillna('Unknown')

        # Fill missing boolean columns with False
        bool_cols = self.df.select_dtypes(include=['bool']).columns
        self.df[bool_cols] = self.df[bool_cols].fillna(False)

    def handle_dates(self):
        """Parse 'Date' and 'Updated On' as datetimes; bad values become 2000-01-01.

        Columns missing from the frame are skipped, in line with
        ``drop_unnecessary_columns`` which also ignores absent columns.
        """
        placeholder = pd.Timestamp('2000-01-01')
        for col in ('Date', 'Updated On'):
            if col not in self.df.columns:
                continue
            # errors='coerce' turns unparseable entries into NaT, which the
            # placeholder then replaces.
            parsed = pd.to_datetime(self.df[col], errors='coerce')
            self.df[col] = parsed.fillna(placeholder)

    def drop_unnecessary_columns(self):
        """Drop the 'X Coordinate' and 'Y Coordinate' columns when present."""
        columns_to_drop = ['X Coordinate', 'Y Coordinate']
        self.df = self.df.drop(columns=columns_to_drop, errors='ignore')

    def save_cleaned_data(self, save_path):
        """Write ``self.df`` to ``cleaned_crime.csv`` inside ``save_path``."""
        import os  # local import: keeps this block self-contained

        # os.path.join uses the platform separator instead of a literal '\\'.
        cleaned_file = os.path.join(save_path, 'cleaned_crime.csv')
        print("\nSaving cleaned data to:", cleaned_file)
        self.df.to_csv(cleaned_file, index=False)
        print("Cleaned data saved successfully!")

    def save_column_mappings(self, save_path):
        """Write one row per column listing its distinct values.

        The output file is ``cleaned_mappings_crime.csv`` inside ``save_path``
        with the columns "Column" and "Unique Values".
        """
        import os  # local import: keeps this block self-contained

        csv_file = os.path.join(save_path, 'cleaned_mappings_crime.csv')
        print("\nSaving column mappings to:", csv_file)
        mappings = []
        for column in self.df.columns:
            series = self.df[column]
            # Categoricals already carry their distinct values as categories.
            if series.dtype.name == 'category':
                unique_values = series.cat.categories
            else:
                unique_values = series.unique()
            mappings.append({"Column": column, "Unique Values": list(unique_values)})
        mappings_df = pd.DataFrame(mappings)
        mappings_df.to_csv(csv_file, index=False)
        print("Column mappings saved successfully!")

    def process_data(self, save_path):
        """Run the full cleaning sequence and write both output files."""
        self.load_data()
        self.df = self.reduce_memory(self.df)
        self.handle_missing_values()
        self.handle_dates()
        self.drop_unnecessary_columns()
        self.save_cleaned_data(save_path)
        self.save_column_mappings(save_path)

# Script entry point: clean the raw crime CSV and write the outputs.
if __name__ == "__main__":
    # Raw input CSV and the directory that receives the generated files.
    file_path = r'D:\GUVI_Projects\My_Projects\Crimes_-_2001_to_Present.csv'
    save_path = r'D:\GUVI_Projects\My_Projects'

    # process_data() loads, cleans and saves in a single call.
    processor = DataProcessor(file_path)
    processor.process_data(save_path)
    print("\nData processing complete!")


In [None]:
import polars as pl
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import pickle
import os

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class DataProcessor:
    """Prepare the cleaned crime data and train two classifiers on it.

    The intended call order is ``load_data`` -> ``reduce_memory`` ->
    ``preprocess_data`` -> ``balance_data`` -> ``build_and_train_model``.
    """

    def __init__(self, data_file_path, mappings_file_path):
        """Store the input paths and initialise the empty pipeline state."""
        self.data_file_path = data_file_path
        # Stored only; no method of this class reads the mappings file.
        self.mappings_file_path = mappings_file_path
        self.df = None            # set by load_data()
        self.scaler = None        # StandardScaler fitted in preprocess_data()
        self.label_encoders = {}  # column name -> fitted LabelEncoder

    @staticmethod
    def reduce_memory(df):
        """Downcast int64 / float64 columns to the smallest safe dtype.

        Integers go to the narrowest signed type that holds every value.
        Floats are reduced to float32 at most, and only when pandas judges
        the values to survive the conversion; float16 is never used because a
        range check alone would silently destroy precision (e.g. coordinates).

        The frame is modified in place and also returned.
        """
        logging.info("Reducing memory usage...")
        start_mem = df.memory_usage().sum() / 1024 ** 2
        logging.info(f"Initial memory usage: {start_mem:.2f} MB")

        for col in df.columns:
            col_type = df[col].dtype
            if col_type == 'int64':
                df[col] = pd.to_numeric(df[col], downcast='integer')
            elif col_type == 'float64':
                df[col] = pd.to_numeric(df[col], downcast='float')

        end_mem = df.memory_usage().sum() / 1024 ** 2
        # Guard the percentage against a zero-sized starting frame.
        reduction = ((start_mem - end_mem) / start_mem) * 100 if start_mem else 0.0
        logging.info(f"Reduced memory usage: {end_mem:.2f} MB ({reduction:.1f}% reduction)")
        return df

    def load_data(self, sample_size=100000):
        """Read the CSV with Polars and keep at most ``sample_size`` rows.

        Args:
            sample_size: Maximum number of rows to keep. Larger inputs are
                sub-sampled, stratified on 'Arrest' when that column exists
                and uniformly at random otherwise. ``None`` disables sampling.

        Raises:
            Exception: any error from reading or sampling is logged and
                re-raised unchanged.
        """
        logging.info("Loading dataset using Polars...")
        try:
            polars_df = pl.read_csv(self.data_file_path)
            self.df = polars_df.to_pandas()
            logging.info(f"Dataset loaded successfully with shape: {self.df.shape}")

            # Sample large dataset for development with stratified sampling
            if sample_size is not None and self.df.shape[0] > sample_size:
                if 'Arrest' in self.df.columns:
                    # train_test_split returns (train, test); the train part is
                    # the one holding ``train_size`` rows, so keep the FIRST
                    # element.  Keeping the second one retained the complement
                    # (all rows except the sample).
                    self.df, _ = train_test_split(
                        self.df,
                        train_size=sample_size,
                        stratify=self.df['Arrest'],
                        random_state=42
                    )
                    logging.info(f"Dataset stratified sampled to shape: {self.df.shape}")
                else:
                    logging.warning("Stratified sampling skipped as 'Arrest' column is not present. Performing random sampling.")
                    self.df = self.df.sample(n=sample_size, random_state=42)
        except Exception as e:
            logging.error(f"Error loading dataset: {e}")
            raise

    def clean_unknown_values(self):
        """Fill gaps: 'Unknown' for object columns, 0.0 for everything else.

        Object columns end up as strings; all other columns end up as float.
        """
        logging.info("Cleaning unknown values in the dataset...")
        for col in self.df.columns:
            if self.df[col].dtype == 'object':
                self.df[col] = self.df[col].fillna('Unknown').astype(str)
            else:
                # One vectorised to_numeric call per column instead of one
                # Python-level call per cell.
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce').fillna(0).astype(float)
        logging.info("Unknown values cleaned and data types corrected.")

    def preprocess_data(self):
        """Clean the frame, label-encode text columns and standardise numbers.

        Fitted encoders are kept in ``self.label_encoders`` and the fitted
        scaler in ``self.scaler``.

        NOTE(review): every int64/float64 column is standardised, which
        includes the prediction target if it is part of the frame;
        ``balance_data`` converts it back to 0/1 afterwards.
        """
        logging.info("Preprocessing data...")
        self.clean_unknown_values()

        # Convert categorical columns to numeric using LabelEncoder
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns
        if len(categorical_cols) > 0:
            logging.info("Converting categorical columns using LabelEncoder...")
            for col in categorical_cols:
                le = LabelEncoder()
                self.df[col] = le.fit_transform(self.df[col])
                self.label_encoders[col] = le

        # Standardize numeric columns
        numeric_cols = self.df.select_dtypes(include=['int64', 'float64']).columns
        if len(numeric_cols) > 0:
            logging.info("Standardizing numeric columns...")
            self.scaler = StandardScaler()
            self.df[numeric_cols] = self.scaler.fit_transform(self.df[numeric_cols])

        logging.info("Data preprocessing complete.")

    def _binarize_target(self, target_col):
        """Convert ``self.df[target_col]`` to 0/1 integer labels in place."""
        target = self.df[target_col]
        if target.dtype == bool:
            logging.info(f"Encoding boolean target '{target_col}' to 0/1...")
            self.df[target_col] = target.astype(int)
        elif pd.api.types.is_float_dtype(target):
            logging.info(f"Binarizing continuous target '{target_col}'...")
            if target.nunique() == 2:
                # A two-valued float column is an encoded binary label: map
                # the larger value to 1.  A median threshold would give a
                # single class whenever the larger value is the majority.
                self.df[target_col] = (target == target.max()).astype(int)
            else:
                median_value = target.median()
                self.df[target_col] = (target > median_value).astype(int)
        elif target.nunique() > 2:
            logging.info(f"Converting multi-class target '{target_col}' to binary (simplified for demonstration)...")
            # 1 for the most frequent class, 0 for every other class.
            self.df[target_col] = (target == target.mode()[0]).astype(int)

    def balance_data(self, target_col):
        """Binarise the target and oversample the minority class with SMOTE.

        ``self.df`` is replaced by the resampled frame (features followed by
        the target column).

        Raises:
            ValueError: re-raised from SMOTE after being logged.
        """
        logging.info("Balancing data using SMOTE...")

        # Ensure target column is binary
        self._binarize_target(target_col)

        # Prepare features and target
        X = self.df.drop(columns=[target_col])
        y = self.df[target_col]

        try:
            smote = SMOTE(random_state=42)
            X_resampled, y_resampled = smote.fit_resample(X, y)
            self.df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns),
                                 pd.DataFrame(y_resampled, columns=[target_col])], axis=1)
            logging.info(f"Data balanced with new shape: {self.df.shape}")
        except ValueError as e:
            logging.error(f"SMOTE ValueError: {e}")
            raise

    def build_and_train_model(self, target_col, save_path):
        """Grid-search a logistic regression and a random forest, then save them.

        An 80/20 train/test split is taken from ``self.df``; both models are
        tuned with 5-fold cross-validation on accuracy, evaluated on the
        held-out part and pickled into ``save_path`` together with the scaler
        and label encoders.

        NOTE(review): when ``balance_data`` has been run first, the split is
        taken from the already resampled frame, so the held-out part can
        contain SMOTE-generated rows - confirm this is intended.
        """
        logging.info("Building and training models...")
        X = self.df.drop(columns=[target_col])
        y = self.df[target_col]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Logistic Regression
        logistic_model = LogisticRegression(penalty='l2', solver='liblinear', random_state=42)
        param_grid_logistic = {'C': [0.01, 0.1, 1, 10, 100]}
        grid_logistic = GridSearchCV(logistic_model, param_grid_logistic, cv=5, scoring='accuracy')
        grid_logistic.fit(X_train, y_train)
        best_logistic = grid_logistic.best_estimator_
        logging.info(f"Best Logistic Regression Parameters: {grid_logistic.best_params_}")

        y_pred_logistic = best_logistic.predict(X_test)
        logging.info("Logistic Regression Results:")
        logging.info(confusion_matrix(y_test, y_pred_logistic))
        logging.info(classification_report(y_test, y_pred_logistic))

        # Random Forest
        rf_model = RandomForestClassifier(random_state=42)
        param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}
        grid_rf = GridSearchCV(rf_model, param_grid_rf, cv=5, scoring='accuracy')
        grid_rf.fit(X_train, y_train)
        best_rf = grid_rf.best_estimator_
        logging.info(f"Best Random Forest Parameters: {grid_rf.best_params_}")

        y_pred_rf = best_rf.predict(X_test)
        logging.info("Random Forest Results:")
        logging.info(confusion_matrix(y_test, y_pred_rf))
        logging.info(classification_report(y_test, y_pred_rf))

        # Save models and preprocessing objects
        os.makedirs(save_path, exist_ok=True)
        artefacts = {
            'logistic_model.pkl': best_logistic,
            'random_forest_model.pkl': best_rf,
            'scaler.pkl': self.scaler,
            'label_encoders.pkl': self.label_encoders,
        }
        for filename, obj in artefacts.items():
            with open(os.path.join(save_path, filename), 'wb') as f:
                pickle.dump(obj, f)

        logging.info("Models and preprocessing objects saved.")

# Script entry point: run the full preparation and training pipeline.
if __name__ == "__main__":
    # Cleaned data CSV, the mappings CSV, and the directory for the pickled outputs.
    data_file_path = r'D:\GUVI_Projects\My_Projects\cleaned_crime.csv'
    mappings_file_path = r'D:\GUVI_Projects\My_Projects\cleaned_mappings_crime.csv'
    save_path = r'D:\GUVI_Projects\My_Projects'

    processor = DataProcessor(data_file_path, mappings_file_path)
    # load_data() reads the CSV and sub-samples it when it has more than 100000 rows.
    processor.load_data()
    processor.df = DataProcessor.reduce_memory(processor.df)  # Optimize memory
    processor.preprocess_data()
    # NOTE(review): balance_data() resamples the whole frame before
    # build_and_train_model() takes its train/test split, so the held-out
    # split can contain SMOTE-generated rows - confirm this is intended.
    processor.balance_data(target_col='Arrest')
    processor.build_and_train_model(target_col='Arrest', save_path=save_path)
    logging.info("Pipeline completed successfully.")
    
# 2025-01-02 18:51:51,584 - INFO - Loading dataset using Polars...
# 2025-01-02 18:51:53,434 - INFO - Dataset loaded successfully with shape: (1048575, 20)
# 2025-01-02 18:51:54,653 - INFO - Dataset stratified sampled to shape: (948575, 20)
# 2025-01-02 18:51:54,681 - INFO - Reducing memory usage...
# 2025-01-02 18:51:54,687 - INFO - Initial memory usage: 139.31 MB
# 2025-01-02 18:51:54,738 - INFO - Reduced memory usage: 94.99 MB (31.8% reduction)
# 2025-01-02 18:51:54,738 - INFO - Preprocessing data...
# 2025-01-02 18:51:54,738 - INFO - Cleaning unknown values in the dataset...
# 2025-01-02 18:52:14,181 - INFO - Unknown values cleaned and data types corrected.
# 2025-01-02 18:52:14,667 - INFO - Converting categorical columns using LabelEncoder...
# 2025-01-02 18:52:22,601 - INFO - Standardizing numeric columns...
# 2025-01-02 18:52:22,924 - INFO - Data preprocessing complete.
# 2025-01-02 18:52:22,926 - INFO - Balancing data using SMOTE...
# 2025-01-02 18:52:22,928 - INFO - Binarizing continuous target 'Arrest'...
# 2025-01-02 18:53:57,558 - INFO - Data balanced with new shape: (1397218, 20)
# 2025-01-02 18:53:57,575 - INFO - Building and training models...
# 2025-01-02 19:02:19,449 - INFO - Best Logistic Regression Parameters: {'C': 0.1}
# 2025-01-02 19:02:19,521 - INFO - Logistic Regression Results:
# 2025-01-02 19:02:19,569 - INFO - [[88307 51338]
#  [50310 89489]]
# 2025-01-02 19:02:19,942 - INFO -               precision    recall  f1-score   support

#            0       0.64      0.63      0.63    139645
#            1       0.64      0.64      0.64    139799

#     accuracy                           0.64    279444
#    macro avg       0.64      0.64      0.64    279444
# weighted avg       0.64      0.64      0.64    279444

# 2025-01-03 13:49:18,800 - INFO - Best Random Forest Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
# 2025-01-03 13:49:31,620 - INFO - Random Forest Results:
# 2025-01-03 13:49:31,646 - INFO - [[135074   4571]
#  [ 18064 121735]]
# 2025-01-03 13:49:31,961 - INFO -               precision    recall  f1-score   support

#            0       0.88      0.97      0.92    139645
#            1       0.96      0.87      0.91    139799

#     accuracy                           0.92    279444
#    macro avg       0.92      0.92      0.92    279444
# weighted avg       0.92      0.92      0.92    279444

# 2025-01-03 13:49:35,835 - INFO - Models and preprocessing objects saved.
# 2025-01-03 13:49:35,983 - INFO - Pipeline completed successfully.


In [None]:
import polars as pl
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import pickle
import os
from pprint import pprint

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class DataProcessor:
    """Prepare the cleaned crime data and train a neural-network classifier.

    The intended call order is ``load_data`` -> ``reduce_memory`` ->
    ``preprocess_data`` -> ``balance_data`` -> ``build_and_train_model``.
    """

    def __init__(self, data_file_path, mappings_file_path):
        """Store the input paths and initialise the empty pipeline state."""
        self.data_file_path = data_file_path
        # Stored only; no method of this class reads the mappings file.
        self.mappings_file_path = mappings_file_path
        self.df = None            # set by load_data()
        self.scaler = None        # StandardScaler fitted in preprocess_data()
        self.label_encoders = {}  # column name -> fitted LabelEncoder

    def reduce_memory(self, df):
        """Downcast int64 / float64 columns to the smallest safe dtype.

        Integers go to the narrowest signed type that holds every value.
        Floats are reduced to float32 at most, and only when pandas judges
        the values to survive the conversion; float16 is never used because a
        range check alone would silently destroy precision (e.g. coordinates).

        The frame is modified in place and also returned. No instance state
        is read.
        """
        logging.info("Reducing memory usage...")
        start_mem = df.memory_usage().sum() / 1024 ** 2
        logging.info(f"Initial memory usage: {start_mem:.2f} MB")

        for col in df.columns:
            col_type = df[col].dtype
            if col_type == 'int64':
                df[col] = pd.to_numeric(df[col], downcast='integer')
            elif col_type == 'float64':
                df[col] = pd.to_numeric(df[col], downcast='float')

        end_mem = df.memory_usage().sum() / 1024 ** 2
        # Guard the percentage against a zero-sized starting frame.
        reduction = ((start_mem - end_mem) / start_mem) * 100 if start_mem else 0.0
        logging.info(f"Reduced memory usage: {end_mem:.2f} MB ({reduction:.1f}% reduction)")
        return df

    def load_data(self, sample_size=100000):
        """Read the CSV with Polars and keep at most ``sample_size`` rows.

        Args:
            sample_size: Maximum number of rows to keep. Larger inputs are
                sub-sampled, stratified on 'Arrest' when that column exists
                and uniformly at random otherwise. ``None`` disables sampling.

        Raises:
            Exception: any error from reading or sampling is logged and
                re-raised unchanged.
        """
        logging.info("Loading dataset using Polars...")
        try:
            polars_df = pl.read_csv(self.data_file_path)
            self.df = polars_df.to_pandas()
            logging.info(f"Dataset loaded successfully with shape: {self.df.shape}")

            # Sample large dataset for development with stratified sampling
            if sample_size is not None and self.df.shape[0] > sample_size:
                if 'Arrest' in self.df.columns:
                    # train_test_split returns (train, test); the train part is
                    # the one holding ``train_size`` rows, so keep the FIRST
                    # element.  Keeping the second one retained the complement
                    # (all rows except the sample).
                    self.df, _ = train_test_split(
                        self.df,
                        train_size=sample_size,
                        stratify=self.df['Arrest'],
                        random_state=42
                    )
                    logging.info(f"Dataset stratified sampled to shape: {self.df.shape}")
                else:
                    logging.warning("Stratified sampling skipped as 'Arrest' column is not present. Performing random sampling.")
                    self.df = self.df.sample(n=sample_size, random_state=42)
        except Exception as e:
            logging.error(f"Error loading dataset: {e}")
            raise

    def clean_unknown_values(self):
        """Fill gaps: 'Unknown' for object columns, 0.0 for everything else.

        Object columns end up as strings; all other columns end up as float.
        """
        logging.info("Cleaning unknown values in the dataset...")
        for col in self.df.columns:
            if self.df[col].dtype == 'object':
                self.df[col] = self.df[col].fillna('Unknown').astype(str)
            else:
                # One vectorised to_numeric call per column instead of one
                # Python-level call per cell.
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce').fillna(0).astype(float)
        logging.info("Unknown values cleaned and data types corrected.")

    def preprocess_data(self):
        """Clean the frame, label-encode text columns and standardise numbers.

        Fitted encoders are kept in ``self.label_encoders`` and the fitted
        scaler in ``self.scaler``.

        NOTE(review): every int64/float64 column is standardised, which
        includes the prediction target if it is part of the frame;
        ``balance_data`` converts it back to integer labels afterwards.
        """
        logging.info("Preprocessing data...")
        self.clean_unknown_values()

        # Convert categorical columns to numeric using LabelEncoder
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns
        if len(categorical_cols) > 0:
            logging.info("Converting categorical columns using LabelEncoder...")
            for col in categorical_cols:
                le = LabelEncoder()
                self.df[col] = le.fit_transform(self.df[col])
                self.label_encoders[col] = le

        # Standardize numeric columns
        numeric_cols = self.df.select_dtypes(include=['int64', 'float64']).columns
        if len(numeric_cols) > 0:
            logging.info("Standardizing numeric columns...")
            self.scaler = StandardScaler()
            self.df[numeric_cols] = self.scaler.fit_transform(self.df[numeric_cols])

        logging.info("Data preprocessing complete.")

    def _encode_binary_target(self, target_col):
        """Convert ``self.df[target_col]`` to integer class labels in place.

        Integer and categorical targets are left untouched. A numeric or
        boolean target with exactly two distinct values and no gaps is mapped
        to 0 (smaller value) and 1 (larger value); anything else falls back
        to a plain integer cast.
        """
        target = self.df[target_col]
        already_discrete = (
            pd.api.types.is_integer_dtype(target)
            or isinstance(target.dtype, pd.CategoricalDtype)
        )
        if already_discrete:
            return

        two_valued = (
            pd.api.types.is_numeric_dtype(target)
            and target.notna().all()
            and target.nunique() == 2
        )
        if two_valued:
            # Truncating standardised floats with astype(int) yields arbitrary
            # labels (e.g. 0 and 3); rank-based mapping always gives 0/1.
            self.df[target_col] = (target == target.max()).astype(int)
        else:
            self.df[target_col] = target.astype(int)

    def balance_data(self, target_col):
        """Encode the target as binary labels and oversample with SMOTE.

        ``self.df`` is replaced by the resampled frame (features followed by
        the target column).

        Raises:
            ValueError: if the target has more than two classes, or from
                SMOTE itself; logged and re-raised.
            Exception: any other failure is logged and re-raised.
        """
        logging.info("Balancing data using SMOTE...")
        try:
            # Ensure the target column is binary
            self._encode_binary_target(target_col)

            if self.df[target_col].nunique() > 2:
                raise ValueError(f"Target column '{target_col}' must be binary for SMOTE.")

            X = self.df.drop(columns=[target_col])
            y = self.df[target_col]

            smote = SMOTE(random_state=42)
            X_resampled, y_resampled = smote.fit_resample(X, y)

            self.df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns),
                                 pd.DataFrame(y_resampled, columns=[target_col])], axis=1)
            logging.info(f"Data balanced with new shape: {self.df.shape}")
        except ValueError as e:
            logging.error(f"SMOTE ValueError: {e}")
            raise
        except Exception as e:
            logging.error(f"Unexpected error during SMOTE: {e}")
            raise

    def build_and_train_model(self, target_col, save_path):
        """Grid-search an MLP classifier, evaluate it and pickle the results.

        An 80/20 train/test split is taken from ``self.df``; the network is
        tuned with 5-fold cross-validation on accuracy, evaluated on the
        held-out part and pickled into ``save_path`` together with the scaler
        and label encoders.

        NOTE(review): when ``balance_data`` has been run first, the split is
        taken from the already resampled frame, so the held-out part can
        contain SMOTE-generated rows - confirm this is intended.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        logging.info("Building and training models...")
        try:
            X = self.df.drop(columns=[target_col])
            y = self.df[target_col]

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Neural Network Model with GridSearch for best parameters
            param_grid = {
                'hidden_layer_sizes': [(100,), (100, 50), (100, 100, 50)],
                'max_iter': [200, 300, 400],
                'activation': ['relu', 'tanh'],
                'solver': ['adam']
            }
            nn_model = MLPClassifier(random_state=42)
            grid_search = GridSearchCV(estimator=nn_model, param_grid=param_grid, cv=5, scoring='accuracy')
            grid_search.fit(X_train, y_train)

            best_nn = grid_search.best_estimator_
            logging.info("Best Neural Network Parameters:")
            pprint(grid_search.best_params_)

            y_pred_nn = best_nn.predict(X_test)
            logging.info("Neural Network Results:")
            logging.info(confusion_matrix(y_test, y_pred_nn))
            logging.info(classification_report(y_test, y_pred_nn))

            # Save Neural Network model and preprocessing objects
            os.makedirs(save_path, exist_ok=True)
            artefacts = {
                'nn_model.pkl': best_nn,
                'scaler.pkl': self.scaler,
                'label_encoders.pkl': self.label_encoders,
            }
            for filename, obj in artefacts.items():
                with open(os.path.join(save_path, filename), 'wb') as f:
                    pickle.dump(obj, f)

            logging.info("Model and preprocessing objects saved.")
        except Exception as e:
            logging.error(f"Error during model training: {e}")
            raise

# Script entry point: run the full preparation and neural-network training pipeline.
if __name__ == "__main__":
    # Cleaned data CSV, the mappings CSV, and the directory for the pickled outputs.
    data_file_path = r'D:\GUVI_Projects\My_Projects\cleaned_crime.csv'
    mappings_file_path = r'D:\GUVI_Projects\My_Projects\cleaned_mappings_crime.csv'
    save_path = r'D:\GUVI_Projects\My_Projects'

    processor = DataProcessor(data_file_path, mappings_file_path)
    # load_data() reads the CSV and sub-samples it when it has more than 100000 rows.
    processor.load_data()
    processor.df = processor.reduce_memory(processor.df)  # Use instance method
    processor.preprocess_data()
    # NOTE(review): balance_data() resamples the whole frame before
    # build_and_train_model() takes its train/test split, so the held-out
    # split can contain SMOTE-generated rows - confirm this is intended.
    processor.balance_data(target_col='Arrest')
    processor.build_and_train_model(target_col='Arrest', save_path=save_path)
    logging.info("Pipeline completed successfully.")
    
# 2025-01-09 15:36:29,168 - INFO - Loading dataset using Polars...
# 2025-01-09 15:36:31,015 - INFO - Dataset loaded successfully with shape: (1048575, 20)
# 2025-01-09 15:36:32,037 - INFO - Dataset stratified sampled to shape: (948575, 20)
# 2025-01-09 15:36:32,055 - INFO - Reducing memory usage...
# 2025-01-09 15:36:32,061 - INFO - Initial memory usage: 139.31 MB
# 2025-01-09 15:36:32,119 - INFO - Reduced memory usage: 94.99 MB (31.8% reduction)
# 2025-01-09 15:36:32,120 - INFO - Preprocessing data...
# 2025-01-09 15:36:32,120 - INFO - Cleaning unknown values in the dataset...
# 2025-01-09 15:36:52,393 - INFO - Unknown values cleaned and data types corrected.
# 2025-01-09 15:36:52,959 - INFO - Converting categorical columns using LabelEncoder...
# 2025-01-09 15:37:00,619 - INFO - Standardizing numeric columns...
# 2025-01-09 15:37:00,890 - INFO - Data preprocessing complete.
# 2025-01-09 15:37:00,891 - INFO - Balancing data using SMOTE...
# 2025-01-09 15:38:21,374 - INFO - Data balanced with new shape: (1397218, 20)
# 2025-01-09 15:38:21,399 - INFO - Building and training models...
# 2025-01-09 21:44:13,616 - INFO - Best Neural Network Parameters:
# {'activation': 'tanh',
#  'hidden_layer_sizes': (100, 100, 50),
#  'max_iter': 200,
#  'solver': 'adam'}
# 2025-01-09 21:44:14,850 - INFO - Neural Network Results:
# 2025-01-09 21:44:14,906 - INFO - [[79345 60300]
#  [66876 72923]]
# 2025-01-09 21:44:15,254 - INFO -               precision    recall  f1-score   support

#            0       0.54      0.57      0.56    139645
#            1       0.55      0.52      0.53    139799

#     accuracy                           0.54    279444
#    macro avg       0.55      0.54      0.54    279444
# weighted avg       0.55      0.54      0.54    279444

# 2025-01-09 21:44:16,265 - INFO - Model and preprocessing objects saved.
# 2025-01-09 21:44:16,289 - INFO - Pipeline completed successfully.
