In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib
from datetime import datetime

def load_data(path):
    """Load insurance claims data from Excel file."""
    return pd.read_excel(path)

def preprocess_data(df):
    """Clean and preprocess the data for modeling."""
    df_processed = df.copy()
    df_processed = df_processed.dropna()
    
    datetime_cols = df_processed.select_dtypes(include=['datetime64']).columns
    for col in datetime_cols:
        df_processed.loc[:, col] = (df_processed[col] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1d')
    
    df_processed = pd.get_dummies(df_processed, drop_first=True)
    return df_processed

def split_and_balance(df, target_col='fraud_reported_Y'):
    """Split the dataset into train and test and balance using SMOTE."""
    X = df.drop(target_col, axis=1).copy()
    y = df[target_col].copy()
    X = X.apply(pd.to_numeric, errors='ignore')
    
    sm = SMOTE(random_state=42)
    X_resampled, y_resampled = sm.fit_resample(X, y)
    return train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

def train_model(X_train, y_train):
    """Train Random Forest classifier."""
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test, report_path='performance_report.txt'):
    """Evaluate model performance and save to file."""
    y_pred = model.predict(X_test)
    
    # Generate performance metrics
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    # Print to console
    print("Confusion Matrix:\n", conf_matrix)
    print("\nClassification Report:\n", class_report)
    
    # Save to file
    with open(report_path, 'w') as f:
        f.write(f"Model Performance Report - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write("="*50 + "\n\n")
        f.write("Confusion Matrix:\n")
        f.write(str(conf_matrix) + "\n\n")
        f.write("Classification Report:\n")
        f.write(class_report + "\n\n")
        f.write(f"Model trained on {X_test.shape[0]} samples with {X_test.shape[1]} features.\n")
    
    print(f"\nPerformance report saved to {report_path}")

def save_model(model, path='rf_model.pkl'):
    """Save trained model to disk."""
    joblib.dump(model, path)
    print(f"\nModel saved to {path}")

if __name__ == '__main__':
    file_path = "C:/Users/navee/Downloads/archive/insurance_claims.xlsx"
    report_path = "C:/Users/navee/Downloads/archive/performance_report.txt"
    
    df = load_data(file_path)
    df = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_and_balance(df, 'fraud_reported_Y')
    
    model = train_model(X_train, y_train)
    evaluate_model(model, X_test, y_test, report_path)
    save_model(model, 'C:/Users/navee/Downloads/archive/rf_model.pkl')

Confusion Matrix:
 [[197   7]
 [ 35 162]]

Classification Report:
               precision    recall  f1-score   support

       False       0.85      0.97      0.90       204
        True       0.96      0.82      0.89       197

    accuracy                           0.90       401
   macro avg       0.90      0.89      0.89       401
weighted avg       0.90      0.90      0.89       401


Performance report saved to C:/Users/navee/Downloads/archive/performance_report.txt

Model saved to C:/Users/navee/Downloads/archive/rf_model.pkl
