Data Repository SFCR Template Information 2023

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Load the dataset (long format: one record per institution / reporting date / template row).
# Adjust the path below to point at your copy of the file.
data = pd.read_csv('Data Repository SFCR Template Information 2023.csv')

# Step 1: Data Preprocessing
# Reshape to a wide table: one row per institution/date/type combination and one
# column per `row_full_name`. Duplicate cells are resolved with aggfunc='first'.
institution_keys = ['InstitutionName', 'ReportingDate', 'InstitutionType']
wide_table = data.pivot_table(
    values='Euro_value_or_percentage_value',
    index=institution_keys,
    columns='row_full_name',
    aggfunc='first',
)

# The ratio features in Step 2 are built from both balance-sheet totals, so rows
# missing either one are discarded here.
balance_totals = ['Assets    Total assets', 'Liabilities    Total liabilities']
pivot_data = wide_table.reset_index().dropna(subset=balance_totals)

# Step 2: Feature Engineering
# Solvency proxy: total assets per unit of total liabilities.
pivot_data['Asset_Liability_Ratio'] = pivot_data['Assets    Total assets'] / pivot_data['Liabilities    Total liabilities']
# Binary label: 1 when the ratio is below 1.2, otherwise 0. An infinite or NaN
# ratio compares False against 1.2 and is therefore labelled 0.
pivot_data['Target'] = np.where(pivot_data['Asset_Liability_Ratio'] < 1.2, 1, 0)
pivot_data['Cash_to_Assets'] = pivot_data['Assets    Cash and cash equivalents'] / pivot_data['Assets    Total assets']
# `.get` falls back to 0 when the column is absent, which makes this feature 0
# wherever total assets are non-zero.
# NOTE(review): if 'Excess of assets over liabilities' equals total assets minus
# total liabilities, this feature is 1 - 1 / Asset_Liability_Ratio and fully
# determines Target - confirm whether it should be a model input.
pivot_data['Excess_to_Assets'] = pivot_data.get('Excess of assets over liabilities', 0) / pivot_data['Assets    Total assets']

# Division by a zero total yields +/-inf, which SimpleImputer in Step 3 rejects
# with a ValueError. Mark those cells as missing so they are imputed instead.
ratio_features = ['Asset_Liability_Ratio', 'Cash_to_Assets', 'Excess_to_Assets']
pivot_data[ratio_features] = pivot_data[ratio_features].replace([np.inf, -np.inf], np.nan)

print("Target Distribution:\n", pivot_data['Target'].value_counts())
# A single-class target cannot be used to train or score a classifier.
if pivot_data['Target'].nunique() < 2:
    raise ValueError("Target variable has only one class.")

# Step 3: Handle Missing Values and Encoding
numeric_frame = pivot_data.select_dtypes(include=[np.number])
# SimpleImputer discards columns that have no observed value at fit time, so its
# output would be narrower than the column list and the assignment below would
# fail. Such columns (possible after the row filter in Step 1) are skipped.
has_observed_values = numeric_frame.notna().any().to_numpy()
numerical_cols = numeric_frame.columns[has_observed_values]
# Target is the label built by np.where and is never missing; leaving it out
# keeps it an integer column instead of being cast to float by the imputer.
numerical_cols = numerical_cols.drop('Target', errors='ignore')

# Fill remaining gaps in the numeric columns with the per-column median.
imputer = SimpleImputer(strategy='median')
pivot_data[numerical_cols] = imputer.fit_transform(pivot_data[numerical_cols])

# Map each institution type to an integer code; `le` is kept so the same
# mapping can be applied to new records later.
le = LabelEncoder()
pivot_data['InstitutionType_Encoded'] = le.fit_transform(pivot_data['InstitutionType'])

# Step 4: Feature Selection
# Asset_Liability_Ratio is left out because Target is derived directly from it.
# NOTE(review): the two totals that ratio is computed from are still in the list,
# so the label remains recoverable from the inputs - confirm this is intended.
feature_cols = [
    'Assets    Total assets',
    'Liabilities    Total liabilities',
    'Assets    Cash and cash equivalents',
    'Assets    Investments (other than assets held for index-linked and unit-linked contracts)',
    'Liabilities    Technical provisions - non-life',
    'Cash_to_Assets',
    'Excess_to_Assets',
    'InstitutionType_Encoded',
]
X, y = pivot_data[feature_cols], pivot_data['Target']

# Standardise every feature; the fitted scaler is kept for transforming new records.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Step 5: Model Setup
# Candidate classifiers keyed by display name. Every model uses the same seed;
# those that accept it also use balanced class weights.
model_seed = 42
balanced_kwargs = {'random_state': model_seed, 'class_weight': 'balanced'}
models = {
    'Logistic Regression': LogisticRegression(**balanced_kwargs),
    'Random Forest': RandomForestClassifier(n_estimators=100, **balanced_kwargs),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=model_seed),
    'SVM': SVC(kernel='rbf', **balanced_kwargs),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=model_seed),
}

# Step 6: Cross-Validation and Evaluation
# Display name of each metric -> scikit-learn scorer name.
cv_metrics = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1',
}

# NOTE(review): X_scaled was standardised on the full dataset in Step 4, so each
# fold's held-out rows influenced the scaling. Wrapping the scaler and model in a
# Pipeline would give cleaner estimates.
results = {}
for name, model in models.items():
    # One 5-fold run scores all four metrics on the same fitted folds, rather
    # than refitting the model separately for every metric.
    cv_scores = cross_validate(model, X_scaled, y, cv=5, scoring=list(cv_metrics.values()))
    results[name] = {
        label: cv_scores[f'test_{scorer}'].mean()
        for label, scorer in cv_metrics.items()
    }

# Step 7: Compare Results
# One row per model, one column per metric (mean over the five folds).
results_df = pd.DataFrame(results).T
print("\nModel Comparison (Cross-Validation):\n", results_df)

# Hold out a stratified 20% of the rows; the split feeds the per-model reports
# here and the feature-importance fit in Step 8.
holdout_split = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = holdout_split

# Fit each candidate on the training part and report per-class metrics on the rest.
# Note that this fits the estimators stored in `models` in place.
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    report = classification_report(y_test, y_pred, zero_division=0)
    print(f"\nClassification Report for {name}:\n", report)

# Step 8: Feature Importance (Random Forest)
# A separate forest, configured like the 'Random Forest' entry in `models`,
# is fitted on the training split only.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
).fit(X_train, y_train)

# Pair each feature name with its importance and rank from most to least important.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)
print("\nRandom Forest Feature Importance:\n", feature_importance)

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(10, 6))
importance_axes = sns.barplot(data=feature_importance, x='Importance', y='Feature')
importance_axes.set_title('Feature Importance from Random Forest')
plt.show()

# Step 9: Prediction on New Data
# Raw figures for one example institution; the two ratio features are derived
# from them the same way as in Step 2.
example_total_assets = 50000000
example_total_liabilities = 45000000
example_cash = 5000000
example_excess = 5000000

example_record = {
    'Assets    Total assets': example_total_assets,
    'Liabilities    Total liabilities': example_total_liabilities,
    'Assets    Cash and cash equivalents': example_cash,
    'Assets    Investments (other than assets held for index-linked and unit-linked contracts)': 30000000,
    'Liabilities    Technical provisions - non-life': 35000000,
    'Cash_to_Assets': example_cash / example_total_assets,
    'Excess_to_Assets': example_excess / example_total_assets,
    # Reuse the encoder fitted in Step 3; it raises ValueError for a type it has not seen.
    'InstitutionType_Encoded': le.transform(['Insurance Company Non-Life'])[0],
}
new_data = pd.DataFrame({column: [value] for column, value in example_record.items()})

# Put the columns in training order and apply the scaler fitted in Step 4.
new_data_scaled = scaler.transform(new_data[X.columns])

print("\nPredictions for New Data (0 = Stable, 1 = At Risk):")
for name, model in models.items():
    # Refit on every available row before predicting the single new record.
    pred = model.fit(X_scaled, y).predict(new_data_scaled)
    print(f"{name}: {pred[0]}")