# Simple Data Simulation Analysis

This notebook demonstrates the basic data simulation functionality with:
- Loss-rate investigation across a range of beta values
- ROC curve analysis
- Deciles analysis of outcomes by total score
- Total score distribution by outcome

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc

# Import data simulation
from data_simulation import Scorecard

## 0. Investigate loss rates

In [None]:
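# Measure what share of positive outcomes is still found (and, by complement, how many
# alerts would be missed) when total_score is used to keep only the top-scoring
# fraction of alerts, e.g. 50%.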

def loss_rate(df_analysis, reduction_percent=0.5):
    """Return the share of positive outcomes captured by the top-scoring rows.

    Rows are ranked by ``total_score`` (highest first) and the first
    ``ceil(len(df_analysis) * reduction_percent)`` rows are kept. Despite the
    function's name, the returned value is the fraction of positives *found*
    in the kept rows; the fraction missed is ``1 - loss_rate(...)``.

    Parameters
    ----------
    df_analysis : pandas.DataFrame
        Must contain the columns ``total_score`` and ``binary_outcome``.
        ``binary_outcome`` is summed, so it is presumably a 0/1 indicator.
    reduction_percent : float, default 0.5
        Fraction of rows (by descending score) to keep.

    Returns
    -------
    float
        ``found / total``, or NaN when ``df_analysis`` contains no positive
        outcomes (the ratio is undefined in that case).
    """
    total = int(df_analysis["binary_outcome"].sum())
    if total == 0:
        # No positives at all: report "undefined" instead of raising
        # ZeroDivisionError so that aggregations over many runs keep working.
        return float("nan")

    rows = int(np.ceil(len(df_analysis) * reduction_percent))
    top_rows = df_analysis.sort_values("total_score", ascending=False).iloc[:rows]
    found = int(top_rows["binary_outcome"].sum())
    return found / total
    
def loss_by_beta(beta, reduction_percent=0.5, scorecard=None):
    """Simulate outcomes for a given ``beta`` and return the resulting loss rate.

    Parameters
    ----------
    beta : float
        Passed through to the simulation (``Scorecard(beta=...)`` or
        ``scorecard.generate_binary_outcome(beta=...)``).
    reduction_percent : float, default 0.5
        Forwarded to ``loss_rate``: fraction of top-scoring rows to keep.
    scorecard : Scorecard, optional
        When supplied, its existing ``total_scores`` are reused and only the
        binary outcome is regenerated for ``beta``. When omitted, a fresh
        scorecard (5000 rows, 8 features, prevalence 0.10, unseeded) is
        simulated on every call.

    Returns
    -------
    float
        The value of ``loss_rate`` for the simulated data.
    """
    # Test for presence explicitly: relying on truthiness would silently
    # ignore a supplied scorecard whose class defines __bool__/__len__ as falsy.
    if scorecard is not None:
        outcomes = scorecard.generate_binary_outcome(beta=beta)
        scores = scorecard.total_scores['total_score']
    else:
        fresh_scorecard = Scorecard(
            n_rows=5000,
            n_features=8,
            binary_prevalence=0.10,
            random_state=None,
            beta=beta
        )
        scores = fresh_scorecard.total_scores['total_score']
        outcomes = fresh_scorecard.binary_outcome

    df_analysis = pd.DataFrame({
        'total_score': scores,
        'binary_outcome': outcomes
    })
    return loss_rate(df_analysis, reduction_percent)

# Calculate and display the loss rate for a range of beta values.
# Every beta is simulated 200 times with a freshly generated scorecard
# (scorecard=None), and the mean / standard deviation of the runs is recorded.
# One record per beta is collected and the frame is built once, rather than
# growing it with pd.concat inside the loop.
_beta_records = []
for b in [0, 0.5, 1, 2, 3, 4]:
    s = pd.Series([loss_by_beta(beta=b, scorecard=None) for _ in range(200)])
    _beta_records.append({"beta": b, "mean": s.mean(), "std": s.std()})

loss_rate_by_beta_df = pd.DataFrame(_beta_records, columns=["beta", "mean", "std"])

loss_rate_by_beta_df.style.format({"beta":"{:3.1f}","mean":"{:.1%}"})

### Results

By varying beta from 0 to 4 we can move from a model that is no better than random (beta = 0) to one that is highly predictive, with more than 99.5% of outcomes found in the top-scoring half of the data (beta = 4).

| Beta 	| SARs found in first 50% of file	| 
|--- | --- |
|	0	| 50.1% | 
|	0.5	| 68.1% | 
|	1.0	| 82.1% | 
|	2.0	| 95.1% | 
|	3.0	| 98.8% | 
|	4.0	| 99.7% | 

Reusing a single scorecard for all simulations (by passing `scorecard=` to `loss_by_beta`) is quicker to calculate, but the resulting table is much less stable.

## 1. Generate Simulated Data

In [None]:
# Simulate the scorecard that the ROC, deciles and distribution sections analyse.
# random_state=None leaves the simulation unseeded, so figures vary between runs.
scorecard_params = dict(
    n_rows=5000,
    n_features=3,
    binary_prevalence=0.20,
    random_state=None,
    beta=1,
)
scorecard = Scorecard(**scorecard_params)

# Quick sanity summary: sample size, realised prevalence and score range.
for summary_line in (
    f"Generated {len(scorecard.total_scores)} samples",
    f"Binary outcome prevalence: {scorecard.binary_outcome.mean():.3f}",
    f"Total score range: {scorecard.total_scores['total_score'].min():.2f} - {scorecard.total_scores['total_score'].max():.2f}",
):
    print(summary_line)

## 2. ROC Curve Analysis

In [None]:
# ROC analysis: how well does the total score rank the binary outcome?
y_true = scorecard.binary_outcome
y_scores = scorecard.total_scores['total_score']

fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)

# Draw the curve against the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # small headroom so the curve is not clipped at TPR = 1
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve: Binary Outcome vs Total Score')
ax.legend(loc="lower right")
ax.grid(True, alpha=0.3)
plt.show()

print(f"AUC: {roc_auc:.3f}")

## 3. Deciles Analysis

In [None]:
# Deciles analysis: bucket rows by total score and compare outcome rates per bucket.
df_analysis = pd.DataFrame({
    'total_score': scorecard.total_scores['total_score'],
    'binary_outcome': scorecard.binary_outcome
})

# qcut yields 0-based bin indices; duplicates='drop' merges bins that share an
# edge, so fewer than ten groups can result when scores have few distinct values.
score_bin_index = pd.qcut(
    df_analysis['total_score'],
    q=10,
    labels=False,
    duplicates='drop',
)
# String labels D1 (lowest scores) upwards.
df_analysis['decile'] = score_bin_index.apply(lambda x: f'D{x+1}')

# Per-decile summary, ordered with the highest-scoring decile first so the
# cumulative columns below accumulate from the top of the score range.
decile_stats = (
    df_analysis
    .groupby('decile')
    .agg(
        count=('total_score', 'count'),
        avg_score=('total_score', 'mean'),
        min_score=('total_score', 'min'),
        max_score=('total_score', 'max'),
        positive_outcomes=('binary_outcome', 'sum'),
        outcome_rate=('binary_outcome', 'mean'),
    )
    .round(4)
    .sort_values('avg_score', ascending=False)
)

# Running totals, plus the same figures as a rate and as percentages of the file.
decile_stats = decile_stats.assign(
    cumulative_total=lambda d: d['count'].cumsum(),
    cumulative_outcomes=lambda d: d['positive_outcomes'].cumsum(),
    cumulative_rate=lambda d: (d['cumulative_outcomes'] / d['cumulative_total']).round(4),
    cumulative_total_percent=lambda d: (d['cumulative_total'] / len(df_analysis)*100).round(4),
    cumulative_outcomes_percent=lambda d: (d['cumulative_outcomes'] / df_analysis["binary_outcome"].sum()*100).round(4),
)

print("Deciles Analysis:")
display(decile_stats.drop(columns=["min_score", "max_score"]))

# Bar chart of the outcome rate per decile, with the overall rate as a reference line.
fig, ax = plt.subplots(figsize=(6, 4))
decile_stats['outcome_rate'].plot.bar(ax=ax, color='steelblue', alpha=0.7, rot=0)
ax.set_title('Binary Outcome Rate by Total Score Deciles')
ax.set_xlabel('Decile')
ax.set_ylabel('Outcome Rate')
ax.grid(True, alpha=0.3)

overall_rate = df_analysis['binary_outcome'].mean()
ax.axhline(y=overall_rate, color='red', linestyle='--',
           label=f'Overall Rate: {overall_rate:.3f}')
ax.legend()
plt.tight_layout()
plt.show()

## 4. Score Distribution

In [None]:
# Plot the total score distribution split by binary outcome.
# Uses df_analysis (total_score / binary_outcome columns) built in the deciles section.

# Separate scores by outcome
scores_negative = df_analysis.loc[df_analysis['binary_outcome'] == 0, 'total_score']
scores_positive = df_analysis.loc[df_analysis['binary_outcome'] == 1, 'total_score']

# Compute one set of 30 bin edges over all scores and use it for both classes.
# Letting each histogram pick its own bins gives different edges and widths,
# so the overlaid bars would not be directly comparable.
shared_bins = np.histogram_bin_edges(df_analysis['total_score'], bins=30)

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(scores_negative, bins=shared_bins, alpha=0.7, label='Negative Outcome', color='lightblue')
ax.hist(scores_positive, bins=shared_bins, alpha=0.7, label='Positive Outcome', color='orange')

ax.set_xlabel('Total Score')
ax.set_ylabel('Frequency')
ax.set_title('Total Score Distribution by Binary Outcome')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Mean score for negative outcomes: {scores_negative.mean():.3f}")
print(f"Mean score for positive outcomes: {scores_positive.mean():.3f}")