In [None]:
import os, re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Folder that holds the evidence matrix and receives every CSV/PNG written by
# this cell; it is created on demand.
BASE = "thesis_artifacts"
os.makedirs(BASE, exist_ok=True)

# Evidence matrix. The code in this cell reads the columns 'system', 'sector',
# 'pipeline', 'latency', 'deployment', 'compliance_noted' and 'scalability'.
evidence_csv = os.path.join(BASE, "evidence_matrix.csv")
df = pd.read_csv(evidence_csv)

def bar_pair(categories, hc_counts, fin_counts, title, fname, ylabel="Number of systems"):
    """Save a grouped Healthcare-vs-Finance bar chart as a PNG under BASE.

    Parameters
    ----------
    categories : sequence of str
        Tick labels, one per pair of bars.
    hc_counts, fin_counts : sequence of numbers
        Bar heights for Healthcare and Finance, in the order of `categories`.
    title : str
        Axes title.
    fname : str
        File name; joined onto BASE to build the output path.
    ylabel : str
        Y-axis label.

    Returns
    -------
    None
        The figure is written at 220 dpi and then closed.
    """
    width = 0.35
    x = np.arange(len(categories))
    # (horizontal shift, heights, legend label) for each of the two series.
    series = (
        (-width / 2, hc_counts, 'Healthcare'),
        (+width / 2, fin_counts, 'Finance'),
    )

    fig, ax = plt.subplots(figsize=(8.5, 5))
    for shift, heights, label in series:
        ax.bar(x + shift, heights, width, label=label)

    ax.set_xticks(x)
    ax.set_xticklabels(categories, rotation=10)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()

    # Print each value just above its bar.
    for shift, heights, _ in series:
        for i, v in enumerate(heights):
            ax.text(i + shift, v + 0.02, str(v), ha='center', va='bottom', fontsize=8)

    fig.tight_layout()
    path = os.path.join(BASE, fname)
    fig.savefig(path, dpi=220)
    plt.close(fig)

# Figure 4.1 — Pipeline evidence by sector
def count(df, sector, col, starts):
    """Count the rows of `df` in `sector` whose `col` text begins with `starts`.

    Each cell is converted with str() and lower-cased before the prefix test,
    so `starts` has to be given in lower case to match anything.

    Returns a plain int.
    """
    in_sector = df['sector'] == sector
    has_prefix = df[col].map(lambda cell: str(cell).lower().startswith(starts))
    return int((in_sector & has_prefix.astype(bool)).sum())
# Tally the pipeline categories per sector. "Unclear" goes through the same
# case-insensitive prefix match as the explicit categories (and as Figure 4.4
# does for deployment), so a value such as "unclear" is not silently left out
# of the chart.
hc_stream = count(df, 'Healthcare', 'pipeline', 'stream')
hc_batch = count(df, 'Healthcare', 'pipeline', 'batch')
hc_unclear = count(df, 'Healthcare', 'pipeline', 'unclear')

fin_stream = count(df, 'Finance', 'pipeline', 'stream')
fin_batch = count(df, 'Finance', 'pipeline', 'batch')
fin_unclear = count(df, 'Finance', 'pipeline', 'unclear')

bar_pair(['Streaming (explicit)','Batch (explicit)','Unclear'],
         [hc_stream, hc_batch, hc_unclear], [fin_stream, fin_batch, fin_unclear],
         "Figure 4.1 – Pipeline evidence by sector (evidence-only)", "Figure_4_1_Pipeline_by_Sector.png")

# Figure 4.2 — Latency class by sector
def is_rt(v):
    """Return True when str(v), lower-cased, starts with 'real' or 'near'."""
    return str(v).lower().startswith(('real', 'near'))


def is_batch(v):
    """Return True when str(v), lower-cased, starts with 'batch'."""
    text = str(v).lower()
    return text.startswith('batch')
def _latency_counts(sector_name):
    """Return [real-time/near, batch, unclear] counts for one sector.

    "Unclear" uses the same case-insensitive prefix match as the other two
    categories (via `count`), so differently-cased values are not dropped.
    """
    latency = df.loc[df['sector'] == sector_name, 'latency']
    return [
        int(latency.map(is_rt).sum()),
        int(latency.map(is_batch).sum()),
        count(df, sector_name, 'latency', 'unclear'),
    ]


hc_rt, hc_bd, hc_lu = _latency_counts('Healthcare')
fin_rt, fin_bd, fin_lu = _latency_counts('Finance')

bar_pair(['Real-time/near real-time (explicit)','Batch/Delayed (explicit)','Unclear'],
         [hc_rt, hc_bd, hc_lu], [fin_rt, fin_bd, fin_lu],
         "Figure 4.2 – Latency class by sector (evidence-only)", "Figure_4_2_Latency_by_Sector.png")


# Figure 4.4 — Deployment model by sector
def dep_count(df, sector, prefix):
    """Count the rows of `df` in `sector` whose 'deployment' text begins with `prefix`.

    Both the cell text (after str()) and `prefix` are lower-cased, so the
    comparison is case-insensitive. Returns a plain int.
    """
    wanted = prefix.lower()
    in_sector = df['sector'] == sector
    has_prefix = df['deployment'].map(lambda cell: str(cell).lower().startswith(wanted))
    return int((in_sector & has_prefix.astype(bool)).sum())
# The category labels double as the (case-insensitive) prefixes handed to dep_count.
deployment_labels = ['Cloud', 'Hybrid', 'On-prem', 'Unclear']
hc = [dep_count(df, 'Healthcare', label) for label in deployment_labels]
fin = [dep_count(df, 'Finance', label) for label in deployment_labels]
bar_pair(deployment_labels, hc, fin,
         "Figure 4.4 – Deployment model by sector (evidence-only)",
         "Figure_4_4_Deployment_by_Sector.png")


# Transparency Index (0–5)
def is_explicit(val):
    """Return True for a non-blank string that is not 'unclear' (any case).

    Non-string values (for example NaN from a missing CSV cell) give False.
    """
    if not isinstance(val, str):
        return False
    text = val.strip()
    return text != "" and text.lower() != "unclear"
# Score every system 0/1 on five evidence dimensions and sum them (0-5).
# The flags are built in a separate frame: writing them back into `df` would
# replace the raw 'pipeline' / 'latency' / 'deployment' text with 0/1, after
# which re-running this section (or the figures above) without reloading the
# CSV would score or count everything as zero.
flag_sources = {
    # flag column in the output -> raw evidence column in df
    'pipeline': 'pipeline',
    'latency': 'latency',
    'compliance': 'compliance_noted',
    'deployment': 'deployment',
    'scale': 'scalability',
}
flag_columns = list(flag_sources)

transparency = df[['system', 'sector']].copy()
for flag, source_col in flag_sources.items():
    transparency[flag] = df[source_col].apply(is_explicit).astype(int)
transparency['transparency_index'] = transparency[flag_columns].sum(axis=1)

# Per-system table: same columns, in the same order, as before.
transparency[['system', 'sector'] + flag_columns + ['transparency_index']].to_csv(
    os.path.join(BASE, "transparency_index.csv"), index=False)

# Per-sector summary: number of systems and mean index.
sector = transparency.groupby('sector')['transparency_index'].agg(['count','mean']).reset_index()
sector.to_csv(os.path.join(BASE,"sector_summary.csv"), index=False)

# Horizontal bars, one per system.
fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(transparency['system'], transparency['transparency_index'])
ax.set_xlabel("Transparency Index (0–5)")
ax.set_ylabel("System")
ax.set_title("Figure 5.x – Transparency Index by System (evidence-only)")
fig.tight_layout()
fig.savefig(os.path.join(BASE,"Figure_5_Transparency_Index_By_System.png"), dpi=220)
plt.close(fig)

# Vertical bars, one per sector (mean index).
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(sector['sector'], sector['mean'])
ax.set_ylabel("Average Transparency Index (0–5)")
ax.set_title("Figure 5.y – Average Transparency Index by Sector (evidence-only)")
fig.tight_layout()
fig.savefig(os.path.join(BASE,"Figure_5_Transparency_Index_By_Sector.png"), dpi=220)
plt.close(fig)

print("All figures regenerated.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Plot defaults for this cell: white figure and axes background, 11 pt sans-serif text.
plt.rcParams.update({
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'font.size': 11,
    'font.family': 'sans-serif',
})

# Figure 4.3 — Tool/platform mentions by sector (keyword matches on each system's tool stack)
# One (system, sector, tool_stack) record per reviewed system.
system_rows = [
    ("PEACH (Perioperative AI Chatbot)", "Healthcare",
     "Claude 3.5 (Pair Chat framework); peri-operative protocols"),
    ("Premera Scout (Azure Health Bot)", "Healthcare",
     "Azure Health Bot; Cognitive Services"),
    ("NHS Digital Access Point (Sensely)", "Healthcare",
     "Sensely platform"),
    ("Nuance DAX Copilot embedded in Epic", "Healthcare",
     "Nuance DAX Copilot; Azure OpenAI; Epic"),
    ("Microsoft Healthcare Bot + Infermedica", "Healthcare",
     "Azure Healthcare Bot; Infermedica APIs"),
    ("Cleveland Clinic ambient AI scribe (Ambience)", "Healthcare",
     "Ambience Healthcare platform; EHR integration"),
    ("Amazon Finance Automation Q&A (Bedrock)", "Finance",
     "Amazon Bedrock (RAG); Kendra/OpenSearch; embeddings"),
    ("JPMorgan COiN (Contract Intelligence)", "Finance",
     "Internal document-processing stack"),
    ("Morgan Stanley – Assistant & Debrief", "Finance",
     "GPT-4 based Assistant; Debrief meeting summarization"),
    ("HSBC internal policy chatbot (Dialogflow)", "Finance",
     "Dialogflow; Google Cloud AI"),
    ("ING generative assistant (McKinsey)", "Finance",
     "Multi-step retrieval + ranking pipeline"),
    ("NatWest Cora / Cora+ (IBM watsonx; OpenAI)", "Finance",
     "IBM watsonx (Cora+); OpenAI collaboration"),
]

# Pivot the records into the column-oriented dict the DataFrame is built from.
systems, sectors, tool_stacks = zip(*system_rows)
data = {
    "system": list(systems),
    "sector": list(sectors),
    "tool_stack": list(tool_stacks),
}

df = pd.DataFrame(data)

# Keyword map: display group -> tool label -> lower-case substrings that are
# searched for in each system's lower-cased tool_stack text.
tool_groups = {
    "Cloud Platforms": {
        "Azure Services": ["azure"],
        "AWS Services": ["amazon", "bedrock", "kendra", "opensearch"],
        "Google Cloud": ["dialogflow", "google cloud"],
        "IBM Watson": ["watsonx"]
    },
    "AI Models": {
        "Claude": ["claude"],
        "GPT/OpenAI": ["gpt", "openai"],
    },
    "Healthcare Tech": {
        "Epic": ["epic"],
        "Nuance": ["nuance"],
        "Sensely": ["sensely"],
        "Ambience": ["ambience"],
        "Infermedica": ["infermedica"]
    }
}

# One record per (tool, system) hit; a system counts once per tool however
# many of that tool's keywords appear in its stack.
records = [
    {"tool": tool_name, "sector": row["sector"], "group": group_name}
    for group_name, tools in tool_groups.items()
    for tool_name, keywords in tools.items()
    for _, row in df.iterrows()
    if any(keyword in row["tool_stack"].lower() for keyword in keywords)
]

# Pin the columns so the groupby below still works (and yields an empty
# table) when nothing matched; a column-less frame would raise KeyError.
tool_df = pd.DataFrame(records, columns=["tool", "sector", "group"])
counts = tool_df.groupby(["tool", "sector", "group"]).size().reset_index(name="count")

# Lay the tools out left to right: consecutive slots one unit apart inside a
# group, plus an extra gap between one group and the next.
GROUP_GAP = 0.8

x_positions = {}       # tool label -> x coordinate
group_boundaries = []  # (first x, last x, group name) per group
x_pos = 0

for group_name, tools in tool_groups.items():
    first_slot = x_pos
    for tool_name in tools:
        x_positions[tool_name] = x_pos
        x_pos += 1
    last_slot = x_pos - 1
    group_boundaries.append((first_slot, last_slot, group_name))
    x_pos += GROUP_GAP

# Local import: this cell's own import list does not bring in os.
import os

# Create the scatter plot
fig, ax = plt.subplots(figsize=(14, 8))

# Marker styling per sector
sector_styles = {
    "Healthcare": {
        "color": "#2563EB",      # Blue
        "marker": "o",
        "size": 200,
        "alpha": 1.0,
        "edge_color": "white",
        "edge_width": 2
    },
    "Finance": {
        "color": "#DC2626",      # Red
        "marker": "s",
        "size": 200,
        "alpha": 1.0,
        "edge_color": "white",
        "edge_width": 2
    }
}

# Subtle shaded band behind each tool group
group_colors = ["#F8FAFC", "#F1F5F9", "#E2E8F0", "#CBD5E1"]
for i, (start, end, group_name) in enumerate(group_boundaries):
    ax.axvspan(start - 0.4, end + 0.4,
               alpha=0.3,
               color=group_colors[i % len(group_colors)],
               zorder=0)

# One scatter series per sector; only (tool, sector) pairs with at least one
# match are present in `counts`, so zero-count pairs get no marker.
for sector in ["Healthcare", "Finance"]:
    subset = counts[counts["sector"] == sector]
    if len(subset) > 0:
        x_vals = [x_positions[tool] for tool in subset["tool"]]
        y_vals = subset["count"]
        style = sector_styles[sector]

        scatter = ax.scatter(x_vals, y_vals,
                             c=style["color"],
                             marker=style["marker"],
                             s=style["size"],
                             alpha=style["alpha"],
                             edgecolors=style["edge_color"],
                             linewidth=style["edge_width"],
                             label=sector,
                             zorder=3)

# Fix the y-range (0 .. 15% above the largest count) BEFORE placing the group
# labels: they are positioned as a fraction of the top of the axis, so reading
# the limit earlier would use the autoscaled range instead of the final one.
y_top = max(counts["count"]) * 1.15 if len(counts) > 0 else 5
ax.set_ylim(0, y_top)

# Tick positions and labels, in tool_groups order
tool_names = [
    tool_name
    for tools in tool_groups.values()
    for tool_name in tools
    if tool_name in x_positions
]
x_tick_positions = [x_positions[tool_name] for tool_name in tool_names]

ax.set_xticks(x_tick_positions)
ax.set_xticklabels(tool_names, rotation=45, ha="right", fontsize=10)

# Group labels, centred over each group inside the padding above the data
for start, end, group_name in group_boundaries:
    mid_point = (start + end) / 2
    ax.text(mid_point, y_top * 0.95,
            group_name,
            ha='center', va='center',
            fontsize=12, fontweight='bold',
            bbox=dict(boxstyle="round,pad=0.4",
                      facecolor='white',
                      edgecolor='#6B7280',
                      linewidth=1),
            zorder=4)

# Axis labels (no main title, to leave room for the group labels)
ax.set_xlabel('Tools & Platforms (Grouped by Category)',
              fontsize=13, fontweight='bold',
              color='#374151')

ax.set_ylabel('Number of Implementations',
              fontsize=13, fontweight='bold',
              color='#374151')

# Legend, nudged above the upper-right corner of the axes
legend = ax.legend(loc='upper right',
                   bbox_to_anchor=(0.98, 1.05),
                   fontsize=12,
                   title='Sectors',
                   title_fontsize=13,
                   frameon=True,
                   fancybox=True,
                   framealpha=0.95,
                   edgecolor='#D1D5DB')
legend.get_frame().set_facecolor('white')

# Subtle grid drawn below the markers
ax.grid(True, alpha=0.3, linestyle='-', linewidth=0.5, color='#E5E7EB')
ax.set_axisbelow(True)

# Clean up the plot appearance
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_color('#D1D5DB')
ax.spines['bottom'].set_color('#D1D5DB')

# Adjust layout and save
plt.tight_layout()

# Save under a figure-specific file name with an explicit extension, in the
# same relative artefact folder the other figures use. The previous target
# had no extension and was shared with the compliance figure, so the two
# plots overwrote each other.
output_dir = "thesis_artifacts"
os.makedirs(output_dir, exist_ok=True)
scatter_path = os.path.join(output_dir, "Figure_4_3_Tools_by_Sector.png")
plt.savefig(scatter_path, dpi=300, bbox_inches='tight',
            facecolor='white', edgecolor='none')
plt.show()

# Text report: where the figure went, every individual match, the grouped
# counts, and a per-sector total.
print(f"🎨 Enhanced visualization saved to: {scatter_path}")
print("\n📊 Detailed Analysis:")
print("=" * 50)

# Every (tool, system) hit, in the order the matches were recorded
print("\nActual tool matches found:")
for tool, sector_name, group in zip(tool_df["tool"], tool_df["sector"], tool_df["group"]):
    print(f"  - {tool} ({sector_name}) from group: {group}")

print("\nFinal counts:")
for tool, sector_name, n_uses in zip(counts["tool"], counts["sector"], counts["count"]):
    print(f"  {tool} ({sector_name}): {n_uses} uses")

print("\nTool usage summary:")
print("-" * 40)
for sector in ["Healthcare", "Finance"]:
    sector_data = counts[counts["sector"] == sector]
    total = sector_data["count"].sum()
    print(f"{sector}: {total} total implementations across {len(sector_data)} tools")

In [None]:
# Figure 4.5 — Compliance-mechanism mentions by sector (keyword matches on each system's compliance notes)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Plot defaults for this cell: white figure and axes background, 11 pt sans-serif text.
plt.rcParams.update({
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'font.size': 11,
    'font.family': 'sans-serif',
})


# One (system, sector, compliance_noted) record per reviewed system.
compliance_rows = [
    ("PEACH (Perioperative AI Chatbot)", "Healthcare",
     '"Secure" LLM environment; institutional protocols'),
    ("Premera Scout (Azure Health Bot)", "Healthcare",
     "HIPAA-aligned on Azure (privacy compliance claimed; mechanisms not enumerated)"),
    ("NHS Digital Access Point (Sensely)", "Healthcare",
     "NHS data-protection governance implied; mechanisms not itemised"),
    ("Nuance DAX Copilot embedded in Epic", "Healthcare",
     'Azure OpenAI + Epic workflow integration ("secure"); mechanisms not enumerated'),
    ("Microsoft Healthcare Bot + Infermedica", "Healthcare",
     "HIPAA-aligned Azure service (mechanisms not enumerated)"),
    ("Cleveland Clinic ambient AI scribe (Ambience)", "Healthcare",
     "Hospital/EHR governance (no mechanism list)"),
    ("Amazon Finance Automation Q&A (Bedrock)", "Finance",
     "Enterprise controls/guardrails; logging (per AWS materials)"),
    ("JPMorgan COiN (Contract Intelligence)", "Finance",
     "Compliance emphasis (access/audit reported in press)"),
    ("Morgan Stanley – Assistant & Debrief", "Finance",
     "Client consent; internal-content-only; guardrails (press)"),
    ("HSBC internal policy chatbot (Dialogflow)", "Finance",
     "Enterprise controls on Google Cloud (no mechanism list)"),
    ("ING generative assistant (McKinsey)", "Finance",
     "Risk-coded guardrails (case study)"),
    ("NatWest Cora / Cora+ (IBM watsonx; OpenAI)", "Finance",
     "Bank-grade governance; OpenAI collaboration (press)"),
]

# Pivot the records into the column-oriented dict the DataFrame is built from.
systems, sectors, compliance_notes = zip(*compliance_rows)
data = {
    "system": list(systems),
    "sector": list(sectors),
    "compliance_noted": list(compliance_notes),
}

df = pd.DataFrame(data)

# Mechanism category -> substrings searched for in each system's lower-cased
# compliance note. "controls" appears under both Access Control and
# Audit/Guardrails, so a note containing it is counted in both categories.
compliance_groups = {
    "Encryption": ["encryption", "encrypted", "secure", "security"],
    "Access Control": ["access", "consent", "client consent", "controls"],
    "Audit/Guardrails": ["audit", "guardrails", "logging", "governance", "controls"],
    "EHR Integration": ["ehr", "epic", "hospital", "workflow integration"],
    "Regulatory Compliance": ["hipaa", "gdpr", "nhs", "data-protection", "privacy compliance", "bank-grade"]
}

# One record per (mechanism, system) hit; a system counts once per mechanism
# however many of that mechanism's keywords its note contains.
records = [
    {"mechanism": group_name, "sector": row["sector"], "system": row["system"]}
    for group_name, keywords in compliance_groups.items()
    for _, row in df.iterrows()
    if any(keyword.lower() in str(row["compliance_noted"]).lower() for keyword in keywords)
]

# Pin the columns so the groupby below still works (and yields an empty
# table) when nothing matched; a column-less frame would raise KeyError
# before any later "no data" guard could run.
comp_df = pd.DataFrame(records, columns=["mechanism", "sector", "system"])
counts = comp_df.groupby(["mechanism", "sector"]).size().reset_index(name="count")

# Mechanism categories in declaration order; each gets the integer x slot 0, 1, 2, ...
mechanisms = list(compliance_groups)
x_positions = dict(zip(mechanisms, range(len(mechanisms))))

# Local import: this cell's own import list does not bring in os.
import os

# Create the scatter plot
fig, ax = plt.subplots(figsize=(12, 7))

# Marker styling per sector
sector_styles = {
    "Healthcare": {
        "color": "#2563EB",      # Blue
        "marker": "o",
        "size": 180,
        "alpha": 1.0,
        "edge_color": "white",
        "edge_width": 2
    },
    "Finance": {
        "color": "#DC2626",      # Red
        "marker": "s",
        "size": 180,
        "alpha": 1.0,
        "edge_color": "white",
        "edge_width": 2
    }
}

# Subtle alternating background band behind each mechanism column
for i, mechanism in enumerate(mechanisms):
    ax.axvspan(i - 0.4, i + 0.4,
               alpha=0.1,
               color=["#F8FAFC", "#F1F5F9"][i % 2],
               zorder=0)

# One scatter series per sector; only (mechanism, sector) pairs with at least
# one match are present in `counts`, so zero-count pairs get no marker.
for sector in ["Healthcare", "Finance"]:
    subset = counts[counts["sector"] == sector]
    if len(subset) > 0:
        x_vals = [x_positions[mechanism] for mechanism in subset["mechanism"]]
        y_vals = subset["count"]
        style = sector_styles[sector]

        scatter = ax.scatter(x_vals, y_vals,
                             c=style["color"],
                             marker=style["marker"],
                             s=style["size"],
                             alpha=style["alpha"],
                             edgecolors=style["edge_color"],
                             linewidth=style["edge_width"],
                             label=sector,
                             zorder=3)

# Ticks and axis labels
ax.set_xticks(range(len(mechanisms)))
ax.set_xticklabels(mechanisms, fontsize=11, fontweight='bold')

ax.set_xlabel('Compliance Mechanism Category',
              fontsize=12, fontweight='bold',
              color='#374151')

ax.set_ylabel('Number of Systems Mentioning',
              fontsize=12, fontweight='bold',
              color='#374151')

# Legend in the upper-right corner of the axes
legend = ax.legend(loc='upper right',
                   bbox_to_anchor=(0.98, 0.98),
                   fontsize=12,
                   title='Sector',
                   title_fontsize=13,
                   frameon=True,
                   fancybox=True,
                   framealpha=0.95,
                   edgecolor='#D1D5DB')
legend.get_frame().set_facecolor('white')

# Horizontal grid lines only, drawn below the markers
ax.grid(True, alpha=0.3, linestyle='-', linewidth=0.5, color='#E5E7EB', axis='y')
ax.set_axisbelow(True)

# y-axis from 0 to 15% above the largest count (fallback 5 when there is no data)
ax.set_ylim(0, max(counts["count"]) * 1.15 if len(counts) > 0 else 5)

# Clean up the plot appearance
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_color('#D1D5DB')
ax.spines['bottom'].set_color('#D1D5DB')

plt.tight_layout()

# Save under a figure-specific file name with an explicit extension, in the
# same relative artefact folder the other figures use. The previous target
# had no extension and was shared with the tool-stack figure, so the two
# plots overwrote each other.
output_dir = "thesis_artifacts"
os.makedirs(output_dir, exist_ok=True)
compliance_fig_path = os.path.join(output_dir, "Figure_4_5_Compliance_by_Sector.png")
plt.savefig(compliance_fig_path, dpi=300, bbox_inches='tight',
            facecolor='white', edgecolor='none')
plt.show()

def _mentions(mechanism_name, sector_name):
    """Summed count for one mechanism/sector pair; 0 when the pair is absent from `counts`."""
    hit = (counts["mechanism"] == mechanism_name) & (counts["sector"] == sector_name)
    return int(counts.loc[hit, "count"].sum())


def _mark(passed):
    """Check mark when the expectation holds, cross when it does not."""
    return "✓" if passed else "✗"


print(f"📊 Compliance analysis saved to: {compliance_fig_path}")
print("\n🔍 Key Findings:")
print("="*50)

# Per-mechanism breakdown
for mechanism in mechanisms:
    if (counts["mechanism"] == mechanism).any():
        healthcare_count = _mentions(mechanism, "Healthcare")
        finance_count = _mentions(mechanism, "Finance")
        print(f"{mechanism}:")
        print(f"  Healthcare: {healthcare_count} | Finance: {finance_count}")
    else:
        print(f"{mechanism}: No mentions found")

print(f"\n📈 Sector Analysis:")
healthcare_total = counts[counts["sector"] == "Healthcare"]["count"].sum()
finance_total = counts[counts["sector"] == "Finance"]["count"].sum()
print(f"• Healthcare total compliance mentions: {healthcare_total}")
print(f"• Finance total compliance mentions: {finance_total}")

# Evaluate the expected patterns against the counts instead of printing a
# check mark unconditionally.
finance_audit_stronger = (
    _mentions("Audit/Guardrails", "Finance") > _mentions("Audit/Guardrails", "Healthcare")
)
# NOTE(review): "should show" is read here as "at least one Healthcare system
# mentions each of the two categories" — confirm this is the intended criterion.
healthcare_ehr_and_regulatory = (
    _mentions("EHR Integration", "Healthcare") > 0
    and _mentions("Regulatory Compliance", "Healthcare") > 0
)

print(f"\n🎯 Expected Pattern Verification:")
print(f"{_mark(finance_audit_stronger)} Finance should show stronger audit/guardrails emphasis")
print(f"{_mark(healthcare_ehr_and_regulatory)} Healthcare should show EHR integration and regulatory compliance")