# Case Study Analysis: Transformers vs Capsules

This notebook provides an initial analysis of the data collected for the Transformers vs Capsules case study.

## Overview

This analysis supports the Conveyance Framework research by examining differential adoption patterns between:

- **Transformers**: "Attention Is All You Need" (arXiv:1706.03762)
- **Capsule Networks**: "Dynamic Routing Between Capsules" (arXiv:1710.09829)

## Data Sources

1. Citation timelines from Semantic Scholar
2. GitHub repository implementations
3. Paper content and embeddings
4. Boundary objects (documentation, code)
5. Semantic embeddings in ArangoDB

In [None]:
import json
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

## 1. Load Data

In [None]:
# Define data paths (relative to the notebook's working directory)
data_dir = Path("../data/case_study")

# Load citation data.
# The files are read as UTF-8 explicitly: without `encoding`, open() falls back
# to the platform's locale codec, which can fail on (or silently corrupt) any
# non-ASCII text stored in the JSON.
with open(data_dir / "citations" / "transformers_citations.json", encoding="utf-8") as f:
    transformers_citations = json.load(f)

with open(data_dir / "citations" / "capsules_citations.json", encoding="utf-8") as f:
    capsules_citations = json.load(f)

print(f"Transformers total citations: {transformers_citations['total_citations']}")
print(f"Capsules total citations: {capsules_citations['total_citations']}")

In [None]:
# Load repository data.
# Read as UTF-8 explicitly so the result does not depend on the platform's
# locale encoding (open()'s default when `encoding` is omitted).
with open(data_dir / "implementations" / "transformers_repos.json", encoding="utf-8") as f:
    transformers_repos = json.load(f)

with open(data_dir / "implementations" / "capsules_repos.json", encoding="utf-8") as f:
    capsules_repos = json.load(f)

print(f"Transformers repositories: {transformers_repos['total_repositories']}")
print(f"Capsules repositories: {capsules_repos['total_repositories']}")

## 2. Citation Analysis

In [None]:
# Convert to pandas DataFrames
def monthly_to_df(monthly_data, paper_name):
    """Turn a monthly citation series into a chronologically ordered DataFrame.

    Parameters
    ----------
    monthly_data : list of dict (or anything ``pd.DataFrame`` accepts)
        Records providing at least ``year``, ``month`` and ``count``.
    paper_name : str
        Label stored in the ``paper`` column of every row.

    Returns
    -------
    pandas.DataFrame
        The input columns plus ``date`` (first day of the month), ``paper``
        and ``cumulative`` (running total of ``count``), sorted by ``date``
        with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``year``, ``month`` or ``count`` is missing (this includes empty
        input, which produces a frame with no columns).
    """
    df = pd.DataFrame(monthly_data)
    df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))
    # The running total is only meaningful in chronological order, so sort
    # before cumsum(); a stable sort keeps the input order of duplicate months.
    df = df.sort_values('date', kind='stable').reset_index(drop=True)
    df['paper'] = paper_name
    df['cumulative'] = df['count'].cumsum()
    return df

# One frame per paper, built from that paper's monthly citation series
trans_df = monthly_to_df(
    monthly_data=transformers_citations['monthly_citations'],
    paper_name='Transformers',
)
caps_df = monthly_to_df(
    monthly_data=capsules_citations['monthly_citations'],
    paper_name='Capsules',
)

# Stack both papers into a single long-format frame for comparison
citation_df = pd.concat((trans_df, caps_df))

# Summary statistics of the monthly counts, grouped by paper
print("\nCitation Statistics:")
monthly_counts_by_paper = citation_df.groupby('paper')['count']
print(monthly_counts_by_paper.describe())

In [None]:
# Plot citation growth: one cumulative-citation line per paper over time
axis_labels = {'cumulative': 'Cumulative Citations', 'date': 'Date'}

fig = px.line(
    citation_df, x='date', y='cumulative', color='paper',
    title='Cumulative Citation Growth Comparison',
    labels=axis_labels,
    template='plotly_white',
)
# A single hover box listing both papers at the hovered x position
fig.update_layout(hovermode='x unified')
fig.show()

In [None]:
# Calculate growth rates
def calculate_growth_rate(df):
    """Add a month-over-month percentage growth column to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``count`` column, one row per month.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (modified in place), with a new
        ``growth_rate`` column in percent. The first row is NaN, and so is any
        row following a zero-count month, where the relative change is
        undefined.
    """
    # Monthly growth rate
    growth = df['count'].pct_change() * 100
    # A zero-count month followed by a non-zero one yields +/-inf, which would
    # make every mean over the column infinite. Map those to NaN so that
    # aggregations skip them instead.
    df['growth_rate'] = growth.replace([np.inf, -np.inf], np.nan)
    return df

# Attach the growth-rate column to each paper's frame
trans_df = calculate_growth_rate(trans_df)
caps_df = calculate_growth_rate(caps_df)

# Mean growth over the first 12 rows of each frame
trans_first_year_growth = trans_df['growth_rate'].iloc[:12].mean()
caps_first_year_growth = caps_df['growth_rate'].iloc[:12].mean()

print("\nAverage monthly growth rate (first year):")
print(f"Transformers: {trans_first_year_growth:.2f}%")
print(f"Capsules: {caps_first_year_growth:.2f}%")

## 3. Repository Analysis

In [None]:
# Repository type distribution, listed paper by paper
print("\nRepository Type Distribution:")
for section_header, repos_data in (
    ("\nTransformers:", transformers_repos),
    ("\nCapsules:", capsules_repos),
):
    print(section_header)
    for repo_type, count in repos_data['type_counts'].items():
        print(f"  {repo_type}: {count}")

In [None]:
# Calculate total stars: sum of the 'stars' field over every collected repository
trans_stars = sum(repo['stars'] for repo in transformers_repos['repositories'])
caps_stars = sum(repo['stars'] for repo in capsules_repos['repositories'])

print("\nTotal GitHub stars:")
print(f"Transformers: {trans_stars:,}")
print(f"Capsules: {caps_stars:,}")
# The ratio is undefined when Capsules has no stars (e.g. an empty repository
# list); report that instead of raising ZeroDivisionError.
if caps_stars:
    print(f"Ratio: {trans_stars / caps_stars:.2f}x")
else:
    print("Ratio: n/a (Capsules has 0 stars)")

In [None]:
# Time to first implementation
def days_to_first_repo(paper_config, repos_data):
    """Return how many days passed between publication and the earliest repository.

    Parameters
    ----------
    paper_config : dict
        Must contain ``published_date`` as an ISO-8601 string (date-only,
        naive, offset-aware, or with a trailing ``Z``).
    repos_data : dict
        Must contain ``repositories``, a non-empty list of dicts that each
        carry an ISO-8601 ``created_at`` string.

    Returns
    -------
    tuple
        ``(days, first_repo)`` where ``days`` is the whole-day difference
        (negative if the repository predates publication) and ``first_repo``
        is the repository dict with the earliest ``created_at``.

    Raises
    ------
    ValueError
        If ``repositories`` is empty or a timestamp cannot be parsed.
    KeyError
        If a required key is missing.
    """
    def _parse_utc(timestamp):
        """Parse an ISO-8601 string into a timezone-aware UTC datetime."""
        # datetime.fromisoformat() rejects the 'Z' suffix before Python 3.11,
        # so spell the UTC offset out explicitly.
        if timestamp.endswith('Z'):
            timestamp = timestamp[:-1] + '+00:00'
        parsed = datetime.fromisoformat(timestamp)
        # Naive values are taken to be UTC so that every timestamp is
        # comparable; mixing naive and aware datetimes raises TypeError.
        if parsed.tzinfo is None:
            return parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)

    pub_date = _parse_utc(paper_config['published_date'])

    repositories = repos_data['repositories']
    if not repositories:
        raise ValueError("repos_data['repositories'] is empty; no first implementation to find")

    # Parse each timestamp once, then take the earliest (first one wins on ties)
    created_dates = [_parse_utc(repo['created_at']) for repo in repositories]
    first_index = min(range(len(repositories)), key=created_dates.__getitem__)

    days = (created_dates[first_index] - pub_date).days
    return days, repositories[first_index]

# Placeholder output: days_to_first_repo() needs per-paper config data
# (the publication date), which this cell does not supply yet.
print(
    "\nTime to first implementation analysis:",
    "(Requires publication date data)",
    sep="\n",
)

## 4. Boundary Objects Analysis

In [None]:
# Load boundary objects (optional data: a missing file is reported, not fatal)
try:
    # Read as UTF-8 explicitly instead of relying on the locale's default codec
    with open(data_dir / "boundary_objects" / "transformers" / "boundary_objects.json", encoding="utf-8") as f:
        trans_bo = json.load(f)

    with open(data_dir / "boundary_objects" / "capsules" / "boundary_objects.json", encoding="utf-8") as f:
        caps_bo = json.load(f)

    print("Boundary Objects Count:")
    print(f"Transformers: {trans_bo['total_objects']}")
    print(f"Capsules: {caps_bo['total_objects']}")

    # Type distribution: how many boundary objects of each 'type' per paper.
    # Counter comes from the notebook's import cell.
    trans_types = Counter(obj['type'] for obj in trans_bo['boundary_objects'])
    caps_types = Counter(obj['type'] for obj in caps_bo['boundary_objects'])

    for section_header, type_counts in (
        ("\nTransformers boundary object types:", trans_types),
        ("\nCapsules boundary object types:", caps_types),
    ):
        print(section_header)
        for obj_type, count in type_counts.items():
            print(f"  {obj_type}: {count}")

except FileNotFoundError:
    print("Boundary objects not yet collected. Run script 04.")

## 5. Semantic Analysis (From Embeddings)

In [None]:
# Load embedding coordinates (optional data: a missing file is reported, not fatal)
try:
    # Read as UTF-8 explicitly instead of relying on the locale's default codec
    with open(data_dir / "visualizations" / "embedding_coordinates.json", encoding="utf-8") as f:
        coords_data = json.load(f)

    coords = np.array(coords_data['coordinates'])
    labels = coords_data['labels']
    metadata = coords_data['metadata']

    print(f"Loaded {len(coords)} embedding coordinates")

    # Analyze semantic clustering.
    # Imported here so that scikit-learn is only needed once the embeddings file exists.
    from sklearn.metrics import silhouette_score

    # Create cluster labels by paper. Ids follow first-appearance order, which
    # (unlike iterating a set) is the same on every run.
    cluster_labels = [meta['paper'] for meta in metadata]
    unique_papers = list(dict.fromkeys(cluster_labels))
    paper_to_id = {paper: idx for idx, paper in enumerate(unique_papers)}
    cluster_ids = [paper_to_id[label] for label in cluster_labels]

    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1 and
    # raises ValueError otherwise (e.g. when only one paper is embedded so far).
    if 2 <= len(unique_papers) <= len(cluster_ids) - 1:
        silhouette = silhouette_score(coords, cluster_ids)
        print(f"\nSilhouette score (paper clustering): {silhouette:.3f}")
        print("(Higher = better separation between papers)")
    else:
        print(
            "\nSilhouette score (paper clustering): n/a "
            f"({len(unique_papers)} paper label(s) across {len(cluster_ids)} points)"
        )

except FileNotFoundError:
    print("Embeddings not yet generated. Run scripts 05 and 06.")

## 6. Conveyance Framework Metrics (Placeholder)

This section will calculate Conveyance Framework metrics:

- **W** (Writeability): Quality of paper writing
- **R** (Readability): Ease of understanding
- **C_ext** (External Context): Availability of boundary objects
- **P_ij** (Prior Knowledge): Community readiness
- **T** (Trust): Author/institution credibility

These require manual annotation and additional data sources; the cell below computes only a rough placeholder for **C_ext**.

In [None]:
# Placeholder for C_ext calculation
def calculate_c_ext(boundary_objects, implementations):
    """
    Placeholder external-context (C_ext) score in the range [0, 1].

    The score is the sum of three capped components:
    - 0.3 if any repository has type 'official'
    - total boundary objects / 20, capped at 0.4
    - number of 'tutorial' repositories / 10, capped at 0.3

    Args:
        boundary_objects: dict providing 'total_objects'.
        implementations: dict providing 'repositories', a list of dicts
            that each carry a 'type'.

    Returns:
        The summed score as a float, never above 1.0.
    """
    total = 0.0

    # Component 1: flat bonus when official code exists (stops at the first match)
    if 'official' in (repo['type'] for repo in implementations['repositories']):
        total += 0.3

    # Component 2: boundary-object volume, saturating at 0.4
    total += min(boundary_objects['total_objects'] / 20, 0.4)

    # Component 3: tutorial repositories, saturating at 0.3
    tutorial_repos = [
        repo for repo in implementations['repositories'] if repo['type'] == 'tutorial'
    ]
    total += min(len(tutorial_repos) / 10, 0.3)

    return min(total, 1.0)

# Calculate for both papers (if data available)
print("\nC_ext scores (External Context):")
print("(Placeholder calculation - needs refinement)")

try:
    trans_c_ext = calculate_c_ext(trans_bo, transformers_repos)
    caps_c_ext = calculate_c_ext(caps_bo, capsules_repos)
except (NameError, KeyError, TypeError) as exc:
    # NameError: the boundary-object or repository variables were never
    # created (their files were missing); KeyError/TypeError: the loaded data
    # lacks the expected fields. Anything else is a real bug and should surface.
    print(f"Data not available yet ({type(exc).__name__}: {exc})")
else:
    print(f"Transformers: {trans_c_ext:.3f}")
    print(f"Capsules: {caps_c_ext:.3f}")
    # A Capsules score of 0.0 is a valid result, not missing data, so report
    # the undefined ratio explicitly instead of dividing by zero.
    if caps_c_ext:
        print(f"Ratio: {trans_c_ext / caps_c_ext:.2f}x")
    else:
        print("Ratio: n/a (Capsules score is 0)")

## 7. Summary Report

In [None]:
# Generate summary report
def _format_ratio(numerator, denominator):
    """Format numerator/denominator as 'N.NNx', or 'n/a' when the denominator is zero."""
    if not denominator:
        return "n/a"
    return f"{numerator / denominator:.2f}x"


print("="*60)
print("CASE STUDY SUMMARY: Transformers vs Capsules")
print("="*60)

print("\n1. CITATION IMPACT")
print(f"   Transformers: {transformers_citations['total_citations']:,} citations")
print(f"   Capsules: {capsules_citations['total_citations']:,} citations")
print(f"   Ratio: {_format_ratio(transformers_citations['total_citations'], capsules_citations['total_citations'])}")

print("\n2. IMPLEMENTATION ADOPTION")
print(f"   Transformers: {transformers_repos['total_repositories']} repositories")
print(f"   Capsules: {capsules_repos['total_repositories']} repositories")
print(f"   Ratio: {_format_ratio(transformers_repos['total_repositories'], capsules_repos['total_repositories'])}")

print("\n3. COMMUNITY ENGAGEMENT")
print(f"   Transformers: {trans_stars:,} total stars")
print(f"   Capsules: {caps_stars:,} total stars")
print(f"   Ratio: {_format_ratio(trans_stars, caps_stars)}")

# The header is printed once, outside the try, so that missing data no longer
# prints it a second time from the fallback branch.
print("\n4. BOUNDARY OBJECTS")
try:
    trans_bo_total = trans_bo['total_objects']
    caps_bo_total = caps_bo['total_objects']
except (NameError, KeyError, TypeError):
    # NameError: the boundary-object variables were never created (files
    # missing); KeyError/TypeError: loaded data lacks 'total_objects'.
    print("   Not yet collected")
else:
    print(f"   Transformers: {trans_bo_total} objects")
    print(f"   Capsules: {caps_bo_total} objects")
    print(f"   Ratio: {_format_ratio(trans_bo_total, caps_bo_total)}")

print("\n" + "="*60)
print("Next steps:")
print("1. Manual annotation for W, R, P_ij, T metrics")
print("2. Statistical significance testing")
print("3. Qualitative analysis of adoption patterns")
print("4. Conveyance model validation")
print("="*60)

## 8. Export Results

In [None]:
# Export summary data for further analysis.
# Each paper gets the same four fields; the timestamp is appended last so the
# key order in the written file is: transformers, capsules, analysis_date.
summary = {
    paper_key: {
        'citations': citations['total_citations'],
        'repositories': repos['total_repositories'],
        'stars': stars,
        'type_distribution': repos['type_counts'],
    }
    for paper_key, citations, repos, stars in (
        ('transformers', transformers_citations, transformers_repos, trans_stars),
        ('capsules', capsules_citations, capsules_repos, caps_stars),
    )
}
summary['analysis_date'] = datetime.now().isoformat()

# Write the report next to the other case-study data
with (data_dir / 'summary_report.json').open('w') as report_file:
    json.dump(summary, report_file, indent=2)

print("Summary exported to data/case_study/summary_report.json")