# Open-Source Model Ecosystem Analysis

## Overview

The open-source LLM ecosystem is rapidly evolving with diverse models, licensing schemes, and deployment strategies. This notebook covers:

- **Model Checkpoints**: Management, versioning, and distribution systems
- **License Analysis**: Compatibility checking and compliance frameworks
- **Ecosystem Metrics**: Performance tracking and adoption analysis
- **Model Selection**: Decision frameworks for choosing appropriate models

Let's build practical tools for navigating the open-source model landscape.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import hashlib
import requests
from datetime import datetime, timedelta
from collections import defaultdict, Counter
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, field
from enum import Enum
import re

# Sanity message: this line is only reached if every import above succeeded.
print("Libraries imported successfully!")

## 1. Model Checkpoint Management System

Let's implement a comprehensive system for managing model checkpoints with versioning and metadata:

In [None]:
class LicenseType(Enum):
    """License under which a model checkpoint is distributed.

    Each member's value is the human-readable label that is shown when a
    license is printed or plotted (callers read it through ``.value``).
    Passing one of these label strings to ``LicenseType(...)`` returns the
    matching member.
    """
    # General-purpose software licenses
    MIT = "MIT"
    APACHE_2 = "Apache-2.0"
    GPL_3 = "GPL-3.0"
    BSD_3 = "BSD-3-Clause"
    # Creative Commons licenses
    CC_BY_4 = "CC-BY-4.0"
    CC_BY_SA_4 = "CC-BY-SA-4.0"
    # Model-specific and closed licenses
    LLAMA_2 = "Llama-2-Custom"
    OPENRAIL = "OpenRAIL-M"
    PROPRIETARY = "Proprietary"

@dataclass
class ModelCheckpoint:
    """Metadata record describing one released model checkpoint.

    The first ten fields are required and positional; the remaining fields
    have defaults. Mutable defaults (``performance_metrics``, ``tags``) use
    ``default_factory`` so each instance gets its own container.
    """
    # --- Required descriptive fields ---
    model_id: str              # unique key; the registry stores checkpoints under this id
    name: str                  # display name, e.g. "Llama 2 7B"
    version: str               # free-form version string, e.g. "1.0"
    organization: str          # publishing organization
    license_type: LicenseType  # license the checkpoint is released under
    parameter_count: int       # absolute number of parameters (7_000_000_000 for a 7B model)
    model_size_gb: float       # checkpoint size in gigabytes (per the field name)
    architecture: str          # architecture label, e.g. "Transformer"
    training_data: str         # free-text description of the training corpus
    release_date: datetime     # release date of the checkpoint
    # --- Optional links (empty string when unknown) ---
    download_url: str = ""
    paper_url: str = ""
    code_url: str = ""
    # --- Optional quality / popularity metadata ---
    performance_metrics: Dict[str, float] = field(default_factory=dict)  # benchmark name -> score, e.g. {"mmlu": 0.459}
    tags: List[str] = field(default_factory=list)                        # free-form labels such as "chat" or "code"
    download_count: int = 0
    rating: float = 0.0        # NOTE(review): sample data and plots suggest a 0-5 scale -- confirm
    # --- Usage permissions (all default to permitted) ---
    commercial_use: bool = True
    modification_allowed: bool = True
    redistribution_allowed: bool = True

class ModelRegistry:
    """In-memory registry of model checkpoints with search and analytics.

    Three pieces of state are kept in sync on every registration:

    * ``models`` -- ``model_id`` -> checkpoint.
    * ``search_index`` -- two-level inverted index,
      ``field -> value -> {model_id, ...}``, for the fields ``license``,
      ``organization``, ``architecture``, ``tag`` and ``param_range``.
    * ``analytics`` -- running totals plus per-license, per-organization
      and per-architecture counters.

    A freshly constructed registry is pre-populated with sample models.
    """

    def __init__(self):
        """Create the registry state and load the built-in sample models."""
        self.models = {}
        self.license_compatibility = self._init_license_compatibility()
        self.search_index = self._new_search_index()
        self.analytics = self._new_analytics()
        self._populate_sample_models()

    @staticmethod
    def _new_search_index():
        """Return an empty two-level index: field -> value -> set of model ids."""
        # A flat defaultdict(set) cannot be used here: the index is addressed
        # as index[field][value], so the first level must itself be a mapping.
        return defaultdict(lambda: defaultdict(set))

    @staticmethod
    def _new_analytics():
        """Return zeroed analytics counters."""
        return {
            'total_models': 0,
            'total_downloads': 0,
            'license_distribution': Counter(),
            'organization_distribution': Counter(),
            'architecture_distribution': Counter(),
        }

    def _init_license_compatibility(self):
        """Build the license compatibility matrix.

        Returns:
            Dict mapping ``(license_a, license_b)`` to ``True`` when the pair
            is considered compatible. Pairs that are absent are treated as
            incompatible by the lookups in this class. The matrix is a
            simplification, not legal advice.
        """
        return {
            (LicenseType.MIT, LicenseType.MIT): True,
            (LicenseType.MIT, LicenseType.APACHE_2): True,
            (LicenseType.MIT, LicenseType.BSD_3): True,
            (LicenseType.APACHE_2, LicenseType.APACHE_2): True,
            (LicenseType.APACHE_2, LicenseType.MIT): True,
            (LicenseType.GPL_3, LicenseType.GPL_3): True,
            (LicenseType.GPL_3, LicenseType.MIT): False,  # GPL is more restrictive
            (LicenseType.CC_BY_4, LicenseType.CC_BY_4): True,
            (LicenseType.CC_BY_SA_4, LicenseType.CC_BY_SA_4): True,
            (LicenseType.LLAMA_2, LicenseType.LLAMA_2): True,
            (LicenseType.PROPRIETARY, LicenseType.PROPRIETARY): False,
        }

    def _populate_sample_models(self):
        """Register the built-in sample checkpoints."""
        sample_models = [
            ModelCheckpoint(
                model_id="llama-2-7b",
                name="Llama 2 7B",
                version="1.0",
                organization="Meta",
                license_type=LicenseType.LLAMA_2,
                parameter_count=7_000_000_000,
                model_size_gb=13.5,
                architecture="Transformer",
                training_data="Custom dataset (2T tokens)",
                release_date=datetime(2023, 7, 18),
                performance_metrics={"hellaswag": 0.776, "mmlu": 0.459, "truthfulqa": 0.389},
                tags=["chat", "instruct", "general"],
                download_count=150000,
                rating=4.2,
                commercial_use=True,
            ),
            ModelCheckpoint(
                model_id="mistral-7b",
                name="Mistral 7B",
                version="0.1",
                organization="Mistral AI",
                license_type=LicenseType.APACHE_2,
                parameter_count=7_300_000_000,
                model_size_gb=14.2,
                architecture="Transformer",
                training_data="Web crawl + curated datasets",
                release_date=datetime(2023, 9, 27),
                performance_metrics={"hellaswag": 0.813, "mmlu": 0.624, "truthfulqa": 0.425},
                tags=["general", "efficient", "open"],
                download_count=89000,
                rating=4.5,
                commercial_use=True,
            ),
            ModelCheckpoint(
                model_id="falcon-7b",
                name="Falcon 7B",
                version="1.0",
                organization="TII",
                license_type=LicenseType.APACHE_2,
                parameter_count=7_000_000_000,
                model_size_gb=14.0,
                architecture="Transformer",
                training_data="RefinedWeb (1.5T tokens)",
                release_date=datetime(2023, 6, 5),
                performance_metrics={"hellaswag": 0.743, "mmlu": 0.353, "truthfulqa": 0.344},
                tags=["general", "multilingual"],
                download_count=67000,
                rating=3.9,
                commercial_use=True,
            ),
            ModelCheckpoint(
                model_id="vicuna-7b",
                name="Vicuna 7B",
                version="1.5",
                organization="LMSYS",
                license_type=LicenseType.LLAMA_2,
                parameter_count=7_000_000_000,
                model_size_gb=13.5,
                architecture="Transformer",
                training_data="ShareGPT conversations",
                release_date=datetime(2023, 7, 20),
                performance_metrics={"hellaswag": 0.766, "mmlu": 0.471, "truthfulqa": 0.392},
                tags=["chat", "conversation", "fine-tuned"],
                download_count=45000,
                rating=4.1,
                commercial_use=False,  # Due to training data restrictions
            ),
            ModelCheckpoint(
                model_id="code-llama-7b",
                name="Code Llama 7B",
                version="1.0",
                organization="Meta",
                license_type=LicenseType.LLAMA_2,
                parameter_count=7_000_000_000,
                model_size_gb=13.5,
                architecture="Transformer",
                training_data="Code datasets + Llama 2",
                release_date=datetime(2023, 8, 24),
                performance_metrics={"humaneval": 0.299, "mbpp": 0.374},
                tags=["code", "programming", "specialized"],
                download_count=78000,
                rating=4.3,
                commercial_use=True,
            ),
        ]

        for model in sample_models:
            self.register_model(model)

    def register_model(self, model: "ModelCheckpoint"):
        """Add ``model`` to the registry, replacing any entry with the same id.

        When the id is already registered, the previous entry is first removed
        from the search index and the analytics so that counters keep matching
        ``len(self.models)`` and no stale index entries survive.
        """
        previous = self.models.get(model.model_id)
        if previous is not None:
            self._forget_model(previous)

        self.models[model.model_id] = model
        self._update_search_index(model)
        self._update_analytics(model)

    def _forget_model(self, model: "ModelCheckpoint"):
        """Undo the index and analytics contributions of a registered model."""
        for buckets in self.search_index.values():
            for ids in buckets.values():
                ids.discard(model.model_id)

        self.analytics['total_models'] -= 1
        self.analytics['total_downloads'] -= model.download_count
        for counter_name, key in (
            ('license_distribution', model.license_type),
            ('organization_distribution', model.organization),
            ('architecture_distribution', model.architecture),
        ):
            counter = self.analytics[counter_name]
            counter[key] -= 1
            if counter[key] <= 0:
                # Drop exhausted keys so distributions never report zero counts.
                del counter[key]

    def _update_search_index(self, model: "ModelCheckpoint"):
        """Add ``model`` to every bucket of the inverted search index.

        Organization, architecture and tags are indexed lower-cased so that
        lookups are case-insensitive.
        """
        model_id = model.model_id
        index = self.search_index

        index['license'][model.license_type].add(model_id)
        index['organization'][model.organization.lower()].add(model_id)
        index['architecture'][model.architecture.lower()].add(model_id)

        for tag in model.tags:
            index['tag'][tag.lower()].add(model_id)

        param_range = self._get_parameter_range(model.parameter_count)
        index['param_range'][param_range].add(model_id)

    def _update_analytics(self, model: "ModelCheckpoint"):
        """Add ``model`` to the running totals and distribution counters."""
        stats = self.analytics
        stats['total_models'] += 1
        stats['total_downloads'] += model.download_count
        stats['license_distribution'][model.license_type] += 1
        stats['organization_distribution'][model.organization] += 1
        stats['architecture_distribution'][model.architecture] += 1

    def _get_parameter_range(self, param_count):
        """Return the size bucket label for an absolute parameter count.

        Buckets are ``"<1B"``, ``"1B-10B"``, ``"10B-100B"`` and ``">100B"``;
        each lower bound is inclusive and each upper bound exclusive.
        """
        for upper_bound, label in ((1e9, "<1B"), (10e9, "1B-10B"), (100e9, "10B-100B")):
            if param_count < upper_bound:
                return label
        return ">100B"

    def _indexed_ids(self, index_field, key):
        """Return the ids indexed under ``index_field``/``key`` without mutating the index."""
        # .get() avoids the defaultdict side effect of creating an empty
        # bucket for every value that is merely searched for.
        return self.search_index.get(index_field, {}).get(key, frozenset())

    def search_models(self, query_params: Dict[str, Any]) -> List["ModelCheckpoint"]:
        """Return the models matching every criterion in ``query_params``.

        Supported keys (all optional, combined with AND):

        * ``license`` -- a ``LicenseType`` or its string value.
        * ``organization`` -- case-insensitive exact match.
        * ``tags`` -- iterable of tags; a model must carry all of them.
        * ``param_range`` -- a bucket label from ``_get_parameter_range``.
        * ``commercial_use`` -- when truthy, keep only models that allow
          commercial use; when falsy, no filtering is applied.
        * ``min_rating`` -- inclusive lower bound on ``rating``.

        Returns:
            Matching checkpoints sorted by ``download_count * rating``,
            highest first; ties are broken by ``model_id``.

        Raises:
            ValueError: if ``license`` is a string that is not a valid
                ``LicenseType`` value.
        """
        candidate_ids = set(self.models)

        if 'license' in query_params:
            license_type = query_params['license']
            if isinstance(license_type, str):
                license_type = LicenseType(license_type)
            candidate_ids &= self._indexed_ids('license', license_type)

        if 'organization' in query_params:
            organization = query_params['organization'].lower()
            candidate_ids &= self._indexed_ids('organization', organization)

        if 'tags' in query_params:
            for tag in query_params['tags']:
                candidate_ids &= self._indexed_ids('tag', tag.lower())

        if 'param_range' in query_params:
            candidate_ids &= self._indexed_ids('param_range', query_params['param_range'])

        if query_params.get('commercial_use'):
            candidate_ids = {
                mid for mid in candidate_ids if self.models[mid].commercial_use
            }

        if 'min_rating' in query_params:
            min_rating = query_params['min_rating']
            candidate_ids = {
                mid for mid in candidate_ids if self.models[mid].rating >= min_rating
            }

        results = [self.models[mid] for mid in candidate_ids]
        # Relevance is popularity weighted by rating; the id tie-break makes
        # the order independent of set iteration order.
        results.sort(key=lambda m: (-(m.download_count * m.rating), m.model_id))
        return results

    def _licenses_compatible(self, first, second) -> bool:
        """Return True if the matrix marks the pair compatible in either direction."""
        matrix = self.license_compatibility
        return bool(
            matrix.get((first, second), False) or matrix.get((second, first), False)
        )

    def check_license_compatibility(self, model_ids: List[str]) -> Dict[str, Any]:
        """Check whether the given models can be combined license-wise.

        Unknown ids are ignored. With fewer than two known models there is
        nothing to compare and the result is trivially compatible.

        Returns:
            Dict with ``compatible`` (bool), ``issues`` (list of dicts, one
            per incompatible pair plus one entry listing models that forbid
            commercial use) and ``models_checked`` (names of known models).
        """
        models = [self.models[mid] for mid in model_ids if mid in self.models]
        models_checked = [m.name for m in models]

        if len(models) < 2:
            return {'compatible': True, 'issues': [], 'models_checked': models_checked}

        issues = []

        # Pairwise license check over every unordered pair.
        for position, model1 in enumerate(models):
            for model2 in models[position + 1:]:
                if self._licenses_compatible(model1.license_type, model2.license_type):
                    continue
                issues.append({
                    'model1': model1.name,
                    'model2': model2.name,
                    'license1': model1.license_type.value,
                    'license2': model2.license_type.value,
                    'issue': 'License incompatibility',
                })

        commercial_restricted = [m.name for m in models if not m.commercial_use]
        if commercial_restricted:
            issues.append({
                'models': commercial_restricted,
                'issue': 'Commercial use restricted',
            })

        return {
            'compatible': not issues,
            'issues': issues,
            'models_checked': models_checked,
        }

    def get_model_recommendations(self, requirements: Dict[str, Any]) -> List[Tuple["ModelCheckpoint", float]]:
        """Return up to ten ``(model, score)`` pairs, best score first.

        Models whose score is not strictly positive (for example, models
        disqualified by a commercial-use requirement) are omitted.
        """
        scored = (
            (model, self._calculate_recommendation_score(model, requirements))
            for model in self.models.values()
        )
        recommendations = [(model, score) for model, score in scored if score > 0]
        recommendations.sort(key=lambda pair: pair[1], reverse=True)
        return recommendations[:10]

    def _calculate_recommendation_score(self, model: "ModelCheckpoint", requirements: Dict[str, Any]) -> float:
        """Score ``model`` against ``requirements``; the result is clamped to [0, 5].

        Recognised requirement keys: ``license_preference``,
        ``commercial_use``, ``task_type``, ``min_performance`` (with optional
        ``benchmark``, default ``"mmlu"``) and ``max_size_gb``. A model that
        forbids commercial use scores 0.0 when commercial use is required.
        """
        # Hard requirement first: nothing else can rescue a disqualified model.
        if requirements.get('commercial_use', False) and not model.commercial_use:
            return 0.0

        # Base score from rating and (capped) popularity.
        score = model.rating * 0.3
        score += min(model.download_count / 100000, 1.0) * 0.2

        if 'license_preference' in requirements:
            preferred_license = requirements['license_preference']
            if model.license_type == preferred_license:
                score += 0.3
            elif self.license_compatibility.get((model.license_type, preferred_license), False):
                score += 0.15

        if 'task_type' in requirements:
            task_type = requirements['task_type'].lower()
            if any(task_type == tag.lower() for tag in model.tags):
                score += 0.4

        # The multiplicative penalties below apply to everything accumulated
        # so far, so the order of these sections matters.
        if 'min_performance' in requirements:
            benchmark = requirements.get('benchmark', 'mmlu')
            min_perf = requirements['min_performance']
            if benchmark in model.performance_metrics:
                actual_perf = model.performance_metrics[benchmark]
                if actual_perf >= min_perf:
                    score += 0.3 * (actual_perf - min_perf)
                else:
                    score *= 0.5  # Penalize for not meeting requirements

        if 'max_size_gb' in requirements:
            if model.model_size_gb <= requirements['max_size_gb']:
                score += 0.1
            else:
                score *= 0.7  # Penalize for being too large

        return max(0.0, min(5.0, score))

    def get_ecosystem_analytics(self) -> Dict[str, Any]:
        """Return a report with overview, performance, license, timeline and size sections.

        The ``overview`` counters are copies, so mutating the report does not
        corrupt the registry's own analytics.
        """
        models = list(self.models.values())
        overview = {
            key: value.copy() if isinstance(value, Counter) else value
            for key, value in self.analytics.items()
        }

        return {
            'overview': overview,
            'performance_analysis': self._analyze_performance_trends(models),
            'license_analysis': self._analyze_license_landscape(models),
            'temporal_analysis': self._analyze_release_timeline(models),
            'size_analysis': self._analyze_model_sizes(models),
        }

    def _analyze_performance_trends(self, models: List["ModelCheckpoint"]) -> Dict[str, Any]:
        """Return mean/std/min/max/count per benchmark across ``models``.

        Only models that report a benchmark contribute to its statistics.
        Benchmarks appear in alphabetical order.
        """
        scores_by_benchmark = defaultdict(list)
        for model in models:
            for benchmark, value in model.performance_metrics.items():
                scores_by_benchmark[benchmark].append(value)

        return {
            benchmark: {
                'mean': np.mean(scores),
                'std': np.std(scores),
                'min': np.min(scores),
                'max': np.max(scores),
                'count': len(scores),
            }
            for benchmark, scores in sorted(scores_by_benchmark.items())
        }

    def _analyze_license_landscape(self, models: List["ModelCheckpoint"]) -> Dict[str, Any]:
        """Return the license distribution and the share of commercial-friendly models.

        ``most_common_license`` is a ``(license, count)`` tuple, or ``None``
        when ``models`` is empty (in which case the ratio is 0.0).
        """
        license_stats = Counter(m.license_type for m in models)
        commercial_friendly = sum(1 for m in models if m.commercial_use)

        return {
            'distribution': dict(license_stats),
            'commercial_friendly_ratio': commercial_friendly / len(models) if models else 0.0,
            'most_common_license': license_stats.most_common(1)[0] if license_stats else None,
        }

    def _analyze_release_timeline(self, models: List["ModelCheckpoint"]) -> Dict[str, Any]:
        """Return release-date statistics, or ``{}`` when ``models`` is empty.

        ``release_velocity`` is models per 30-day month; the time span is
        floored at one month so a single release day does not divide by zero.
        """
        release_dates = [m.release_date for m in models]
        if not release_dates:
            return {}

        earliest = min(release_dates)
        latest = max(release_dates)
        span_days = (latest - earliest).days

        monthly_releases = Counter(
            f"{released.year}-{released.month:02d}" for released in release_dates
        )

        return {
            'earliest_release': earliest,
            'latest_release': latest,
            'total_timespan_days': span_days,
            'monthly_distribution': dict(monthly_releases),
            'release_velocity': len(models) / max(1, span_days / 30),  # Models per month
        }

    def _analyze_model_sizes(self, models: List["ModelCheckpoint"]) -> Dict[str, Any]:
        """Return parameter-count and on-disk size statistics.

        For an empty ``models`` list all three sections are empty dicts,
        because min/max of an empty sequence is undefined.
        """
        if not models:
            return {'parameter_stats': {}, 'size_stats': {}, 'size_distribution': {}}

        param_counts = [m.parameter_count for m in models]
        model_sizes = [m.model_size_gb for m in models]

        return {
            'parameter_stats': {
                'mean': np.mean(param_counts),
                'median': np.median(param_counts),
                'min': np.min(param_counts),
                'max': np.max(param_counts),
            },
            'size_stats': {
                'mean_gb': np.mean(model_sizes),
                'median_gb': np.median(model_sizes),
                'min_gb': np.min(model_sizes),
                'max_gb': np.max(model_sizes),
            },
            'size_distribution': dict(Counter(self._get_parameter_range(pc) for pc in param_counts)),
        }

# Build the shared registry instance used by the cells below; the constructor
# also loads the built-in sample models.
model_registry = ModelRegistry()
print("Model registry initialized with sample models!")

### Testing Model Registry and Search

Let's test our model registry with various search queries and analysis:

In [None]:
# Exercise the registry search with a handful of representative queries.
# Each scenario carries a display name, the query dict handed to
# search_models(), and a one-line description.
search_scenarios = [
    dict(
        name='Commercial Use Models',
        query={'commercial_use': True, 'min_rating': 4.0},
        description='Models suitable for commercial use with good ratings',
    ),
    dict(
        name='Apache Licensed Models',
        query={'license': LicenseType.APACHE_2},
        description='Models with Apache 2.0 license',
    ),
    dict(
        name='Code-Focused Models',
        query={'tags': ['code']},
        description='Models specialized for code generation',
    ),
    dict(
        name='Chat Models',
        query={'tags': ['chat'], 'param_range': '1B-10B'},
        description='Chat models in the 1B-10B parameter range',
    ),
    dict(
        name='Meta Models',
        query={'organization': 'Meta'},
        description='Models released by Meta',
    ),
]

print("Testing model search functionality...\n")

# One summary row per scenario; consumed by the plots further down.
search_results = []

for scenario in search_scenarios:
    print(
        f"Search: {scenario['name']}",
        f"Description: {scenario['description']}",
        f"Query: {scenario['query']}",
        sep="\n",
    )

    results = model_registry.search_models(scenario['query'])

    print(f"Found {len(results)} models:")
    for model in results:
        print(
            f"  - {model.name} ({model.organization})"
            f" - Rating: {model.rating}, Downloads: {model.download_count:,}"
        )

    search_results.append(dict(
        scenario=scenario['name'],
        query_complexity=len(scenario['query']),
        results_count=len(results),
        # An empty result set has no mean rating; report 0 instead.
        avg_rating=0 if not results else np.mean([m.rating for m in results]),
    ))

    print("-" * 60)

# Run the license compatibility checker over a few model combinations
print("\nTesting license compatibility...")

compatibility_tests = [
    ['llama-2-7b', 'vicuna-7b'],   # Same license family
    ['mistral-7b', 'falcon-7b'],   # Both Apache 2.0
    ['llama-2-7b', 'mistral-7b'],  # Different licenses
]

for test_models in compatibility_tests:
    compatibility = model_registry.check_license_compatibility(test_models)
    # Resolve ids to display names, skipping ids the registry does not know.
    model_names = [
        model_registry.models[model_id].name
        for model_id in test_models
        if model_id in model_registry.models
    ]

    print("\nCompatibility check: " + " + ".join(model_names))
    print(f"Compatible: {compatibility['compatible']}")

    if not compatibility['issues']:
        continue

    print("Issues found:")
    for issue in compatibility['issues']:
        print(f"  - {issue}")

# Exercise the recommendation engine with three requirement profiles
print("\n" + "=" * 60)
print("TESTING MODEL RECOMMENDATIONS")
print("=" * 60)

recommendation_scenarios = [
    dict(
        name='Commercial Chat Application',
        requirements={
            'commercial_use': True,
            'task_type': 'chat',
            'min_performance': 0.4,
            'benchmark': 'mmlu',
            'max_size_gb': 15,
        },
    ),
    dict(
        name='Open Source Code Assistant',
        requirements={
            'license_preference': LicenseType.APACHE_2,
            'task_type': 'code',
            'commercial_use': True,
        },
    ),
    dict(
        name='Research Project',
        requirements={
            'commercial_use': False,
            'min_performance': 0.45,
            'benchmark': 'mmlu',
        },
    ),
]

# One summary row per scenario; consumed by the plots further down.
recommendation_results = []

for scenario in recommendation_scenarios:
    print(f"\nScenario: {scenario['name']}")
    print(f"Requirements: {scenario['requirements']}")

    recommendations = model_registry.get_model_recommendations(scenario['requirements'])

    # Show only the three best matches; the summary row covers all of them.
    print("\nTop recommendations:")
    for i, (model, score) in enumerate(recommendations[:3], 1):
        print(f"  {i}. {model.name} (Score: {score:.2f})")
        print(f"     License: {model.license_type.value}, Commercial: {model.commercial_use}")
        print(f"     Rating: {model.rating}, Size: {model.model_size_gb:.1f}GB")

    recommendation_results.append(dict(
        scenario=scenario['name'],
        recommendations_count=len(recommendations),
        top_score=0 if not recommendations else recommendations[0][1],
        avg_score=0 if not recommendations else np.mean([pair[1] for pair in recommendations]),
    ))

# Visualize search and recommendation results on a 3x3 grid of subplots.
# The pyplot calls below act on the "current" axes, so each section must
# select its subplot before drawing.
df_search = pd.DataFrame(search_results)
df_recommendations = pd.DataFrame(recommendation_results)

plt.figure(figsize=(16, 12))

# Search results distribution
plt.subplot(3, 3, 1)
plt.bar(df_search['scenario'], df_search['results_count'], color='skyblue')
plt.xlabel('Search Scenario')
plt.ylabel('Results Count')
plt.title('Search Results by Scenario')
plt.xticks(rotation=45, ha='right')

# Average rating of search results
plt.subplot(3, 3, 2)
plt.bar(df_search['scenario'], df_search['avg_rating'], color='lightgreen')
plt.xlabel('Search Scenario')
plt.ylabel('Average Rating')
plt.title('Quality of Search Results')
plt.xticks(rotation=45, ha='right')
plt.ylim(0, 5)

# Recommendation scores
plt.subplot(3, 3, 3)
plt.bar(df_recommendations['scenario'], df_recommendations['top_score'], color='coral')
plt.xlabel('Recommendation Scenario')
plt.ylabel('Top Recommendation Score')
plt.title('Recommendation Quality')
plt.xticks(rotation=45, ha='right')

# Model registry overview
analytics = model_registry.get_ecosystem_analytics()

# License distribution
plt.subplot(3, 3, 4)
license_dist = analytics['license_analysis']['distribution']
# Keys are LicenseType members; use their display values as slice labels.
license_names = [license_type.value for license_type in license_dist.keys()]
license_counts = list(license_dist.values())
plt.pie(license_counts, labels=license_names, autopct='%1.1f%%')
plt.title('License Distribution')

# Organization distribution
plt.subplot(3, 3, 5)
org_dist = analytics['overview']['organization_distribution']
plt.bar(org_dist.keys(), org_dist.values(), color='lightblue')
plt.xlabel('Organization')
plt.ylabel('Model Count')
plt.title('Models by Organization')
plt.xticks(rotation=45)

# Parameter size distribution
plt.subplot(3, 3, 6)
size_dist = analytics['size_analysis']['size_distribution']
plt.bar(size_dist.keys(), size_dist.values(), color='gold')
plt.xlabel('Parameter Range')
plt.ylabel('Model Count')
plt.title('Model Size Distribution')

# Performance comparison (MMLU scores); models without an MMLU score are omitted
plt.subplot(3, 3, 7)
models_with_mmlu = [m for m in model_registry.models.values() if 'mmlu' in m.performance_metrics]
model_names = [m.name.split()[0] for m in models_with_mmlu]  # Shortened names
mmlu_scores = [m.performance_metrics['mmlu'] for m in models_with_mmlu]
plt.bar(model_names, mmlu_scores, color='lightcoral')
plt.xlabel('Model')
plt.ylabel('MMLU Score')
plt.title('MMLU Performance Comparison')
plt.xticks(rotation=45)

# Download popularity
plt.subplot(3, 3, 8)
all_models = list(model_registry.models.values())
model_names_short = [m.name.split()[0] for m in all_models]
download_counts = [m.download_count for m in all_models]
plt.bar(model_names_short, download_counts, color='mediumpurple')
plt.xlabel('Model')
plt.ylabel('Download Count')
plt.title('Model Popularity (Downloads)')
plt.xticks(rotation=45)

# Commercial use vs Rating
plt.subplot(3, 3, 9)
commercial_models = [m for m in all_models if m.commercial_use]
non_commercial_models = [m for m in all_models if not m.commercial_use]

commercial_ratings = [m.rating for m in commercial_models]
non_commercial_ratings = [m.rating for m in non_commercial_models]

plt.boxplot([commercial_ratings, non_commercial_ratings])
# Label the boxes through xticks rather than boxplot(labels=...): that keyword
# is deprecated since Matplotlib 3.9 (renamed tick_labels), while xticks works
# on every version. Boxes are drawn at positions 1..N by default.
plt.xticks([1, 2], ['Commercial OK', 'Non-Commercial'])
plt.ylabel('Rating')
plt.title('Rating Distribution by Commercial Use')

plt.tight_layout()
plt.show()

# Print a textual summary of the ecosystem analytics computed above.
print("\n=== ECOSYSTEM ANALYTICS SUMMARY ===")
print(f"Total models in registry: {analytics['overview']['total_models']}")
print(f"Total downloads: {analytics['overview']['total_downloads']:,}")
print(f"Commercial-friendly models: {analytics['license_analysis']['commercial_friendly_ratio']:.1%}")
# most_common_license is a (license, count) tuple, or None for an empty registry.
print(f"Most common license: {analytics['license_analysis']['most_common_license'][0].value if analytics['license_analysis']['most_common_license'] else 'N/A'}")
print(f"Average model size: {analytics['size_analysis']['size_stats']['mean_gb']:.1f} GB")
print(f"Release velocity: {analytics['temporal_analysis']['release_velocity']:.1f} models/month")

# MMLU statistics exist only if at least one model reports an MMLU score.
if 'mmlu' in analytics['performance_analysis']:
    mmlu_stats = analytics['performance_analysis']['mmlu']
    # \u00b1 is the plus-minus sign; the escape keeps it safe from the
    # encoding damage that had turned it into a two-character sequence.
    print(f"Average MMLU performance: {mmlu_stats['mean']:.3f} (\u00b1{mmlu_stats['std']:.3f})")
    print(f"Best MMLU performance: {mmlu_stats['max']:.3f}")