# Data Collection and Filtering Pipeline

This notebook demonstrates our systematic approach to creating high-quality negative datasets for protein-protein interaction prediction.

## Overview

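We implement a multi-stage filtering pipeline that:
- Excludes known protein interactors
- Excludes proteins annotated to membrane, extracellular, or synaptic compartments
- Excludes proteins whose tissue-specificity annotation mentions brain or neural tissue
- Validates the remaining candidates against the STRING interaction network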

Together, these filters are intended to ensure that the negative dataset contains only proteins that are unlikely to interact with our target protein.

In [None]:

import requests
import pandas as pd
from typing import Set, List, Dict, Tuple
import time
import json
from io import StringIO
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set up plotting
# Apply the 'seaborn-v0_8' Matplotlib style sheet first, then set seaborn's
# default colour palette to "husl" for the figures drawn later in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


## ProteinFilter Class

The `ProteinFilter` class implements our protein filtering system, which applies multiple validation layers:

In [None]:
class ProteinFilter:
    """
    Filter candidate proteins for a negative (non-interacting) dataset.

    A candidate is accepted by ``is_suitable_negative`` only if it passes
    every layer, in this order: it is not a known interactor, it resolves to
    a STRING identifier and passes the STRING check, it is not annotated to
    an excluded cellular compartment, and its tissue-specificity text
    mentions no excluded tissue.

    All remote lookups are best-effort: a failed or empty response makes the
    corresponding check fail, so the candidate is rejected rather than
    accepted.
    """

    # Mapping file read by default. Its first row is skipped as a title
    # line; the following header row must contain an 'Entry' column.
    DEFAULT_INTERACTORS_PATH = "data/gene_to_uniprot_mappings.csv"

    def __init__(self, known_interactors_path: str = DEFAULT_INTERACTORS_PATH):
        """
        Set up API endpoints, load known interactors and define exclusions.

        Args:
            known_interactors_path: CSV file holding the known interactors
                (see ``_load_known_interactors``). Defaults to the path the
                notebook has always used.
        """
        # API endpoints
        self.string_api_url = "https://string-db.org/api"
        self.uniprot_api_url = "https://rest.uniprot.org"

        # Load known interactors
        self.known_interactors: Set[str] = self._load_known_interactors(
            known_interactors_path
        )

        # Define relevant compartments to exclude
        self.exclude_compartments = {
            'cell membrane', 'plasma membrane', 'membrane', 'cell surface',
            'extracellular', 'secreted', 'extracellular space',
            'synapse', 'synaptic', 'growth cone', 'neurite'
        }

        # Define relevant tissues to exclude
        self.exclude_tissues = {
            'brain', 'cortex', 'neuron', 'neural', 'cerebrospinal',
            'astrocyte', 'glial', 'hippocampus', 'cerebellum'
        }

    def _load_known_interactors(self, path: str = DEFAULT_INTERACTORS_PATH) -> Set[str]:
        """
        Load the known interactor accessions from a CSV file.

        Args:
            path: CSV file whose first row is a title line (skipped) and
                whose header row contains an 'Entry' column.

        Returns:
            The distinct non-missing 'Entry' values, or an empty set if the
            file cannot be read or parsed (the error is printed).
        """
        try:
            # Skip the first row which contains "gene_to_uniprot_mappings"
            df = pd.read_csv(path, skiprows=[0])
            # Drop missing entries so NaN never ends up in the set.
            return set(df['Entry'].dropna().unique())
        except Exception as e:
            print(f"Error loading known interactors: {e}")
            return set()

    def get_string_id(self, protein_id: str) -> str:
        """
        Convert a UniProt ID to a STRING ID.

        Args:
            protein_id: Identifier sent to STRING's ``get_string_ids``.

        Returns:
            The ``stringId`` of the first match, or ``""`` when the request
            fails, returns a non-200 status, or yields no match.
        """
        try:
            response = requests.get(
                f"{self.string_api_url}/json/get_string_ids",
                params={
                    'identifiers': protein_id,
                    'species': 9606,  # Human
                },
                timeout=10
            )

            if response.status_code == 200:
                data = response.json()
                if data:
                    return data[0]['stringId']
        except Exception as e:
            print(f"Error getting STRING ID for {protein_id}: {e}")
        return ""

    def get_string_interactors(self, protein_id: str, string_id: str = "") -> Set[str]:
        """
        Get the STRING IDs of all interaction partners of a protein.

        Args:
            protein_id: UniProt ID of the protein; used for the ID lookup
                and in error messages.
            string_id: Already-resolved STRING ID of ``protein_id``. When
                empty (the default) it is looked up with ``get_string_id``;
                passing it avoids a second identical request.

        Returns:
            The ``stringId_B`` values reported by ``interaction_partners``
            with ``required_score`` 400. Empty if the STRING ID cannot be
            resolved; possibly incomplete if the request fails part-way.
        """
        interactors: Set[str] = set()
        string_id = string_id or self.get_string_id(protein_id)

        if not string_id:
            return interactors

        try:
            response = requests.get(
                f"{self.string_api_url}/json/interaction_partners",
                params={
                    'identifier': string_id,
                    'species': 9606,  # Human
                    'required_score': 400  # Medium confidence
                },
                timeout=10
            )

            if response.status_code == 200:
                for interaction in response.json():
                    interactors.add(interaction['stringId_B'])

        except Exception as e:
            print(f"Error getting STRING interactors for {protein_id}: {e}")

        return interactors

    def _fetch_uniprot_comments(
        self, protein_id: str, fields: str, comment_type: str
    ) -> Tuple[bool, List[Dict]]:
        """
        Fetch one kind of UniProt comment for an accession.

        Args:
            protein_id: Accession placed in the ``accession:`` query.
            fields: Value of the ``fields`` request parameter.
            comment_type: ``commentType`` value to keep.

        Returns:
            ``(found, comments)``. ``found`` is False when the status is not
            200 or the response has no results; otherwise it is True and
            ``comments`` holds the matching comment dicts (possibly none).

        Raises:
            Whatever ``requests`` or JSON decoding raise; callers handle it
            so that each can print its own error message.
        """
        response = requests.get(
            f"{self.uniprot_api_url}/uniprotkb/search",
            params={
                'query': f'accession:{protein_id}',
                'format': 'json',
                'fields': fields
            },
            headers={
                'Accept': 'application/json',
                'User-Agent': 'Python/3.8 (Contact: risa@bioberry.ai)'
            },
            timeout=10
        )

        if response.status_code != 200:
            return False, []

        results = response.json().get('results')
        if not results:
            return False, []

        comments = [
            comment
            for result in results
            for comment in result.get('comments', [])
            if comment.get('commentType') == comment_type
        ]
        return True, comments

    def check_location(self, protein_id: str) -> bool:
        """
        Check that a protein is not annotated to an excluded compartment.

        Returns:
            True if none of the protein's subcellular location values is in
            ``exclude_compartments`` (including when it has no location
            annotation at all). False if one matches, or if the lookup
            fails or returns no entry.
        """
        try:
            found, comments = self._fetch_uniprot_comments(
                protein_id,
                'annotation_score,cc_subcellular_location',
                'SUBCELLULAR LOCATION',
            )
            if not found:
                return False

            # Extract locations from response
            locations = set()
            for comment in comments:
                for loc in comment.get('subcellularLocations', []):
                    if loc.get('location', {}).get('value'):
                        locations.add(loc['location']['value'].lower())

            # Matching is exact on the lower-cased value: a location that
            # merely contains an excluded term is not rejected.
            # NOTE(review): confirm exact matching (rather than substring
            # matching, as used for tissues) is intended.
            return locations.isdisjoint(self.exclude_compartments)

        except Exception as e:
            print(f"Error checking location: {e}")
            return False

    def check_tissue_expression(self, protein_id: str) -> bool:
        """
        Check that a protein's tissue annotation mentions no excluded tissue.

        Returns:
            True if no term in ``exclude_tissues`` occurs in the protein's
            tissue-specificity text (including when there is no such text).
            False if one occurs, or if the lookup fails or returns no entry.
        """
        try:
            found, comments = self._fetch_uniprot_comments(
                protein_id, 'cc_tissue_specificity', 'TISSUE SPECIFICITY'
            )
            if not found:
                return False

            # Extract tissue text
            fragments = []
            for comment in comments:
                for text in comment.get('texts', []):
                    if text.get('value'):
                        fragments.append(text['value'].lower())
            tissue_text = " ".join(fragments)

            # Substring test: an excluded term matches anywhere in the text,
            # so e.g. 'cortex' also matches "adrenal cortex".
            return not any(tissue in tissue_text for tissue in self.exclude_tissues)

        except Exception as e:
            print(f"Error checking tissue expression: {e}")
            return False

    def is_suitable_negative(self, protein_id: str) -> bool:
        """
        Determine whether a protein is suitable for the negative dataset.

        Checks run in order and stop at the first failure: known
        interactor, STRING ID resolution, STRING check, cellular location,
        tissue expression.

        Returns:
            True only if every check passes.
        """
        # Check if protein is a known interactor
        if protein_id in self.known_interactors:
            return False

        # Check STRING network
        string_id = self.get_string_id(protein_id)
        if not string_id:  # If we can't get STRING ID, be conservative and exclude
            return False

        # Reuse the resolved ID so it is not requested a second time.
        string_interactors = self.get_string_interactors(protein_id, string_id)
        # NOTE(review): this tests the protein's own STRING ID against its
        # own partner list, so it is not expected to reject anything. The
        # intended comparison is presumably against the target protein's
        # partners, but no target is defined in this class - confirm.
        if string_id in string_interactors:  # Compare STRING IDs
            return False

        # Check cellular location
        if not self.check_location(protein_id):
            return False

        # Check tissue expression
        if not self.check_tissue_expression(protein_id):
            return False

        return True

    def _exclude_known_interactors(self, proteins: List[str]) -> List[str]:
        """Return ``proteins`` without known interactors, order preserved."""
        return [p for p in proteins if p not in self.known_interactors]

    def get_human_proteome(self) -> List[str]:
        """
        Get the human proteins to analyze from UniProt.

        Streams accessions matching ``organism_id:9606 AND reviewed:true``
        and removes the known interactors.

        Returns:
            Accessions left after removing known interactors, or an empty
            list if the request or parsing fails (the error is printed).
        """
        try:
            print("\nFetching human proteins from UniProt...")
            response = requests.get(
                f"{self.uniprot_api_url}/uniprotkb/stream",
                params={
                    'format': 'tsv',
                    'fields': 'accession,gene_names',
                    'query': 'organism_id:9606 AND reviewed:true'
                },
                timeout=30
            )

            if response.status_code != 200:
                print("Error fetching proteins from UniProt")
                return []

            df = pd.read_csv(StringIO(response.text), sep='\t')
            all_proteins = df['Entry'].tolist()

            # Remove known interactors
            proteins_to_analyze = self._exclude_known_interactors(all_proteins)

            # Report how many entries were actually dropped, which can be
            # fewer than the size of the known-interactor set.
            removed = len(all_proteins) - len(proteins_to_analyze)
            print(f"Total proteins: {len(all_proteins)}")
            print(f"Known interactors removed: {removed}")
            print(f"Proteins to analyze: {len(proteins_to_analyze)}")

            return proteins_to_analyze

        except Exception as e:
            print(f"Error getting human proteome: {e}")
            return []

## Demonstration

Let's demonstrate our filtering pipeline with a small test batch:

In [None]:
# Run the filtering pipeline on a small batch and summarise the outcome.
from collections import Counter

# Number of proteins taken from the start of the proteome for this demo.
BATCH_SIZE = 50

# Initialize the filter
protein_filter = ProteinFilter()

# Run a test batch
print(f"Running test batch with {BATCH_SIZE} proteins...")
test_proteins = protein_filter.get_human_proteome()[:BATCH_SIZE]

if not test_proteins:
    # get_human_proteome() returns [] when the fetch fails.
    print("No proteins retrieved; nothing to analyze.")

results = {
    "suitable_negatives": [],
    "rejected": [],
    "rejection_reasons": {}
}

for protein_id in test_proteins:
    print(f"\nAnalyzing {protein_id}...")

    if protein_filter.is_suitable_negative(protein_id):
        results["suitable_negatives"].append(protein_id)
        print(f" {protein_id} - SUITABLE")
        continue

    results["rejected"].append(protein_id)
    print(f" {protein_id} - REJECTED")

    # Get rejection reasons by re-running the individual checks (this
    # repeats their network requests). "STRING network excluded" is the
    # fallback when none of the other checks explains the rejection.
    if protein_id in protein_filter.known_interactors:
        reason = "Known interactor"
    elif not protein_filter.check_location(protein_id):
        reason = "Location excluded"
    elif not protein_filter.check_tissue_expression(protein_id):
        reason = "Tissue expression excluded"
    else:
        reason = "STRING network excluded"
    results["rejection_reasons"][protein_id] = reason

n_tested = len(test_proteins)
n_suitable = len(results['suitable_negatives'])
# Guard the percentage: an empty batch would otherwise divide by zero.
success_rate = n_suitable / n_tested * 100 if n_tested else 0.0

print("\n" + "="*50)
print("FILTERING RESULTS")
print("="*50)
print(f"Proteins tested: {n_tested}")
print(f"Suitable negatives found: {n_suitable}")
print(f"Rejected: {len(results['rejected'])}")
print(f"Success rate: {success_rate:.1f}%")

print("\nRejection reasons:")
reason_counts = Counter(results["rejection_reasons"].values())
for reason, count in reason_counts.items():
    print(f"  {reason}: {count}")

## Visualization

Let's visualize the filtering results:

In [None]:
# Create visualization: a pie chart of accepted vs. rejected proteins and a
# bar chart of rejection reasons, followed by a short text summary.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

n_tested = len(test_proteins)
n_suitable = len(results['suitable_negatives'])
n_rejected = len(results['rejected'])

# Overall results
labels = ['Suitable Negatives', 'Rejected']
sizes = [n_suitable, n_rejected]
colors = ['#2ecc71', '#e74c3c']

if sum(sizes) > 0:
    ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
else:
    # With nothing analyzed there are no wedges to draw; say so instead of
    # handing all-zero sizes to pie().
    ax1.text(0.5, 0.5, 'No proteins analyzed',
             ha='center', va='center', transform=ax1.transAxes)
ax1.set_title('Protein Filtering Results', fontsize=14, fontweight='bold')

# Rejection reasons
if results['rejection_reasons']:
    reason_counts = {}
    for reason in results['rejection_reasons'].values():
        reason_counts[reason] = reason_counts.get(reason, 0) + 1

    reasons = list(reason_counts.keys())
    counts = list(reason_counts.values())

    ax2.bar(reasons, counts, color='#e74c3c', alpha=0.7)
    ax2.set_title('Rejection Reasons', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Count')
    ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Guard the percentage: an empty batch would otherwise divide by zero.
success_rate = n_suitable / n_tested * 100 if n_tested else 0.0

print("\nFiltering Pipeline Summary:")
print(f"- Total proteins analyzed: {n_tested}")
print(f"- High-quality negatives found: {n_suitable}")
print(f"- Filtering success rate: {success_rate:.1f}%")
print("\nThis demonstrates our systematic approach to creating")
print("high-quality negative datasets for machine learning.")