In [None]:
import requests
import gzip
import json
import pandas as pd
from io import StringIO, BytesIO
from pathlib import Path
import time
from tqdm import tqdm
import logging
import os

In [None]:
class UniProtAPIHandler:
    """
    Comprehensive handler for UniProt REST API with support for different data formats

    Downloads whole-proteome FASTA data through the ``/uniprotkb/stream``
    endpoint, splits it into records and extracts the fields encoded in
    UniProt-style FASTA headers.

    Attributes:
        base_url: Root URL of the UniProt REST API.
        session: ``requests.Session`` reused for every download.
        rate_limit: Value passed to the constructor. It is only stored; no
            method of this class reads it.
        proteome_ids: Mapping of species key -> UniProt proteome identifier.
    """

    # KEY=value tags that may follow the protein name in a FASTA header.
    _HEADER_TAGS = ("OS", "OX", "GN", "PE", "SV")

    def __init__(self, rate_limit=2):
        self.base_url = "https://rest.uniprot.org"
        self.session = requests.Session()
        self.rate_limit = rate_limit

        # Proteome IDs for your target species
        self.proteome_ids = {
            "human": "UP000005640",
            "fruit_fly": "UP000000803",
            "e_coli": "UP000000625",
            "mouse": "UP000000589",
            "yeast": "UP000002311"
        }

    def get_proteome_fasta(self, proteome_id, compressed=True, chunk_size=8192):
        """
        Download FASTA sequences for entire proteome

        Args:
            proteome_id: UniProt proteome identifier used in the query.
            compressed: Request a gzip payload and decompress it locally.
            chunk_size: Number of bytes read per chunk of the compressed
                response body.

        Returns:
            The FASTA text as ``str``, or ``None`` when the request fails or
            the compressed payload cannot be decompressed.
        """
        # Construct API URL
        if compressed:
            api_url = f"{self.base_url}/uniprotkb/stream?compressed=true&format=fasta&query=(proteome:{proteome_id})"
        else:
            api_url = f"{self.base_url}/uniprotkb/stream?format=fasta&query=(proteome:{proteome_id})"

        try:
            print(f"Downloading proteome {proteome_id}...")
            # Use the shared session and close the streamed response on exit.
            with self.session.get(api_url, stream=True, timeout=300) as response:
                response.raise_for_status()

                if compressed:
                    # Join once instead of growing a bytes object per chunk.
                    compressed_data = b"".join(
                        response.iter_content(chunk_size=chunk_size)
                    )
                    fasta_content = gzip.decompress(compressed_data).decode('utf-8')
                else:
                    fasta_content = response.text

        except requests.exceptions.RequestException as e:
            print(f"Error downloading proteome {proteome_id}: {e}")
            return None
        except (OSError, EOFError) as e:
            # Corrupt gzip data raises OSError; a truncated stream raises EOFError.
            print(f"Error decompressing proteome {proteome_id}: {e}")
            return None

        print(f"Downloaded {len(fasta_content):,} characters of FASTA data")
        return fasta_content

    def parse_fasta_content(self, fasta_content):
        """
        Parse FASTA content into structured data

        Args:
            fasta_content: FASTA text; ``None`` or an empty string yields an
                empty list.

        Returns:
            A list of dicts with keys ``header`` (header line without the
            leading ``>``), ``sequence`` (sequence lines joined together) and
            ``length``. Records with an empty header or no sequence lines are
            dropped, as are sequence lines that precede the first header.
        """
        if not fasta_content:
            return []

        def make_record(header, chunks):
            # Join once and reuse the result for the length.
            sequence = ''.join(chunks)
            return {'header': header, 'sequence': sequence, 'length': len(sequence)}

        sequences = []
        current_header = None
        current_sequence = []

        for line in fasta_content.split('\n'):
            line = line.strip()
            if not line:
                continue

            if line.startswith('>'):
                # Save previous sequence if exists
                if current_header and current_sequence:
                    sequences.append(make_record(current_header, current_sequence))

                # Start new sequence
                current_header = line[1:]  # Remove '>'
                current_sequence = []
            else:
                current_sequence.append(line)

        # Don't forget the last sequence
        if current_header and current_sequence:
            sequences.append(make_record(current_header, current_sequence))

        return sequences

    def extract_uniprot_info(self, fasta_header):
        """
        Extract structured information from FASTA header

        The header is expected to look like
        ``sp|P04637|P53_HUMAN Protein name OS=Organism OX=9606 GN=TP53 PE=1 SV=4``.

        Args:
            fasta_header: Header line without the leading ``>``.

        Returns:
            A dict with the keys ``uniprot_id``, ``entry_name``,
            ``protein_name``, ``organism``, ``organism_id``, ``gene_name``,
            ``protein_existence`` and ``sequence_version``. Fields that are
            absent from the header stay as empty strings. A non-string header
            prints a warning and returns all-empty fields.
        """
        info = {
            'uniprot_id': '',
            'entry_name': '',
            'protein_name': '',
            'organism': '',
            'organism_id': '',
            'gene_name': '',
            'protein_existence': '',
            'sequence_version': ''
        }

        try:
            parts = fasta_header.split()
        except AttributeError as e:
            print(f"Warning: Error parsing header: {e}")
            return info

        if not parts:
            return info

        # Extract UniProt ID and entry name from "sp|P04637|P53_HUMAN"
        components = parts[0].split('|')
        if len(components) >= 3:
            info['uniprot_id'] = components[1]
            info['entry_name'] = components[2]

        # The leading space lets " TAG=" also match a tag that opens the
        # description, i.e. a header with no protein name.
        description = ' ' + ' '.join(parts[1:])

        def tag_start(tag, begin=0):
            return description.find(f" {tag}=", begin)

        def single_token(tag):
            # Value of a tag whose content runs up to the next space.
            position = tag_start(tag)
            if position == -1:
                return ''
            value_start = position + len(tag) + 2
            value_end = description.find(' ', value_start)
            if value_end == -1:
                value_end = len(description)
            return description[value_start:value_end]

        info['organism_id'] = single_token('OX')
        info['gene_name'] = single_token('GN')
        info['protein_existence'] = single_token('PE')
        info['sequence_version'] = single_token('SV')

        organism_at = tag_start('OS')
        if organism_at == -1:
            # No organism tag: the whole description is kept as the name.
            info['protein_name'] = description.strip()
        else:
            value_start = organism_at + 4
            # Organism names contain spaces, so the value runs to the next tag.
            following = [
                tag_start(tag, value_start)
                for tag in self._HEADER_TAGS
                if tag != 'OS'
            ]
            following = [position for position in following if position != -1]
            value_end = min(following) if following else len(description)
            info['organism'] = description[value_start:value_end].strip()
            # Protein name (everything before OS=)
            info['protein_name'] = description[:organism_at].strip()

        return info

    def get_proteome_data(self, species_name, max_sequences=None, save_to_file=True,
                          min_length=20):
        """
        Get complete proteome data with parsed information

        Args:
            species_name: Key of ``self.proteome_ids``.
            max_sequences: Maximum number of proteins to return; ``None`` or
                ``0`` means no limit. The limit is applied after the length
                filter, so the result is not shortened by filtered records.
            save_to_file: Write the result to
                ``<species_name>_proteome_data.json`` in the working directory.
            min_length: Sequences shorter than this are skipped.

        Returns:
            A list of per-protein dicts, or ``None`` when the download
            returned nothing.

        Raises:
            ValueError: If ``species_name`` is unknown or ``max_sequences``
                is negative.
        """
        if species_name not in self.proteome_ids:
            raise ValueError(f"Unknown species: {species_name}. Available: {list(self.proteome_ids.keys())}")
        if max_sequences is not None and max_sequences < 0:
            raise ValueError(f"max_sequences must be non-negative, got {max_sequences}")

        proteome_id = self.proteome_ids[species_name]

        # Download FASTA data
        fasta_content = self.get_proteome_fasta(proteome_id, compressed=True)

        if not fasta_content:
            return None

        # Parse FASTA content
        print("Parsing FASTA sequences...")
        sequences = self.parse_fasta_content(fasta_content)

        # Extract detailed information
        print("Extracting protein information...")

        # Filter out very short sequences first, then apply the limit.
        sequences_to_process = [
            seq_data for seq_data in sequences if seq_data['length'] >= min_length
        ]
        if max_sequences:
            sequences_to_process = sequences_to_process[:max_sequences]

        proteome_data = []
        for seq_data in tqdm(sequences_to_process, desc="Processing sequences"):
            # Extract information from header
            protein_info = self.extract_uniprot_info(seq_data['header'])

            # Combine all information
            proteome_data.append({
                'species': species_name,
                'proteome_id': proteome_id,
                'uniprot_id': protein_info['uniprot_id'],
                'entry_name': protein_info['entry_name'],
                'protein_name': protein_info['protein_name'],
                'gene_name': protein_info['gene_name'],
                'organism': protein_info['organism'],
                'organism_id': protein_info['organism_id'],
                'protein_existence': protein_info['protein_existence'],
                'sequence_version': protein_info['sequence_version'],
                'sequence': seq_data['sequence'],
                'sequence_length': seq_data['length'],
                'full_header': seq_data['header']
            })

        print(f"Processed {len(proteome_data)} protein sequences for {species_name}")

        # Save to file if requested
        if save_to_file:
            output_file = f"{species_name}_proteome_data.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(proteome_data, f, indent=2)
            print(f"Data saved to {output_file}")

        return proteome_data

In [None]:
# Create the handler that the later cells rely on.
print("Initializing UniProt API Handler...")
handler = UniProtAPIHandler(rate_limit=2)
print(f"Handler initialized for species: {list(handler.proteome_ids.keys())}")

# Step 4: smoke-test the handler on a small E. coli sample
print("\nTesting with E. coli sample...")
try:
    ecoli_sample = handler.get_proteome_data("e_coli", max_sequences=50)

    if not ecoli_sample:
        # None (download failed) or an empty list both end up here.
        print("Failed to download E. coli data")
    else:
        print(f"Successfully downloaded {len(ecoli_sample)} E. coli proteins")

        # A DataFrame makes the summary statistics one-liners.
        df = pd.DataFrame(ecoli_sample)
        print("\nQuick analysis:")
        print(f"Average sequence length: {df['sequence_length'].mean():.1f}")
        print(f"Sequence length range: {df['sequence_length'].min()} - {df['sequence_length'].max()}")
        print(f"Unique gene names: {df['gene_name'].nunique()}")

        # Print the first record as a worked example.
        sample = ecoli_sample[0]
        print(f"\n Sample protein:")
        print(f"  UniProt ID: {sample['uniprot_id']}")
        print(f"  Gene name: {sample['gene_name']}")
        print(f"  Protein name: {sample['protein_name'][:60]}...")
        print(f"  Sequence length: {sample['sequence_length']}")
        print(f"  Sequence preview: {sample['sequence'][:60]}...")

except Exception as e:
    print(f"Error: {e}")

# Closing guidance, emitted as one newline-joined print.
print("\n".join((
    "\nReady to proceed with full data collection!",
    "Next steps:",
    "1. Run handler.get_proteome_data('species_name', max_sequences=N) for each species",
    "2. Adjust max_sequences based on your needs",
    "3. Combine all species data for your ML pipeline",
)))

Initializing UniProt API Handler...
Handler initialized for species: ['human', 'fruit_fly', 'e_coli', 'mouse', 'yeast']

Testing with E. coli sample...
Downloading proteome UP000000625...
Downloaded 1,890,840 characters of FASTA data
Parsing FASTA sequences...
Extracting protein information...


Processing sequences: 100%|██████████| 50/50 [00:00<00:00, 57582.43it/s]

Processed 50 protein sequences for e_coli
Data saved to e_coli_proteome_data.json
Successfully downloaded 50 E. coli proteins

Quick analysis:
Average sequence length: 516.4
Sequence length range: 31 - 1073
Unique gene names: 50

 Sample protein:
  UniProt ID: A5A616
  Gene name: mgtS
  Protein name: Small protein MgtS...
  Sequence length: 31
  Sequence preview: MLGNMNVFMAVLGIILFSGFLAAYFSHKWDD...

Ready to proceed with full data collection!
Next steps:
1. Run handler.get_proteome_data('species_name', max_sequences=N) for each species
2. Adjust max_sequences based on your needs
3. Combine all species data for your ML pipeline





In [None]:
def collect_all_species_data(handler, species_targets=None, delay=2):
    """
    Collect protein data for all target species

    Args:
        handler: Object exposing
            ``get_proteome_data(species, max_sequences=..., save_to_file=...)``.
        species_targets: Mapping of species key -> maximum number of proteins.
            ``None`` selects the built-in five-species defaults.
        delay: Seconds to pause between two consecutive species. No pause is
            made before the first or after the last one; ``0`` disables it.

    Returns:
        ``(all_protein_data, collection_summary)``. The first maps each
        successfully collected species to its list of protein dicts. The
        second maps every requested species to either its statistics
        (``count``, ``avg_length``, ``min_length``, ``max_length``,
        ``download_time``) or ``{'count': 0, 'error': ...}``.
    """

    # Default targets (adjust based on your computational resources)
    if species_targets is None:
        species_targets = {
            "e_coli": 400,      # Smallest - good for testing
            "yeast": 600,       # Small-medium
            "fruit_fly": 800,   # Medium
            "mouse": 1000,      # Large
            "human": 1200       # Largest
        }

    print("Starting data collection for all species...")
    print(f"Targets: {species_targets}")

    all_protein_data = {}
    collection_summary = {}

    for index, (species, max_count) in enumerate(species_targets.items()):
        # Small delay between species; pointless before the first request.
        if index and delay > 0:
            time.sleep(delay)

        print(f"\n{'='*60}")
        print(f"PROCESSING: {species.upper()}")
        print(f"Target: {max_count} proteins")
        print(f"{'='*60}")

        try:
            # perf_counter is monotonic, so the elapsed time cannot go
            # negative if the wall clock is adjusted mid-download.
            start_time = time.perf_counter()

            # Download data
            protein_data = handler.get_proteome_data(
                species,
                max_sequences=max_count,
                save_to_file=True
            )

            elapsed = time.perf_counter() - start_time

            if protein_data:
                all_protein_data[species] = protein_data

                # Calculate statistics
                lengths = [p['sequence_length'] for p in protein_data]
                collection_summary[species] = {
                    'count': len(protein_data),
                    'avg_length': sum(lengths) / len(lengths),
                    'min_length': min(lengths),
                    'max_length': max(lengths),
                    'download_time': elapsed
                }

                print(f"{species}: {len(protein_data)} proteins collected")
                print(f"   Average length: {collection_summary[species]['avg_length']:.1f}")
                print(f"   Time taken: {collection_summary[species]['download_time']:.1f} seconds")
            else:
                print(f" Failed to collect {species} data")
                # None is a failed download; an empty list means the handler
                # ran but produced no proteins.
                reason = 'Download failed' if protein_data is None else 'No sequences returned'
                collection_summary[species] = {'count': 0, 'error': reason}

        except Exception as e:
            # Keep going: one failing species must not abort the others.
            print(f"Error processing {species}: {e}")
            collection_summary[species] = {'count': 0, 'error': str(e)}

    # Print final summary
    print(f"\n{'='*60}")
    print("DATA COLLECTION COMPLETE!")
    print(f"{'='*60}")

    total_proteins = 0
    for species, summary in collection_summary.items():
        count = summary.get('count', 0)
        total_proteins += count

        if count > 0:
            print(f"{species:<12}: {count:>5,} proteins (avg length: {summary['avg_length']:>6.1f})")
        else:
            error = summary.get('error', 'Unknown error')
            print(f"{species:<12}: Failed - {error}")

    print(f"\nTOTAL COLLECTED: {total_proteins:,} proteins across {len(all_protein_data)} species")

    return all_protein_data, collection_summary

def create_combined_dataset(all_protein_data):
    """
    Merge the per-species protein lists into one dataset and save it.

    ``all_protein_data`` maps a species key to a list of protein dicts.
    Sequences and their metadata are collected into two parallel lists, in
    iteration order. The result is written to
    ``combined_protein_dataset.json`` in the working directory and returned.
    """
    print("\n Creating combined dataset...")

    combined_sequences, combined_metadata = [], []

    for species, protein_list in all_protein_data.items():
        print(f"  Adding {len(protein_list)} proteins from {species}...")

        for record in protein_list:
            combined_sequences.append(record['sequence'])
            # Only the fields needed downstream are carried over.
            combined_metadata.append(dict(
                species=species,
                uniprot_id=record['uniprot_id'],
                gene_name=record['gene_name'],
                protein_name=record['protein_name'],
                length=record['sequence_length'],
                organism=record['organism'],
            ))

    # Bookkeeping about this collection run.
    collection_info = dict(
        total_sequences=len(combined_sequences),
        species_count=len(all_protein_data),
        collection_date=time.strftime('%Y-%m-%d %H:%M:%S'),
    )

    final_dataset = dict(
        sequences=combined_sequences,
        metadata=combined_metadata,
        collection_info=collection_info,
    )

    # Persist the combined dataset.
    with open('combined_protein_dataset.json', 'w') as f:
        json.dump(final_dataset, f, indent=2)

    print(f" Combined dataset created: {len(combined_sequences):,} sequences")
    print(f" Saved to: combined_protein_dataset.json")

    return final_dataset

def analyze_dataset(final_dataset):
    """
    Perform basic analysis of the combined dataset

    Prints overall length statistics, the per-species distribution,
    per-species length statistics and the ten most frequent of the 20
    standard amino acids.

    Args:
        final_dataset: Dict with a ``'metadata'`` list of per-protein dicts
            (each needs ``'species'`` and ``'length'``) and a parallel
            ``'sequences'`` list of strings.

    Returns:
        A DataFrame of the metadata with an added ``sequence`` column. For an
        empty dataset the empty DataFrame is returned after printing the
        total only.
    """
    print("\n DATASET ANALYSIS")
    print("="*50)

    # Create DataFrame for analysis
    df = pd.DataFrame(final_dataset['metadata'])
    df['sequence'] = final_dataset['sequences']

    # Basic statistics
    print(f"Total proteins: {len(df):,}")
    if df.empty:
        # Without rows there are no 'length'/'species' columns to read.
        print("Dataset is empty; skipping statistics.")
        return df

    print(f"Average length: {df['length'].mean():.1f}")
    print(f"Length std: {df['length'].std():.1f}")
    print(f"Length range: {df['length'].min()} - {df['length'].max()}")

    print(f"\n SPECIES DISTRIBUTION:")
    species_counts = df['species'].value_counts()
    for species, count in species_counts.items():
        percentage = (count / len(df)) * 100
        print(f"  {species:<12}: {count:>5,} ({percentage:>5.1f}%)")

    print(f"\n LENGTH STATISTICS BY SPECIES:")
    length_stats = df.groupby('species')['length'].agg(['count', 'mean', 'std', 'min', 'max']).round(1)
    print(length_stats)

    # Amino acid composition analysis
    print(f"\n AMINO ACID COMPOSITION ANALYSIS:")
    all_sequences = ''.join(df['sequence'])
    total_residues = len(all_sequences)
    aa_counts = {}
    for aa in 'ACDEFGHIKLMNPQRSTVWY':
        count = all_sequences.count(aa)
        # Guard against every sequence being empty.
        percentage = (count / total_residues) * 100 if total_residues else 0.0
        aa_counts[aa] = {'count': count, 'percentage': percentage}

    # Sort by frequency
    sorted_aa = sorted(aa_counts.items(), key=lambda x: x[1]['count'], reverse=True)

    print("  Top 10 most frequent amino acids:")
    for aa, stats in sorted_aa[:10]:
        print(f"    {aa}: {stats['count']:>8,} ({stats['percentage']:>5.2f}%)")

    return df

# Main execution function
def run_full_collection(handler=None):
    """
    Run the complete data collection pipeline

    Args:
        handler: Handler passed on to ``collect_all_species_data``. When
            omitted, the module-level global named ``handler`` is used.

    Returns:
        ``(combined_dataset, df_analysis)`` on success. ``(None, None)`` when
        no handler is available or no data was collected, so callers can
        always unpack two values.
    """
    print("STARTING FULL PROTEIN DATA COLLECTION PIPELINE")
    print("="*60)

    # Make sure handler is initialized
    if handler is None:
        handler = globals().get('handler')
    if handler is None:
        print(" Error: UniProtAPIHandler not found. Please run the previous code block first.")
        return None, None

    # Step 1: Collect data for all species
    all_data, _summary = collect_all_species_data(handler)

    if not all_data:
        print(" No data collected. Aborting.")
        return None, None

    # Step 2: Create combined dataset
    combined_dataset = create_combined_dataset(all_data)

    # Step 3: Analyze the dataset
    df_analysis = analyze_dataset(combined_dataset)

    print(f"\n PIPELINE COMPLETE!")
    print(f" Individual species files saved (JSON format)")
    print(f" Combined dataset saved: combined_protein_dataset.json")
    print(f" Ready for deep learning pipeline!")

    return combined_dataset, df_analysis

In [None]:
# Run the whole pipeline; on success this binds the combined dataset dict and
# the analysis DataFrame returned by run_full_collection().
combined_dataset, analysis_df = run_full_collection()

STARTING FULL PROTEIN DATA COLLECTION PIPELINE
Starting data collection for all species...
Targets: {'e_coli': 400, 'yeast': 600, 'fruit_fly': 800, 'mouse': 1000, 'human': 1200}

PROCESSING: E_COLI
Target: 400 proteins
Downloading proteome UP000000625...
Downloaded 1,890,840 characters of FASTA data
Parsing FASTA sequences...
Extracting protein information...


Processing sequences: 100%|██████████| 400/400 [00:00<00:00, 40274.66it/s]

Processed 400 protein sequences for e_coli
Data saved to e_coli_proteome_data.json
e_coli: 400 proteins collected
   Average length: 375.9
   Time taken: 1.5 seconds






PROCESSING: YEAST
Target: 600 proteins
Downloading proteome UP000002311...
Downloaded 3,855,264 characters of FASTA data
Parsing FASTA sequences...
Extracting protein information...


Processing sequences: 100%|██████████| 600/600 [00:00<00:00, 34395.52it/s]

Processed 600 protein sequences for yeast
Data saved to yeast_proteome_data.json
yeast: 600 proteins collected
   Average length: 552.0
   Time taken: 2.7 seconds






PROCESSING: FRUIT_FLY
Target: 800 proteins
Downloading proteome UP000000803...
Downloaded 17,595,317 characters of FASTA data
Parsing FASTA sequences...
Extracting protein information...


Processing sequences: 100%|██████████| 800/800 [00:00<00:00, 51642.07it/s]

Processed 800 protein sequences for fruit_fly
Data saved to fruit_fly_proteome_data.json
fruit_fly: 800 proteins collected
   Average length: 1733.5
   Time taken: 15.4 seconds






PROCESSING: MOUSE
Target: 1000 proteins
Downloading proteome UP000000589...
Downloaded 29,420,930 characters of FASTA data
Parsing FASTA sequences...
Extracting protein information...


Processing sequences: 100%|██████████| 1000/1000 [00:00<00:00, 122115.59it/s]

Processed 1000 protein sequences for mouse
Data saved to mouse_proteome_data.json
mouse: 1000 proteins collected
   Average length: 964.8
   Time taken: 30.4 seconds






PROCESSING: HUMAN
Target: 1200 proteins
Downloading proteome UP000005640...
Downloaded 40,649,282 characters of FASTA data
Parsing FASTA sequences...
Extracting protein information...


Processing sequences: 100%|██████████| 1200/1200 [00:00<00:00, 26482.68it/s]

Processed 1199 protein sequences for human
Data saved to human_proteome_data.json





human: 1199 proteins collected
   Average length: 680.1
   Time taken: 30.9 seconds

DATA COLLECTION COMPLETE!
e_coli      :   400 proteins (avg length:  375.9)
yeast       :   600 proteins (avg length:  552.0)
fruit_fly   :   800 proteins (avg length: 1733.5)
mouse       : 1,000 proteins (avg length:  964.8)
human       : 1,199 proteins (avg length:  680.1)

TOTAL COLLECTED: 3,999 proteins across 5 species

 Creating combined dataset...
  Adding 400 proteins from e_coli...
  Adding 600 proteins from yeast...
  Adding 800 proteins from fruit_fly...
  Adding 1000 proteins from mouse...
  Adding 1199 proteins from human...
 Combined dataset created: 3,999 sequences
 Saved to: combined_protein_dataset.json

 DATASET ANALYSIS
Total proteins: 3,999
Average length: 912.4
Length std: 1385.8
Length range: 31 - 35213

 SPECIES DISTRIBUTION:
  human       : 1,199 ( 30.0%)
  mouse       : 1,000 ( 25.0%)
  fruit_fly   :   800 ( 20.0%)
  yeast       :   600 ( 15.0%)
  e_coli      :   400 ( 10.0%)

