# Retrieve UniProt Sequences

### Ayush Noori

Import required libraries.

In [3]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Directory management
from pathlib import Path
import os

# Get sequence data
from getSequence import getseq

# Resolve the output folder relative to the working directory:
# two levels above it, then Data/AF2_sequences
cwd = Path.cwd()
save_dir = cwd.parents[1].joinpath('Data', 'AF2_sequences')

Define the UniProt IDs for each experimental condition and control.

In [4]:
# UniProt IDs that are reused across several comparisons
GFAP_id = 'P14136'
VIM_id = 'P08670'


def _with_label(comparisons, label):
    """Return a copy of `comparisons` whose values are (uniprot_ids, label) pairs."""
    return {name: (ids, label) for name, ids in comparisons.items()}


# Experimental conditions, labelled 'exp'
experimental_conditions = _with_label(
    {
        'GFAP-RAB10': (GFAP_id, 'P61026'),
        'GFAP-RAB7A': (GFAP_id, 'P51149'),
        'VIM-RAB10': (VIM_id, 'P61026'),
        'GFAP-GFAP-GFAP-GFAP': (GFAP_id,) * 4,
        'VIM-VIM-VIM-VIM': (VIM_id,) * 4,
        'GFAP-GFAP-VIM-VIM': (GFAP_id,) * 2 + (VIM_id,) * 2,
    },
    'exp',
)

# Positive controls, labelled 'pos'
positive_controls = _with_label(
    {
        'GFAP-GFAP': (GFAP_id,) * 2,
        'GFAP-VIM': (GFAP_id, VIM_id),
        'GFAP-CRYAB': (GFAP_id, 'P02511'),
        'GFAP-HSPB1': (GFAP_id, 'P04792'),
        'GFAP-LAMP2': (GFAP_id, 'P13473'),
        'VIM-RAB5A': (VIM_id, 'P20339'),
    },
    'pos',
)

# Negative controls, labelled 'neg'
negative_controls = _with_label(
    {
        'GFAP-OLIG2': (GFAP_id, 'Q13516'),
        'GFAP-AIF1': (GFAP_id, 'P55008'),
        'GFAP-RBFOX3': (GFAP_id, 'A6NFN3'),
    },
    'neg',
)

# Merge the three groups into a single lookup, keeping the order
# experimental -> positive -> negative
af2_experiments = {}
af2_experiments.update(experimental_conditions)
af2_experiments.update(positive_controls)
af2_experiments.update(negative_controls)

Retrieve sequences and write one FASTA file per comparison.

In [5]:
# Make sure the output directory exists before any file is opened for writing
save_dir.mkdir(parents=True, exist_ok=True)

# Cache of getseq results keyed by UniProt ID, so that IDs which recur within
# or across comparisons (e.g., GFAP_id) are only looked up once
# NOTE(review): assumes getseq returns the same result for repeated calls
# with the same ID -- confirm against the getSequence documentation
sequence_cache = {}

# Iterate over all experiments (experimental conditions and controls)
for comparison, (uniprot_ids, label) in af2_experiments.items():

    # Get all sequences before opening the file, so that a failed lookup
    # does not leave an empty or truncated FASTA file behind
    for uniprot_id in uniprot_ids:
        if uniprot_id not in sequence_cache:
            sequence_cache[uniprot_id] = getseq(uniprot_id, uniprot_id=True)

    # Save sequences to <comparison>-<label>.fasta
    with open(save_dir / f'{comparison}-{label}.fasta', 'w', encoding='utf-8') as f:

        # Write one record per UniProt ID; repeated IDs are written repeatedly
        for uniprot_id in uniprot_ids:

            # seq[0] is used as the header text and seq[1] as the sequence
            seq = sequence_cache[uniprot_id]

            # Write header to file
            f.write(f'>{seq[0]}\n')

            # Divide sequence into chunks of 60 characters
            for i in range(0, len(seq[1]), 60):
                f.write(f'{seq[1][i:i+60]}\n')

# Visualize PDB Structures