# Overlap
Some sequences from the training data can significantly overlap with data from the kinases dataset. Let's fix this data leakage.

## Prepare data for mmseqs2
Extract the sequences from the training data and from the kinase data into a single FASTA file.

In [2]:
# Write the training sequences to a fresh .fasta file.
# Each ';'-separated row contributes one record: column 0 is used as the ID
# (suffixed with '-TRAIN') and column 4 as the sequence.
import csv
with open('/home/vit/Projects/LBS-pLM/data/full-LIGYSIS/full-train.txt', 'r') as train_file, \
        open('/home/vit/Projects/LBS-pLM/data/filtering/sequences.fasta', 'w') as fasta_out:
    # Mode 'w' starts the file from scratch on every run of this cell.
    fasta_out.writelines(
        f'>{record[0]}-TRAIN\n{record[4]}\n'
        for record in csv.reader(train_file, delimiter=';')
    )

## MMSEQS2  
Once the cell below has appended the kinase sequences to `sequences.fasta`, the `run-mseq.sh` script needs to be run. Here, `--min-seq-id` was set to 0.3.

In [None]:
# Read each PDB ID, take the sequence from the mmCIF file and write it to a .fasta file

# Directory handed to rcsb.fetch() in get_sequence() as the target for the CIF files.
CIF_FILES_PATH = '/home/vit/Projects/deeplife-project/data/cif_files'

# Three-letter residue name -> one-letter code. Keys are written with the first
# character upper-case and the rest lower-case; three_to_one() normalises its
# input to that form before the lookup. Besides the standard residues the table
# holds non-standard names (e.g. 'Mse' -> 'M', 'Sep' -> 'S') and some names that
# are explicitly mapped to 'X'.
mapping = {'Aba': 'A', 'Ace': 'X', 'Acr': 'X', 'Ala': 'A', 'Aly': 'K', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cas': 'C',
           'Ccs': 'C', 'Cme': 'C', 'Csd': 'C', 'Cso': 'C', 'Csx': 'C', 'Cys': 'C', 'Dal': 'A', 'Dbb': 'T', 'Dbu': 'T',
           'Dha': 'S', 'Gln': 'Q', 'Glu': 'E', 'Gly': 'G', 'Glz': 'G', 'His': 'H', 'Hse': 'S', 'Ile': 'I', 'Leu': 'L',
           'Llp': 'K', 'Lys': 'K', 'Men': 'N', 'Met': 'M', 'Mly': 'K', 'Mse': 'M', 'Nh2': 'X', 'Nle': 'L', 'Ocs': 'C',
           'Pca': 'E', 'Phe': 'F', 'Pro': 'P', 'Ptr': 'Y', 'Sep': 'S', 'Ser': 'S', 'Thr': 'T', 'Tih': 'A', 'Tpo': 'T',
           'Trp': 'W', 'Tyr': 'Y', 'Unk': 'X', 'Val': 'V', 'Ycm': 'C', 'Sec': 'U', 'Pyl': 'O', 'Mhs': 'H', 'Snm': 'S',
           'Mis': 'S', 'Seb': 'S', 'Hic': 'H', 'Fme': 'M', 'Asb': 'D', 'Sah': 'C', 'Smc': 'C', 'Tpq': 'Y', 'Onl': 'X',
           'Tox': 'W', '5x8': 'X', 'Ddz': 'A'}


def three_to_one(three_letter_code):
    """
    Convert a three-letter residue name to its one-letter code.

    The lookup is case-insensitive: the name is normalised to "first character
    upper-case, remainder lower-case" to match the keys of ``mapping``.

    Args:
        three_letter_code: residue name, e.g. ``'ALA'``, ``'ala'`` or ``'Ala'``.

    Returns:
        The one-letter code from ``mapping``, or ``'X'`` when the name is not
        in the table (including the empty string).
    """
    # Slicing with [:1] instead of indexing with [0] keeps an empty name from
    # raising IndexError; it simply falls through to the 'X' default.
    key = three_letter_code[:1].upper() + three_letter_code[1:].lower()
    return mapping.get(key, 'X')

def get_sequence(pdb_id: str, chain_id: str) -> str:
    """
    Build the one-letter amino acid sequence of one chain of a PDB entry.

    The entry's CIF file is obtained via ``rcsb.fetch`` (with ``CIF_FILES_PATH``
    as the target directory), model 1 is loaded, and every residue of the
    requested chain that has a "CA" carbon atom contributes one letter,
    translated with ``three_to_one``.

    Args:
        pdb_id: PDB identifier passed to ``rcsb.fetch``.
        chain_id: chain identifier compared against the structure's ``chain_id``.

    Returns:
        The concatenated one-letter codes; empty if no atom matches the filter.
    """
    import biotite.database.rcsb as rcsb
    import biotite.structure.io.pdbx as pdbx
    from biotite.structure import get_residues

    cif_path = rcsb.fetch(pdb_id, "cif", CIF_FILES_PATH)
    structure = pdbx.get_structure(pdbx.CIFFile.read(cif_path), model=1)

    # One atom per residue: atoms named "CA" whose element is carbon.
    # NOTE(review): the element check presumably excludes calcium atoms, which
    # can also carry the atom name "CA" - confirm.
    is_alpha_carbon = (structure.atom_name == "CA") & (structure.element == "C")
    in_chain = structure.chain_id == chain_id
    chain_alpha_carbons = structure[is_alpha_carbon & in_chain]

    # get_residues returns (residue ids, residue names); only the names are needed.
    _, residue_names = get_residues(chain_alpha_carbons)
    return ''.join(three_to_one(residue_name) for residue_name in residue_names)

# Append one '-TEST' record per (PDB ID, chain) pair of the kinase list to the
# FASTA file that already holds the training sequences.
with open('/home/vit/Projects/LBS-pLM/data/kinase_pdb_chain_list.csv', 'r') as chain_list_file:
    chain_rows = csv.reader(chain_list_file, delimiter=',')
    next(chain_rows)  # Skip header
    # Mode 'a': records are added after whatever the file already contains.
    with open('/home/vit/Projects/LBS-pLM/data/filtering/sequences.fasta', 'a') as fasta_out:
        for chain_row in chain_rows:
            pdb_id, chain_id = chain_row[0], chain_row[1]
            print(f'Processing {pdb_id} chain {chain_id}')
            fasta_out.write(f'>{pdb_id}{chain_id}-TEST\n{get_sequence(pdb_id, chain_id)}\n')

Processing 1A9U chain A
Processing 1AD5 chain A
Processing 1AD5 chain B
Processing 1AGW chain A
Processing 1AGW chain B
Processing 1AQ1 chain A
Processing 1ATP chain E
Processing 1B38 chain A
Processing 1B39 chain A
Processing 1BKX chain A
Processing 1BL6 chain A
Processing 1BL7 chain A
Processing 1BMK chain A
Processing 1BX6 chain A
Processing 1BYG chain A
Processing 1CDK chain A
Processing 1CDK chain B
Processing 1CKJ chain A
Processing 1CKJ chain B
Processing 1CKP chain A
Processing 1CM8 chain A
Processing 1CM8 chain B
Processing 1CSN chain A
Processing 1DAW chain A
Processing 1DAY chain A
Processing 1DI8 chain A
Processing 1DI9 chain A
Processing 1DM2 chain A
Processing 1DS5 chain A
Processing 1DS5 chain B
Processing 1DS5 chain C
Processing 1DS5 chain D
Processing 1E1V chain A
Processing 1E1X chain A
Processing 1E9H chain A
Processing 1E9H chain C
Processing 1EH4 chain A
Processing 1EH4 chain B
Processing 1F0Q chain A
Processing 1FGI chain A
Processing 1FGI chain B
Processing 1FIN 