# Programmatically Extracting Endogenous ZFNs from Databases

In [1]:
import requests
import json
import bs4
import pandas as pd
import numpy as np
import coreapi
import sys

In [None]:
#to raise Jupyter's IOPub data rate limit (so that cells with very large outputs can still be displayed), launch the notebook from the terminal with this command
#jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

## Load in ZFNs from UniProt

In [2]:
# UniProt site root and the endpoint paths appended to it.
BASE = 'http://www.uniprot.org'
KB_ENDPOINT = '/uniprot/'
TOOL_ENDPOINT = '/uploadlists/'  # not referenced by the request in this cell

# Query parameters: the search expression, a tab-separated response format,
# and the columns to return.
# Column names are listed at https://www.uniprot.org/help/uniprotkb_column_names
# Request approach adapted from https://www.ebi.ac.uk/training/online/sites/ebi.ac.uk.training.online/files/UniProt_programmatically_py3.pdf
payload = {
    'query': 'annotation:(type:zn_fing) AND reviewed:yes',
    'format': 'tab',
    'columns': 'id,entry_name,protein_names,genes,feature(ZINC FINGER),sequence,organism,reviewed',
}

result = requests.get(BASE + KB_ENDPOINT, params=payload)

# Report the outcome; a failed request is only reported, not raised.
if not result.ok:
    print('Something went wrong ', result.status_code)
else:
    print("Data loaded in correctly")

Data loaded in correctly


In [3]:
# Save the raw tab-separated response to disk so it can be parsed from the file.
# The context manager closes the file even if the write raises, and the explicit
# UTF-8 encoding keeps the file independent of the platform's default text
# encoding (pandas.read_csv decodes as UTF-8 by default).
with open("uniprot_search.txt", "w", encoding="utf-8") as f:
    f.write(result.text)

In [4]:
# Parse the saved tab-separated UniProt response into a DataFrame.
table = pd.read_csv("uniprot_search.txt", sep = "\t")
# Write the parsed table to an Excel file.
table.to_excel("endogenous.xlsx")

In [6]:
# Preview the first rows of the parsed UniProt table.
table.head()

Unnamed: 0,Entry,Entry name,Protein names,Gene names,Zinc finger,Sequence,Organism,Status
0,O15553,MEFV_HUMAN,Pyrin (Marenostrin),MEFV MEF TRIM20,"ZN_FING 370..412; /note=""B box-type""; /evide...",MAKTPSDHLLSTLEELVPYDFEKFKFKLQNTSVQKEHSRIPRSQIQ...,Homo sapiens (Human),reviewed
1,Q8C5W4,MOR2B_MOUSE,ATPase MORC2B (EC 3.6.1.-) (MORC family CW-typ...,Morc2b Tce6,"ZN_FING 490..544; /note=""CW-type""; /evidence...",MAFTNYSTLNRAQLTFDYLHTNSTTHAFLFGALAELIDNARDADAT...,Mus musculus (Mouse),reviewed
2,Q9VHM6,OUIB_DROME,Transcription factor Ouib (Protein ouija board),Ouib CG11762,"ZN_FING 167..189; /note=""C2H2-type 1""; /evid...",MLNIVCRVCGRQKICEKSLNLFDLVNRKYLKHLHMISGLRLVDLDD...,Drosophila melanogaster (Fruit fly),reviewed
3,P04585,POL_HV1H2,Gag-Pol polyprotein (Pr160Gag-Pol) [Cleaved in...,gag-pol,"ZN_FING 390..407; /note=""CCHC-type 1""; /evid...",MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAV...,Human immunodeficiency virus type 1 group M su...,reviewed
4,P14078,POL_HTL1C,Gag-Pro-Pol polyprotein (Pr160Gag-Pro-Pol) [Cl...,gag-pro-pol,"ZN_FING 355..372; /note=""CCHC-type 1""; /evid...",MGQIFSRSASPIPRPPRGLAAHHWLNFLQAAYRLEPGPSSYDFHQL...,Human T-cell leukemia virus 1 (isolate Caribbe...,reviewed


In [7]:
# Number of rows (UniProt entries) in the parsed table.
len(table)

12948

In [5]:
def _extract_zinc_fingers(features, sequence):
    """
    Turns one UniProt 'Zinc finger' feature string into the residues it points at
    features: the raw feature text, e.g. 'ZN_FING 370..412; /note="B box-type"; ...'
    sequence: the full protein sequence that the positions refer to
    Returns the zinc finger sub-sequences joined with ', '. The input text is returned
    unchanged when no ZN_FING position could be parsed or when either value is not a string.
    """
    if not isinstance(features, str) or not isinstance(sequence, str):
        # Non-string cells (e.g. a missing value) cannot be parsed; leave them untouched.
        return features

    fingers = []
    for feature in features.split("; "):
        if "ZN_FING" not in feature:
            # Skip the '/note=...' and '/evidence=...' qualifiers.
            continue
        tokens = feature.split(" ")
        if len(tokens) < 2:
            # 'ZN_FING' without a location carries nothing to slice.
            continue
        # tokens[1] is the location, e.g. '370..412'. A location without '..'
        # is treated as a single residue (start == end).
        start_text, _, end_text = tokens[1].partition("..")
        # Keep only the digits so any marker attached to a position is ignored.
        start_digits = ''.join(ch for ch in start_text if ch.isdigit())
        end_digits = ''.join(ch for ch in (end_text or start_text) if ch.isdigit())
        if not start_digits or not end_digits:
            # No usable coordinate on one side, so this feature cannot be sliced.
            continue
        # The start is shifted down by one and the end is used as-is, turning the
        # 1-based inclusive range into a Python slice.
        fingers.append(sequence[int(start_digits) - 1:int(end_digits)])

    return ', '.join(fingers) if fingers else features


# Work on a copy so the raw `table` stays intact and this cell can be re-run safely.
filtered = table.copy()
# Assign the whole column at once: values written into the rows yielded by
# iterrows() are not guaranteed to reach the DataFrame.
filtered['Zinc finger'] = [
    _extract_zinc_fingers(features, sequence)
    for features, sequence in zip(table['Zinc finger'], table['Sequence'])
]
filtered.head()

Unnamed: 0,Entry,Entry name,Protein names,Gene names,Zinc finger,Sequence,Organism,Status
0,O15553,MEFV_HUMAN,Pyrin (Marenostrin),MEFV MEF TRIM20,QPLPQCKRHLKQVQLLFCEDHDEPICLICSLSQEHQGHRVRPI,MAKTPSDHLLSTLEELVPYDFEKFKFKLQNTSVQKEHSRIPRSQIQ...,Homo sapiens (Human),reviewed
1,Q8C5W4,MOR2B_MOUSE,ATPase MORC2B (EC 3.6.1.-) (MORC family CW-typ...,Morc2b Tce6,AMQVPTTIQCDLCLKWRTLPFQLSAVEEGYPINWVCSMNPDPEQDQ...,MAFTNYSTLNRAQLTFDYLHTNSTTHAFLFGALAELIDNARDADAT...,Mus musculus (Mouse),reviewed
2,Q9VHM6,OUIB_DROME,Transcription factor Ouib (Protein ouija board),Ouib CG11762,"YICELCGTHATSKPTFQRHMRKH, FGCKDCDARFLSAGELRAHHR...",MLNIVCRVCGRQKICEKSLNLFDLVNRKYLKHLHMISGLRLVDLDD...,Drosophila melanogaster (Fruit fly),reviewed
3,P04585,POL_HV1H2,Gag-Pol polyprotein (Pr160Gag-Pol) [Cleaved in...,gag-pol,"VKCFNCGKEGHTARNCRA, KGCWKCGKEGHQMKDCTE, DGIDKA...",MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAV...,Human immunodeficiency virus type 1 group M su...,reviewed
4,P14078,POL_HTL1C,Gag-Pro-Pol polyprotein (Pr160Gag-Pro-Pol) [Cl...,gag-pro-pol,"QPCFRCGKAGHWSRDCTQ, GPCPLCQDPTHWKRDCPR",MGQIFSRSASPIPRPPRGLAAHHWLNFLQAAYRLEPGPSSYDFHQL...,Human T-cell leukemia virus 1 (isolate Caribbe...,reviewed


In [9]:
# Write the `filtered` table to an Excel file.
filtered.to_excel("endogenous_filtered.xlsx")

## Get Binding Motifs from JASPAR

The majority of the code in this section is from Winston's notebook:

In [6]:
# TRANSFAC-formatted variant of the query; defined for reference, not requested in this cell.
# NOTE(review): this URL filters with `class=` while the JSON URL below uses `tf_class=` -
# confirm which parameter the API expects before using it.
c2h2zff_url_transfac = "http://jaspar.genereg.net/api/v1/matrix/?class=C2H2+zinc+finger+factors&format=transfac"
c2h2zff_url_json = "http://jaspar.genereg.net/api/v1/matrix/?tf_class=C2H2+zinc+finger+factors"#&format=json"

# First page of the C2H2 zinc finger factor listing, decoded from JSON.
c2h2zffs = requests.get(c2h2zff_url_json)
c2h2zffs_page_i = json.loads(c2h2zffs.text)  # next time, figure out how to go to *next* page

#testing output
# The listing page is already held in c2h2zffs_page_i, so it is not downloaded a second
# time here; only a single matrix record is fetched and displayed as the cell output.
json.loads(requests.get('http://jaspar.genereg.net/api/v1/matrix/UN0355.1/').text)

{'comment': 'not supported by literature',
 'pubmed_ids': [],
 'family': [],
 'pfm': {'A': [222.0,
   294.0,
   44.0,
   1068.0,
   35.0,
   23.0,
   40.0,
   1077.0,
   887.0,
   234.0,
   325.0],
  'C': [290.0,
   286.0,
   90.0,
   10.0,
   1013.0,
   1078.0,
   5.0,
   7.0,
   139.0,
   362.0,
   205.0],
  'T': [466.0,
   413.0,
   951.0,
   29.0,
   34.0,
   12.0,
   1049.0,
   26.0,
   79.0,
   454.0,
   475.0],
  'G': [146.0, 131.0, 39.0, 17.0, 42.0, 11.0, 30.0, 14.0, 19.0, 74.0, 119.0]},
 'tax_group': 'plants',
 'matrix_id': 'UN0355.1',
 'sequence_logo': 'http://jaspar.genereg.net/static/logos/svg/UN0355.1.svg',
 'pazar_tf_ids': [],
 'versions_url': 'http://jaspar.genereg.net/api/v1/matrix/UN0355/versions',
 'collection': 'UNVALIDATED',
 'base_id': 'UN0355',
 'class': ['C2H2 zinc finger factors'],
 'tffm': None,
 'tfe_ids': [],
 'name': 'AT3G49930',
 'uniprot_ids': ['Q9SN24'],
 'sites_url': 'http://jaspar.genereg.net/api/v1/sites/UN0355.1',
 'centrality_logp': '-379.729',
 'sou

In [12]:
#functions written by Winston
def parse_pages(n_pages):
    """
    Requests pages 1..n_pages of the JASPAR C2H2 zinc finger factor listing and parses each one
    n_pages: how many listing pages to request (probably 43; can be drawn from count)
    Returns a DataFrame built from the dictionaries that parse_page collected
    """
    records = []
    for page_number in range(1, n_pages + 1):
        page_url = f'http://jaspar.genereg.net/api/v1/matrix/?page={page_number}&tf_class=C2H2+zinc+finger+factors'
        # Decode the page and let parse_page append one dict per entry to `records`.
        parse_page(json.loads(requests.get(page_url).text), records)
    return pd.DataFrame(records)

def parse_page(page, lst):
    """
    Parses an individual page of results (usually 10)
    page: the decoded page of results; every item of page['results'] must carry a 'url'
    lst: list that receives one dictionary per entry (mutated in place, nothing is returned)
    """
    for entry in page['results']:
        # The listing only links to each matrix; the full record is fetched from entry['url'].
        record = json.loads(requests.get(entry['url']).text)

        uniprot_ids = record['uniprot_ids']
        # Keep the ID only when there is exactly one; zero or several IDs become NaN.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        uniprot_id = uniprot_ids[0] if len(uniprot_ids) == 1 else np.nan

        pfm = record['pfm']
        lst.append({'Uniprot_IDs': uniprot_id,
                    'Name': record['name'],
                    'Base_ID': record['base_id'],
                    'Matrix_ID': record['matrix_id'],
                    'PWM': PFM_to_PWM(pfm),
                    'Compressed_Sequence': compressed_sequence(pfm)})

def PFMdict_to_matrix(PFMdict):
    """
    Converts a PFM held as a dictionary (as decoded from json) into a np array
    PFMdict: a dictionary with the keys 'A', 'C', 'G' and 'T'
    Returns an array with one row per base, in the order A, C, G, T
    """
    rows = []
    for base in 'ACGT':
        rows.append(PFMdict[base])
    return np.array(rows)

def PFM_to_PWM(PFM_dict):
    """
    Normalises a PFM dict so that every position holds each base's share of that position's total
    PFM_dict: a dictionary containing the PFM, keyed by 'A', 'C', 'G' and 'T'
    Returns a dictionary with the same four keys (in the order A, C, G, T)
    Raises ZeroDivisionError when the four counts at a position add up to zero
    """
    pwm = {'A': [], 'C': [], 'G': [], 'T': []}
    for position, a_count in enumerate(PFM_dict['A']):
        c_count = PFM_dict['C'][position]
        g_count = PFM_dict['G'][position]
        t_count = PFM_dict['T'][position]
        # Added left to right in the order A, C, G, T.
        column_total = a_count + c_count + g_count + t_count
        pwm['A'].append(a_count / column_total)
        pwm['C'].append(c_count / column_total)
        pwm['G'].append(g_count / column_total)
        pwm['T'].append(t_count / column_total)
    return pwm

def compressed_sequence(PFM_dict):
    """
    Returns the most likely sequence from a PFM (PWM works too)
    PFM_dict: a dictionary containing the PFM, keyed by 'A', 'C', 'G' and 'T'
    At each position the base with the largest value is chosen; on a tie the first
    of A, C, G, T holding that value wins
    """
    bases = 'ACGT'
    consensus = []
    for position in range(len(PFM_dict['A'])):
        column = [PFM_dict[base][position] for base in bases]
        consensus.append(bases[column.index(max(column))])
    return ''.join(consensus)

In [15]:
# Download and parse the JASPAR listing pages into one DataFrame.
# NOTE(review): 43 is a hard-coded page count - confirm it still matches the number
# of pages the API currently returns.
jaspar = parse_pages(43)
# Summarise the columns and non-null counts of the resulting table.
jaspar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Uniprot_IDs          416 non-null    object
 1   Name                 428 non-null    object
 2   Base_ID              428 non-null    object
 3   Matrix_ID            428 non-null    object
 4   PWM                  428 non-null    object
 5   Compressed_Sequence  428 non-null    object
dtypes: object(6)
memory usage: 20.2+ KB


In [16]:
# Write the parsed JASPAR table to an Excel file, then preview its first five rows.
jaspar.to_excel("jaspar_pwm_tfs.xlsx")
jaspar.head(5)

Unnamed: 0,Uniprot_IDs,Name,Base_ID,Matrix_ID,PWM,Compressed_Sequence
0,P21192,ACE2,MA0267,MA0267.1,"{'A': [0.45454545454545453, 0.01, 0.0, 0.9, 0....",ACCAGCA
1,O82155,Adof1,MA1277,MA1277.1,"{'A': [0.46554621848739497, 0.5226890756302521...",AAAAAGAAAAAGTAAAAAAAA
2,P07248,ADR1,MA0268,MA0268.1,"{'A': [0.41, 0.0, 0.0, 0.0, 0.0, 0.64, 0.34], ...",ACCCCAC
3,Q8RWX7,AT1G14580,MA1160,MA1160.1,"{'A': [0.25206611570247933, 0.2520661157024793...",TTTTTTTTTGTCGTTTTGTG
4,Q9SX97,AT1G47655,MA1275,MA1275.1,"{'A': [0.2633333333333333, 0.22, 0.57166666666...",AGAAAAAGTAA


In [20]:
# Number of rows (JASPAR matrices) collected.
len(jaspar)

428

### Merge the UniProt and JASPAR databases together
1. Loop through the JASPAR dataframe (the "test" dataframe); for each row, check whether its UniProt ID exists in the UniProt dataframe and, if it does, add the zinc finger and protein sequences from the matching UniProt entry.

In [14]:
# NOTE(review): these are plain assignments, not copies, so jaspar_table is the same
# object as jaspar (and uniprot_table the same object as filtered); the columns added
# below therefore also appear on jaspar.
jaspar_table = jaspar
uniprot_table = filtered
# Create both new columns filled with empty strings.
jaspar_table['zfn_sequence'] = ""
jaspar_table['sequence'] = ""
# Summarise the columns to confirm the two additions.
jaspar_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Uniprot_IDs   416 non-null    object
 1   Name          428 non-null    object
 2   Base_ID       428 non-null    object
 3   Matrix_ID     428 non-null    object
 4   PWM           428 non-null    object
 5   zfn_sequence  428 non-null    object
 6   sequence      428 non-null    object
dtypes: object(7)
memory usage: 23.5+ KB


In [15]:
# For every JASPAR row, look up its UniProt ID in the UniProt table and copy over the
# zinc finger and full protein sequences. Rows without a match keep their current values.
for index, row in jaspar_table.iterrows():
    uniprot_id = row['Uniprot_IDs']
    # Index labels of the UniProt rows whose 'Entry' equals this ID
    # (a NaN ID compares unequal to everything, so it never matches).
    uniprot_labels = uniprot_table.index[uniprot_table['Entry'] == uniprot_id].tolist()
    if uniprot_labels:
        # These are index *labels*, so the row is fetched with label-based .loc;
        # positional .iloc would only agree while uniprot_table keeps its default
        # 0..n-1 index. The first match is used when there are several.
        uniprot_row = uniprot_table.loc[uniprot_labels[0]]
        jaspar_table.at[index, 'zfn_sequence'] = uniprot_row["Zinc finger"]
        jaspar_table.at[index, 'sequence'] = uniprot_row["Sequence"]
jaspar_table.head()

Unnamed: 0,Uniprot_IDs,Name,Base_ID,Matrix_ID,PWM,zfn_sequence,sequence
0,P21192,ACE2,MA0267,MA0267.1,"{'A': [0.45454545454545453, 0.01, 0.0, 0.9, 0....","FECLYPNCNKVFKRRYNIRSHIQTH, YSCDFPGCTKAFVRNHDLI...",MDNVVDPWYINPSGFAKDTQDEEYVQHHDNVNPTIPPPDNYILNNE...
1,O82155,Adof1,MA1277,MA1277.1,"{'A': [0.46554621848739497, 0.5226890756302521...",LKCPRCDSPNTKFCYYNNYNLSQPRHFCKNCRRYWTKGGALRNIPV...,MQDLTSAAAYYHQSMMMTTAKQNQPELPEQEQLKCPRCDSPNTKFC...
2,P07248,ADR1,MA0268,MA0268.1,"{'A': [0.41, 0.0, 0.0, 0.0, 0.0, 0.64, 0.34], ...","FVCEVCTRAFARQEHLKRHYRSH, YPCGLCNRCFTRRDLLIRHAQKIH",MANVEKPNDCSGFPVVDLNSCFSNGFNNEKQEIEMETDDSPILLMS...
3,Q8RWX7,AT1G14580,MA1160,MA1160.1,"{'A': [0.25206611570247933, 0.2520661157024793...","FLCEVCNKGFQREQNLQLHRRGH, YLCPEPSCVHHDPARALGDLT...",MSSSYNTIALSSTPTFLLSSAAAGPGPNNFNRQEAAMTMVQQQPTS...
4,Q9SX97,AT1G47655,MA1275,MA1275.1,"{'A': [0.2633333333333333, 0.22, 0.57166666666...",LPCPRCNSTTTKFCYYNNYNLAQPRYYCKSCRRYWTQGGTLRDVPV...,MPSEPNQTRPTRVQPSTAAYPPPNLAEPLPCPRCNSTTTKFCYYNN...


In [46]:
# Write the `jaspar_table` DataFrame to an Excel file.
jaspar_table.to_excel("jaspar_combined.xlsx")

### Merge JASPAR onto UniProt
This makes it easier in the future to keep every UniProt ID we have and to merge the other databases onto the same table, so that it serves as a "master" sheet.

In [16]:
# NOTE(review): plain assignments, not copies - uniprot_table is the same object as
# filtered, so the columns added below also appear on filtered.
jaspar_table = jaspar
uniprot_table = filtered
# Create the three JASPAR columns filled with empty strings.
uniprot_table['jaspar_matrix_id'] = ""
uniprot_table['jaspar_name'] = ""
# NOTE(review): despite its name, this column is later filled from the JASPAR "PWM" column.
uniprot_table['jaspar_pfm'] = ""
# Summarise the columns to confirm the additions.
uniprot_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12948 entries, 0 to 12947
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Entry             12948 non-null  object
 1   Entry name        12948 non-null  object
 2   Protein names     12948 non-null  object
 3   Gene names        12727 non-null  object
 4   Zinc finger       12948 non-null  object
 5   Sequence          12948 non-null  object
 6   Organism          12948 non-null  object
 7   Status            12948 non-null  object
 8   jaspar_matrix_id  12948 non-null  object
 9   jaspar_name       12948 non-null  object
 10  jaspar_pfm        12948 non-null  object
dtypes: object(11)
memory usage: 1.1+ MB


In [17]:
# For every UniProt row, look up its accession among the JASPAR UniProt IDs and copy
# over the matrix ID, name and PWM. Rows without a match keep their current values.
for index, row in uniprot_table.iterrows():
    uniprot_id = row['Entry']
    # Index labels of the JASPAR rows whose 'Uniprot_IDs' equals this accession.
    jaspar_labels = jaspar_table.index[jaspar_table['Uniprot_IDs'] == uniprot_id].tolist()
    if jaspar_labels:
        # These are index *labels*, so the row is fetched with label-based .loc;
        # positional .iloc would only agree while jaspar_table keeps its default
        # 0..n-1 index. The first match is used when there are several.
        jaspar_row = jaspar_table.loc[jaspar_labels[0]]
        uniprot_table.at[index, 'jaspar_matrix_id'] = jaspar_row["Matrix_ID"]
        uniprot_table.at[index, 'jaspar_name'] = jaspar_row["Name"]
        # The 'jaspar_pfm' column receives the value of the JASPAR "PWM" column.
        uniprot_table.at[index, 'jaspar_pfm'] = jaspar_row["PWM"]
uniprot_table.head()

Unnamed: 0,Entry,Entry name,Protein names,Gene names,Zinc finger,Sequence,Organism,Status,jaspar_matrix_id,jaspar_name,jaspar_pfm
0,O15553,MEFV_HUMAN,Pyrin (Marenostrin),MEFV MEF TRIM20,QPLPQCKRHLKQVQLLFCEDHDEPICLICSLSQEHQGHRVRPI,MAKTPSDHLLSTLEELVPYDFEKFKFKLQNTSVQKEHSRIPRSQIQ...,Homo sapiens (Human),reviewed,,,
1,Q8C5W4,MOR2B_MOUSE,ATPase MORC2B (EC 3.6.1.-) (MORC family CW-typ...,Morc2b Tce6,AMQVPTTIQCDLCLKWRTLPFQLSAVEEGYPINWVCSMNPDPEQDQ...,MAFTNYSTLNRAQLTFDYLHTNSTTHAFLFGALAELIDNARDADAT...,Mus musculus (Mouse),reviewed,,,
2,Q9VHM6,OUIB_DROME,Transcription factor Ouib (Protein ouija board),Ouib CG11762,"YICELCGTHATSKPTFQRHMRKH, FGCKDCDARFLSAGELRAHHR...",MLNIVCRVCGRQKICEKSLNLFDLVNRKYLKHLHMISGLRLVDLDD...,Drosophila melanogaster (Fruit fly),reviewed,,,
3,P04585,POL_HV1H2,Gag-Pol polyprotein (Pr160Gag-Pol) [Cleaved in...,gag-pol,"VKCFNCGKEGHTARNCRA, KGCWKCGKEGHQMKDCTE, DGIDKA...",MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAV...,Human immunodeficiency virus type 1 group M su...,reviewed,,,
4,P14078,POL_HTL1C,Gag-Pro-Pol polyprotein (Pr160Gag-Pro-Pol) [Cl...,gag-pro-pol,"QPCFRCGKAGHWSRDCTQ, GPCPLCQDPTHWKRDCPR",MGQIFSRSASPIPRPPRGLAAHHWLNFLQAAYRLEPGPSSYDFHQL...,Human T-cell leukemia virus 1 (isolate Caribbe...,reviewed,,,


In [18]:
# Keep only the UniProt rows whose 'jaspar_name' is no longer the empty-string placeholder.
test = uniprot_table[uniprot_table['jaspar_name'] != ""]
test

Unnamed: 0,Entry,Entry name,Protein names,Gene names,Zinc finger,Sequence,Organism,Status,jaspar_matrix_id,jaspar_name,jaspar_pfm
5,Q6DJT9,PLAG1_HUMAN,Zinc finger protein PLAG1 (Pleiomorphic adenom...,PLAG1,"FPCQLCDKAFNSVEKLKVHSYSH, YKCIQQDCTKAFVSKYKLQRH...",MATVIPGDLSEVRDTQKVPSGKRKRGETKPRKNFPCQLCDKAFNSV...,Homo sapiens (Human),reviewed,MA0163.1,PLAG1,"{'A': [0.0, 0.16666666666666666, 0.0, 0.0, 0.0..."
71,Q15072,OZF_HUMAN,Zinc finger protein OZF (Only zinc finger prot...,ZNF146 OZF,"FACKVCGKVFSHKSNLTEHEHFH, FECNECGKAFSQKQYVIKHQN...",MSHLSQQRIYSGENPFACKVCGKVFSHKSNLTEHEHFHTREKPFEC...,Homo sapiens (Human),reviewed,UN0316.1,ZNF146,"{'A': [0.11582910173349677, 0.4810015759061460..."
157,P78871,RST2_SCHPO,Zinc finger protein rst2,rst2 SPAC6F12.02,"YVCETCTRAFARLEHLKRHIRSH, FTCSEIDGLPTGCGRQFSRRD...",MTRESLAPIASKANTLSESKVSENLMSINSDSGTSNANTPSSVTSN...,Schizosaccharomyces pombe (strain 972 / ATCC 2...,reviewed,MA1431.1,rst2,"{'A': [0.2565130260521042, 0.3921765295887663,..."
181,Q8N2R0,OSR2_HUMAN,Protein odd-skipped-related 2,OSR2,"FICKFCGRHFTKSYNLLIHERTH, YTCDICHKAFRRQDHLRDHRY...",MGSKALPAPIPLHPSLQLTNYSFLQAVNTFPATVDHLQGLYGLSAV...,Homo sapiens (Human),reviewed,MA1646.1,OSR2,"{'A': [0.2996353322528363, 0.28660183684494867..."
438,O95863,SNAI1_HUMAN,Zinc finger protein SNAI1 (Protein snail homol...,SNAI1 SNAH,"FNCKYCNKEYLSLGALKMHIRSH, LPCVCGTCGKAFSRPWLLQGH...",MPRSFLVRKPSDPNRKPNYSELQDSNPEFTFQQPYDQAHLLAAIPP...,Homo sapiens (Human),reviewed,MA1558.1,SNAI1,"{'A': [0.2639243356853325, 0.2913628598808383,..."
...,...,...,...,...,...,...,...,...,...,...,...
12859,O95780,ZN682_HUMAN,Zinc finger protein 682,ZNF682,"FKCMQCGKVFKSHSGLSYHKIIH, CICEECGKTFKWFSYLTKHKR...",MELLTFRDVTIEFSLEEWEFLNPAQQSLYRKVMLENYRNLVSLGLT...,Homo sapiens (Human),reviewed,MA1599.1,ZNF682,"{'A': [0.1937618147448015, 0.28733459357277885..."
12864,Q14592,ZN460_HUMAN,Zinc finger protein 460 (Zinc finger protein 2...,ZNF460 ZNF272,"YDCPECGKAFGKSKHLLQHHIIH, YKCLECGKDFNRRSHLTRHQR...",MAAAWMAPAQESVTFEDVAVTFTQEEWGQLDVTQRALYVEVMLETC...,Homo sapiens (Human),reviewed,MA1596.1,ZNF460,"{'A': [0.2555831265508685, 0.03039702233250620..."
12877,Q9UDV7,ZN282_HUMAN,Zinc finger protein 282 (HTLV-I U5RE-binding p...,ZNF282 HUB1,"YSCPECGKSFGVRKSLIIHHRSH, YECAECEKSFNCHSGLIRHQM...",MQFVSTRPQPQQLGIQGLGLDSGSWSWAQALPPEEVCHQEPALRGE...,Homo sapiens (Human),reviewed,MA1154.1,ZNF282,"{'A': [0.0673076923076923, 0.08764044943820225..."
12928,Q9ZV33,DOF22_ARATH,Dof zinc finger protein DOF2.2 (AtDOF2.2),DOF2.2 At2g28810 F8N16.10,LKCPRCDSANTKFCYFNNYNLTQPRHFCKACRRYWTRGGALRNVPV...,MVFSSVSSFLDPPINWPQSANPNNHPHHHQLQENGSLVSGHHQVLS...,Arabidopsis thaliana (Mouse-ear cress),reviewed,MA1272.1,AT2G28810,"{'A': [0.17333333333333334, 0.175, 0.198333333..."


In [19]:
# Number of UniProt rows with a non-empty 'jaspar_name'.
len(test)

334