In [1]:
# note to reader: the dataset is "FFS_ZF_w_duplicates"

In [63]:
import os
import pandas as pd
import numpy as np
import json
import requests
import bs4
import matplotlib.pyplot as plt

In [64]:
fpB = os.path.join('FFS_data', 'PWMfreq_DatasetB.txt')
fhB = open(fpB)
fhB_lst = (fhB.read()
           .replace('A |', '').replace('C |', '').replace('G |', '').replace('T |', '')
           .replace('\t', ' ').split('\n'))[1:-1]

for i in range(len(fhB_lst)):
    if i % 5 == 0:
        fhB_lst[i] = fhB_lst[i][1:]
    else:
        row = fhB_lst[i].strip().split()
        fhB_lst[i] = [int(elem) for elem in row]

In [65]:
chunks = [fhB_lst[i:(i+5)] for i in range(0, len(fhB_lst), 5)]
chunks = [[lst[0], [lst[1], lst[2], lst[3], lst[4]]] for lst in chunks]
data = []
for chunk in chunks:
    data.append({'temp_ID': chunk[0],
                 'PFM': chunk[1]})

In [66]:
FFS_PWMs = pd.DataFrame(data) # Uniprot IDs are unhelpfully not in here

In [67]:
FFS_PWMs['Flybase_ID'] = FFS_PWMs['temp_ID'].apply(lambda id: id.rpartition('_')[2])

In [68]:
# AHHHHH I'M JUST GOING TO SCRAPE IT

# notes to self:
# 1: Flybase_ID (eg FBgn0259750)
# 3: Uniprot_ID (eg Q24174)
# 5: Domain (eg zf-C2H2)
# 7: Symbol (eg ab)
# 9: Full Name (eg abrupt)
# 11: View (FFS entry)
# even numbered slots are empty

In [69]:
page1_text = requests.get('https://mccb.umassmed.edu/ffs/BrowseData.php?PWM=1').text
page1_important_soup = bs4.BeautifulSoup(page1_text).find('tbody')
page1_results = page1_important_soup.find_all('tr')

In [72]:
def scrape_page(page, data):
    """
    page: page being scraped
    data: add Flybase_ID, Uniprot_ID, and Domain here
    """
    page_text = requests.get(page).text
    page_important_soup = bs4.BeautifulSoup(page_text).find('tbody')
    page_results = page_important_soup.find_all('tr')
    for entry in page_results:
        Flybase_ID = entry.contents[1].contents[0].contents[0].strip()
        Uniprot_ID = entry.contents[3].contents[0].contents[0].strip()
        Domain = entry.contents[5].contents[0].strip()
        if Domain in ['zf-C2H2', 'zf-C4', 'zf-C2HC', 'zf-C3HC4', 'zf-FCS', 'zf-NF-X1']:
            data.append({'Flybase_ID': Flybase_ID,
                         'Uniprot_ID': Uniprot_ID,
                         'Domain': Domain})

In [73]:
data = []
for i in range(1, 8):
    scrape_page(f'https://mccb.umassmed.edu/ffs/BrowseData.php?PWM=1&page={i}', data)

In [74]:
FFS_ZFs = pd.DataFrame(data)
FFS_ZF_w_duplicates = FFS_ZFs.merge(FFS_PWMs, on='Flybase_ID')

In [19]:
FFS_ZF_w_duplicates

Unnamed: 0,Flybase_ID,Uniprot_ID,Domain,temp_ID,PFM
0,FBgn0259750,Q24174,zf-C2H2,ab_SANGER_10_FBgn0259750,"[[0, 10, 4, 7, 0, 0, 20, 0, 0, 11, 7, 1, 4, 9,..."
1,FBgn0259750,Q24174,zf-C2H2,ab_SOLEXA_5_FBgn0259750,"[[131, 133, 112, 90, 148, 102, 190, 1, 0, 449,..."
2,FBgn0005694,A4V287,zf-C2H2,Aef1_FlyReg_FBgn0005694,"[[0, 3, 3, 0, 2, 3, 0, 2], [3, 0, 0, 3, 0, 0, ..."
3,FBgn0005694,A4V287,zf-C2H2,Aef1_SANGER_5_FBgn0005694,"[[6, 0, 12, 12, 0, 12, 12, 0, 12, 9, 2], [2, 1..."
4,FBgn0005694,A4V287,zf-C2H2,Aef1_SOLEXA_FBgn0005694,"[[17, 5, 18, 9, 1, 0, 0, 0, 0, 0, 0, 0, 43, 35..."
...,...,...,...,...,...
219,FBgn0003964,P20153,zf-C4,usp_SANGER_5_FBgn0003964,"[[11, 9, 15, 0, 0, 0, 0, 21, 9], [6, 0, 0, 0, ..."
220,FBgn0259789,Q7YU17,zf-C2H2,vfl_SANGER_5_FBgn0259789,"[[1, 0, 18, 0, 0, 0, 18, 3], [1, 18, 0, 0, 0, ..."
221,FBgn0259789,Q7YU17,zf-C2H2,vfl_SOLEXA_5_FBgn0259789,"[[110, 149, 144, 116, 54, 7, 756, 0, 1, 4, 756..."
222,FBgn0001983,Q8SXL3,zf-C2H2,wor_SANGER_2.5_FBgn0001983,"[[0, 20, 0, 0, 0, 0, 1], [21, 0, 21, 21, 0, 0,..."


## alternative approach - just merge on Uniprot_ID

In [75]:
fp = os.path.join('data', 'Transcription Factors (UniProt).xlsx')
uniprot_zf = pd.read_excel(fp)
uniprot_zf.rename(columns={'Entry': 'Uniprot ID'}, inplace=True)
uniprot_zf = uniprot_zf[uniprot_zf['Uniprot ID'].notnull()]

In [80]:
test_FFS = FFS_ZF_w_duplicates.merge(uniprot_zf, left_on = 'Uniprot_ID', right_on='Uniprot ID', how='left')

In [81]:
test_FFS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 224 entries, 0 to 223
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Flybase_ID     224 non-null    object 
 1   Uniprot_ID     224 non-null    object 
 2   Domain         224 non-null    object 
 3   temp_ID        224 non-null    object 
 4   PFM            224 non-null    object 
 5   Uniprot ID     85 non-null     object 
 6   Entry name     85 non-null     object 
 7   Status         85 non-null     object 
 8   Protein names  85 non-null     object 
 9   Gene names     85 non-null     object 
 10  Organism       85 non-null     object 
 11  Length         85 non-null     float64
 12  Sequence       85 non-null     object 
 13  Zinc finger    85 non-null     object 
dtypes: float64(1), object(13)
memory usage: 26.2+ KB


## Below is the code for matching to UniProt (from the spreadsheet) ##

In [30]:
#fp = os.path.join('data', 'UniProt_ZF.csv')
#uniprot_zf = pd.read_csv(open(fp)).drop(columns=['Unnamed: 0', 'DNA Binding Sequence (to be added)'])
#uniprot_zf = uniprot_zf.rename(columns={'Entry (Uniprot ID)': 'Uniprot ID'})
fp = os.path.join('data', 'Transcription Factors (UniProt).xlsx')
uniprot_zf = pd.read_excel(fp)
uniprot_zf.rename(columns={'Entry': 'Uniprot ID'}, inplace=True)

In [31]:
uniprot_zf.columns

Index(['Uniprot ID', 'Entry name', 'Status', 'Protein names', 'Gene names',
       'Organism', 'Length', 'Sequence', 'Zinc finger'],
      dtype='object')

In [32]:
uniprot_zf = uniprot_zf[uniprot_zf['Gene names'].notnull()]

In [33]:
def ffs_id2actual_uniprot_id(ffs_id):
    matches=[]
    for actual_uniprot_id in uniprot_zf['Uniprot ID']:
        if ffs_id.lower() in actual_uniprot_id.lower():
            matches.append(ffs_id)
    return matches

In [34]:
#FFS_ZF_w_duplicates['Matches'] = FFS_ZF_w_duplicates['Uniprot_ID'].apply(ffs_id2actual_uniprot_id)
#FFS_ZF_w_duplicates['num_matches'] = FFS_ZF_w_duplicates['Matches'].apply(len)

In [35]:
FFS_ZF_w_duplicates['num_matches'].value_counts()

0    139
1     85
Name: num_matches, dtype: int64

In [36]:
FFS_ZF_w_duplicates['Match'] = FFS_ZF_w_duplicates['Matches'].apply(lambda l: l[0] if len(l) == 1 else np.NaN)

In [37]:
FFS_duplicates_few_matches = FFS_ZF_w_duplicates.merge(uniprot_zf, left_on='Match', right_on='Uniprot ID', how='left')
FFS_duplicates_few_matches.rename(columns={'Uniprot_ID': 'FFS_Uniprot_ID', 'Uniprot ID': 'Actual_Uniprot_ID'}, inplace=True)

In [38]:
FFS_duplicates_no_matches = FFS_duplicates_few_matches[FFS_duplicates_few_matches['Actual_Uniprot_ID'].isnull()]#[['FFS_Uniprot_ID', 'Actual_Uniprot_ID']]

## Trying to match on old spreadsheet

In [47]:
fp = os.path.join('data', 'UniProt_ZF.csv')
uniprot_zf = pd.read_csv(open(fp)).drop(columns=['Unnamed: 0', 'DNA Binding Sequence (to be added)'])
uniprot_zf = uniprot_zf.rename(columns={'Entry (Uniprot ID)': 'Uniprot ID'})
uniprot_zf = uniprot_zf[uniprot_zf['Gene names'].notnull()]

In [48]:
uniprot_zf.columns

Index(['Uniprot ID', 'Entry name', 'Protein names', 'Gene names', 'Organism',
       'Zinc finger', 'Sequence'],
      dtype='object')

In [52]:
FFS_duplicates_no_matches.columns

Index(['Flybase_ID', 'FFS_Uniprot_ID', 'Domain', 'temp_ID', 'PFM', 'Matches',
       'num_matches', 'Match', 'Actual_Uniprot_ID', 'Entry name', 'Status',
       'Protein names', 'Gene names', 'Organism', 'Length', 'Sequence',
       'Zinc finger'],
      dtype='object')

In [53]:
FFS_duplicates_no_matches['Matches'] = FFS_duplicates_no_matches['FFS_Uniprot_ID'].apply(ffs_id2actual_uniprot_id)
FFS_duplicates_no_matches['num_matches'] = FFS_duplicates_no_matches['Matches'].apply(len)
FFS_duplicates_no_matches['num_matches'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FFS_duplicates_no_matches['Matches'] = FFS_duplicates_no_matches['FFS_Uniprot_ID'].apply(ffs_id2actual_uniprot_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FFS_duplicates_no_matches['num_matches'] = FFS_duplicates_no_matches['Matches'].apply(len)


0    131
1      8
Name: num_matches, dtype: int64

In [93]:
(FFS_duplicates_few_matches[FFS_duplicates_few_matches['num_matches'] == 0]
 ['FFS_Uniprot_ID'].sort_values().iloc[45:55])

102    O61373
103    O61373
23     O77251
24     O77251
83     O96680
123    P13054
85     P15370
210    P16375
86     P17672
80     P23792
Name: FFS_Uniprot_ID, dtype: object

In [56]:
FFS_duplicates_few_matches.to_excel('FFS_duplicates_few_matches.xlsx')

In [57]:
# some testing for CisBP

In [59]:
FFS_duplicates_few_matches[FFS_duplicates_few_matches['Flybase_ID'] == 'FBgn0005630']

Unnamed: 0,Flybase_ID,FFS_Uniprot_ID,Domain,temp_ID,PFM,num_matches,Match,Actual_Uniprot_ID,Entry name,Protein names,Gene names,Organism,Zinc finger,Sequence
132,FBgn0005630,P42283,zf-C2H2,lola_SANGER_5_FBgn0005630,"[[12, 12, 12, 0, 0, 0, 0], [0, 0, 0, 0, 12, 0,...",1,P42283,P42283,LOLA1_DROME,"Longitudinals lacking protein, isoform G",lola CG12052,Drosophila melanogaster (Fruit fly),"YECRHCGKKYRWKSTLRRHENVE, HQCPYCPYKSKQRGNLGVHVRKH",MDDDQQFCLRWNNHQSTLISVFDTLLENETLVDCTLAAEGKFLKAH...
133,FBgn0005630,P42283,zf-C2H2,lola_SOLEXA_5_FBgn0005630,"[[12, 32, 86, 90, 93, 1, 0, 0, 0], [33, 14, 2,...",1,P42283,P42283,LOLA1_DROME,"Longitudinals lacking protein, isoform G",lola CG12052,Drosophila melanogaster (Fruit fly),"YECRHCGKKYRWKSTLRRHENVE, HQCPYCPYKSKQRGNLGVHVRKH",MDDDQQFCLRWNNHQSTLISVFDTLLENETLVDCTLAAEGKFLKAH...
134,FBgn0005630,P42283,zf-C2H2,lola-PU_SANGER_5_FBgn0005630,"[[2, 0, 0, 0, 12, 12, 3, 8, 6], [2, 12, 2, 12,...",1,P42283,P42283,LOLA1_DROME,"Longitudinals lacking protein, isoform G",lola CG12052,Drosophila melanogaster (Fruit fly),"YECRHCGKKYRWKSTLRRHENVE, HQCPYCPYKSKQRGNLGVHVRKH",MDDDQQFCLRWNNHQSTLISVFDTLLENETLVDCTLAAEGKFLKAH...
135,FBgn0005630,P42283,zf-C2H2,lola-PY_SANGER_2.5_FBgn0005630,"[[7, 10, 0, 0, 0, 0, 5, 6, 2], [1, 1, 11, 11, ...",1,P42283,P42283,LOLA1_DROME,"Longitudinals lacking protein, isoform G",lola CG12052,Drosophila melanogaster (Fruit fly),"YECRHCGKKYRWKSTLRRHENVE, HQCPYCPYKSKQRGNLGVHVRKH",MDDDQQFCLRWNNHQSTLISVFDTLLENETLVDCTLAAEGKFLKAH...
136,FBgn0005630,P42283,zf-C2H2,lola-PD_SANGER_5_FBgn0005630,"[[0, 0, 1, 0, 1, 1, 10, 4, 2], [11, 10, 9, 11,...",1,P42283,P42283,LOLA1_DROME,"Longitudinals lacking protein, isoform G",lola CG12052,Drosophila melanogaster (Fruit fly),"YECRHCGKKYRWKSTLRRHENVE, HQCPYCPYKSKQRGNLGVHVRKH",MDDDQQFCLRWNNHQSTLISVFDTLLENETLVDCTLAAEGKFLKAH...
137,FBgn0005630,P42283,zf-C2H2,lola-PO_SANGER_5_FBgn0005630,"[[0, 0, 0, 0, 0, 6, 4, 6], [5, 0, 8, 8, 8, 0, ...",1,P42283,P42283,LOLA1_DROME,"Longitudinals lacking protein, isoform G",lola CG12052,Drosophila melanogaster (Fruit fly),"YECRHCGKKYRWKSTLRRHENVE, HQCPYCPYKSKQRGNLGVHVRKH",MDDDQQFCLRWNNHQSTLISVFDTLLENETLVDCTLAAEGKFLKAH...
138,FBgn0005630,P42283,zf-C2H2,lola-PQ_SANGER_5_FBgn0005630,"[[8, 3, 6, 10, 8, 0, 11, 6, 0], [2, 3, 1, 0, 0...",1,P42283,P42283,LOLA1_DROME,"Longitudinals lacking protein, isoform G",lola CG12052,Drosophila melanogaster (Fruit fly),"YECRHCGKKYRWKSTLRRHENVE, HQCPYCPYKSKQRGNLGVHVRKH",MDDDQQFCLRWNNHQSTLISVFDTLLENETLVDCTLAAEGKFLKAH...
139,FBgn0005630,P42283,zf-C2H2,lola-PW_SANGER_5_FBgn0005630,"[[0, 0, 0, 0, 0, 0, 11], [3, 0, 10, 0, 0, 11, ...",1,P42283,P42283,LOLA1_DROME,"Longitudinals lacking protein, isoform G",lola CG12052,Drosophila melanogaster (Fruit fly),"YECRHCGKKYRWKSTLRRHENVE, HQCPYCPYKSKQRGNLGVHVRKH",MDDDQQFCLRWNNHQSTLISVFDTLLENETLVDCTLAAEGKFLKAH...
140,FBgn0005630,P42283,zf-C2H2,lola-PT_SANGER_5_FBgn0005630,"[[1, 0, 0, 0, 12, 11, 3, 8], [0, 12, 0, 12, 0,...",1,P42283,P42283,LOLA1_DROME,"Longitudinals lacking protein, isoform G",lola CG12052,Drosophila melanogaster (Fruit fly),"YECRHCGKKYRWKSTLRRHENVE, HQCPYCPYKSKQRGNLGVHVRKH",MDDDQQFCLRWNNHQSTLISVFDTLLENETLVDCTLAAEGKFLKAH...
141,FBgn0005630,P42283,zf-C2H2,lola-PJ_SANGER_5_FBgn0005630,"[[11, 5, 0, 12, 0, 12, 12], [0, 0, 12, 0, 0, 0...",1,P42283,P42283,LOLA1_DROME,"Longitudinals lacking protein, isoform G",lola CG12052,Drosophila melanogaster (Fruit fly),"YECRHCGKKYRWKSTLRRHENVE, HQCPYCPYKSKQRGNLGVHVRKH",MDDDQQFCLRWNNHQSTLISVFDTLLENETLVDCTLAAEGKFLKAH...
