In [1]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt

In [2]:
# new entries in Transcription Factors (UniProt)
# for num_fewer_matches == 1, just take their UniProt ID (there are 1491/1555 of them so far)

In [3]:
fp = os.path.join('data', 'CisBP_C2H2_ZFs', 'PWM.txt')

In [4]:
open(fp).read().count('\n\n\n') # '\n\n\n' is a divider, so we have 2105 total rows from fp

2104

In [5]:
def CisBP_to_df(fp):
    """
    fp: file path to desired file
    """
    ZF_entries = open(fp).read().split('\n\n\n')
    ZF_entries = [ZF_entry.replace('\t', ' ').split('\n') for ZF_entry in ZF_entries]
    ZF_entries = process_ZF_entries(ZF_entries) 
    return pd.DataFrame(ZF_entries)

In [6]:
def process_ZF_entries(ZF_entries):
    output = []
    print('fix temp')
    for lst in ZF_entries:
        temp = [elem.rpartition(' ') for elem in lst[:6]]
        if len(temp) > 4:
            del temp[4]
        d = {}
        for tup in temp:
            d[tup[0]] = tup[2] # tup[1] is just a space
        
        PWM = lst[7:]
        PWM = [column.partition(' ')[2] for column in PWM]
        d['PWM'] = transpose_PWM(PWM)
        
        output.append(d)
    return output

In [7]:
def transpose_PWM(PWM):
    rows = [row.split() for row in PWM]
    transpose = [[0 for _ in range(len(PWM))] for _ in range(4)]
    for i in range(4):
        transpose[i] = [float(row[i]) for row in rows]
    return np.array(transpose)

In [8]:
def is_empty_PWM(PWM):
    return PWM.tolist() != np.array([[],[],[],[]]).tolist()

In [9]:
test = CisBP_to_df(fp).drop(columns=['']).iloc[:-1]

fix temp


In [10]:
test['has_PWM'] = test['PWM'].apply(is_empty_PWM)

In [11]:
test.head()

Unnamed: 0,TF,TF Name,Gene,Motif,Species,PWM,has_PWM
0,T080828_2.00,AT1G14580,AT1G14580,M06797_2.00,Arabidopsis_thaliana,"[[0.252066, 0.252066, 0.247934, 0.297521, 0.23...",True
1,T080828_2.00,AT1G14580,AT1G14580,M06798_2.00,Arabidopsis_thaliana,"[[0.308219, 0.25, 0.155822, 0.018836, 0.001712...",True
2,T080841_2.00,AT2G15740,AT2G15740,M06810_2.00,Arabidopsis_thaliana,"[[0.285, 0.021667, 0.095, 0.186667, 0.196667, ...",True
3,T080850_2.00,AT2G48100,AT2G48100,M06812_2.00,Arabidopsis_thaliana,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.894737], [0.0, 0....",True
4,T080855_2.00,AT3G46070,AT3G46070,M06813_2.00,Arabidopsis_thaliana,"[[0.222772, 0.133663, 0.386139, 0.311881, 0.08...",True


In [12]:
cleaned_cisbp = test[test['has_PWM'] == True].drop(columns=['has_PWM'])

Below is the code for aligning DNA sequences (from Database_ZF Designs) from UniProt with CisBP entries.

In [13]:
fp = os.path.join('data', 'UniProt_ZF.csv')
uniprot_zf = pd.read_csv(open(fp)).drop(columns=['Unnamed: 0', 'DNA Binding Sequence (to be added)'])
#fp = os.path.join('data', 'Transcription Factors (UniProt).xlsx')
#uniprot_zf = pd.read_excel(fp)

In [14]:
uniprot_zf.shape

(12948, 7)

In [15]:
uniprot_zf = uniprot_zf.rename(columns={'Entry': 'UniProt ID'})
uniprot_zf = uniprot_zf[uniprot_zf['Gene names'].notnull()]

In [16]:
display(cleaned_cisbp.columns, uniprot_zf.columns)

Index(['TF', 'TF Name', 'Gene', 'Motif', 'Species', 'PWM'], dtype='object')

Index(['Entry (Uniprot ID)', 'Entry name', 'Protein names', 'Gene names',
       'Organism', 'Zinc finger', 'Sequence'],
      dtype='object')

In [17]:
def uniprot_gene_names_contains(name):
    """
    Match TF Name to Gene names (same column in both sheets)
    """
    name = name.lower()
    lst = []
    for i, entry in enumerate(uniprot_zf['Gene names'].str.lower()):
        if re.search('(^| )' + name + '($| )', entry) != None: 
            # the above is some regex. ^ means start of line, $ means end of line, | means or. 
            lst.append({'index': i, 'name': entry})
            # index is the row index (access by iloc) in uniprot_zf to which the match corresponds
    return lst

In [None]:
# very long run time
cleaned_cisbp = cleaned_cisbp.assign(possible_match=cleaned_cisbp['TF Name'].apply(uniprot_gene_names_contains))

In [None]:
cleaned_cisbp = cleaned_cisbp.assign(num_possible_match=cleaned_cisbp['possible_match'].apply(lambda l: len(l)))

In [None]:
matched_cisbp = cleaned_cisbp[cleaned_cisbp['num_possible_match'] != 0]

In [None]:
plt.hist(matched_cisbp['num_possible_match'], bins=range(25))
plt.xlabel('Number of matching rows in UniProt')
plt.ylabel('Number of rows with 1, 2,..., 40 matches')
plt.show()

In [None]:
def filter_by_organism(ser):
    """
    Run new_df = matched_cisbp.apply(filter_by_organism), which should take the list in possible_match and 
    eliminate entries that correspond to a TF that doesn't belong to the Species column in matched_cisbp
    """
    lst = ser['possible_match']
    true_species = ser['Species'].replace('_', ' ')
    species_matches = []
    if len(lst) > 1:
        for d in lst:
            if true_species in uniprot_zf.iloc[d['index']]['Organism']: # works for both sheets
                species_matches.append(d)
        return species_matches
    elif len(lst) == 1:
        return lst
    else:
        return np.NaN

In [None]:
matched_cisbp.head()

In [None]:
# use for looking at examples

In [None]:
fewer_match = matched_cisbp.apply(filter_by_organism, axis=1)
matched_cisbp = matched_cisbp.assign(fewer_match=fewer_match)
matched_cisbp = matched_cisbp.assign(num_fewer_match=matched_cisbp['fewer_match'].apply(len))

In [None]:
#with UniProt_ZF.csv (Endogenous (Human Genome)): reduction from 984 to 57 rows with more than one UniProt match
#with Transcription Factors (UniProt).xlsx: reduction from 887 to 40
display(matched_cisbp[matched_cisbp['num_possible_match'] > 1].shape)
display(matched_cisbp[matched_cisbp['num_fewer_match'] > 1].shape)
matched_cisbp[matched_cisbp['num_fewer_match'] > 1][['num_possible_match', 'num_fewer_match']]

Unfortunately, there are some rows that had no matches in the UniProt database w.r.t. `Species` or `Organism`.

In [None]:
matched_cisbp[matched_cisbp['num_fewer_match'] == 0].shape

Now, for all rows with column `num_fewer_match` > 1, see if the sequences in their `fewer_match` column are identical. If they are, then set that sequence to be their sequence. Otherwise, set the sequence to np.NaN

In [None]:
def filter_fewer_match_by_seq(fewer_match):
    """
    fewer_match: a list in column fewer_match
    use as matched_cisbp['final_sequence'] = matched_cisbp['fewer_match'].apply(filter_fewer_match_by_seq)
    """
    for i in range(len(fewer_match)):
        if i == 0:
            first_seq = uniprot_zf.iloc[fewer_match[0]['index']]['Sequence']
        else:
            next_seq = uniprot_zf.iloc[fewer_match[i]['index']]['Sequence']
            if next_seq != first_seq:
                return np.NaN
    if len(fewer_match) > 0:
        return first_seq
    else:
        return np.NaN

In [None]:
matched_cisbp['sequence'] = matched_cisbp['fewer_match'].apply(filter_fewer_match_by_seq)

In [None]:
matched_cisbp[matched_cisbp['sequence'].notnull()]#.head()

### Only three final sequences were recovered by the *important* part of filter_fewer_match_by_seq. ...so let's bang our heads against the wall with a NEW uniprot_zf dataframe! ###

In [75]:
fp = os.path.join('data', 'Transcription Factors (UniProt).xlsx')
uniprot_zf = pd.read_excel(fp)
matched_cisbp_noseq = matched_cisbp[matched_cisbp['sequence'].isnull()]

In [76]:
matched_cisbp_noseq.shape

(64, 11)

In [85]:
# re-make fewer_match with the new sheet, Transcription Factors (UniProt)
matched_cisbp_noseq['sequence'] = matched_cisbp_noseq['fewer_match'].apply(filter_fewer_match_by_seq)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched_cisbp_noseq['sequence'] = matched_cisbp_noseq['fewer_match'].apply(filter_fewer_match_by_seq)


In [90]:
matched_cisbp_noseq.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64 entries, 135 to 1775
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   TF                  64 non-null     object 
 1   TF Name             64 non-null     object 
 2   Gene                64 non-null     object 
 3   Motif               64 non-null     object 
 4   Species             64 non-null     object 
 5   PWM                 64 non-null     object 
 6   possible_match      64 non-null     object 
 7   num_possible_match  64 non-null     int64  
 8   fewer_match         64 non-null     object 
 9   num_fewer_match     64 non-null     int64  
 10  sequence            0 non-null      float64
dtypes: float64(1), int64(2), object(8)
memory usage: 6.0+ KB


It did nothing :(

In [113]:
matched_cisbp_noseq.sample().index[0]

135

In [112]:
uniprot_zf.iloc[[d['index'] for d in matched_cisbp_noseq.loc[13]['fewer_match']]]

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,Sequence,Zinc finger
5101,Q9FGD1,RKD3_ARATH,reviewed,Protein RKD3 (AtRKD3) (RWP-RK domain-containin...,RKD3 At5g66990 K8A10.6,Arabidopsis thaliana (Mouse-ear cress),277,MADQRPLMTWLEANNYESFLQEDIFSFLDQSLFVDPHSSFIDPFKD...,
6132,Q1CCZ2,RPOZ_YERPN,reviewed,DNA-directed RNA polymerase subunit omega (RNA...,rpoZ YPN_3811 YP516_4331,Yersinia pestis bv. Antiqua (strain Nepal516),91,MARVTVQDAVEKIGNRFDLVLVAARRARQIQSGGKDALVPEENDKV...,


In [None]:
matched_cisbp.to_excel('CisBP_many_matches.xlsx')

In [126]:
subset_ZF_entries = open(fp).read().split('\n\n\n')[:2]

In [127]:
[ZF_entry.replace('\t', ' ').split('\n') for ZF_entry in subset_ZF_entries][0][:6]

['TF T080828_2.00',
 'TF Name AT1G14580',
 'Gene AT1G14580',
 'Motif M06797_2.00',
 'Family C2H2 ZF',
 'Species Arabidopsis_thaliana']

In [128]:
process_ZF_entries([ZF_entry.replace('\t', ' ').split('\n') for ZF_entry in subset_ZF_entries])

[{'TF': 'T080828_2.00',
  'TF Name': 'AT1G14580',
  'Gene': 'AT1G14580',
  'Motif': 'M06797_2.00',
  'Species': 'Arabidopsis_thaliana',
  'PWM': array([[0.252066, 0.252066, 0.247934, 0.297521, 0.239669, 0.103306,
          0.      , 0.      , 0.      , 0.      , 0.      , 0.008264,
          0.053719, 0.008264, 0.028926, 0.144628, 0.268595, 0.086777,
          0.016529, 0.157025],
         [0.14876 , 0.132231, 0.177686, 0.11157 , 0.095041, 0.061983,
          0.024793, 0.008264, 0.024793, 0.004132, 0.008264, 0.979339,
          0.086777, 0.03719 , 0.132231, 0.      , 0.020661, 0.367769,
          0.272727, 0.061983],
         [0.132231, 0.140496, 0.132231, 0.115702, 0.115702, 0.045455,
          0.020661, 0.      , 0.004132, 0.991736, 0.061983, 0.      ,
          0.636364, 0.057851, 0.053719, 0.024793, 0.004132, 0.446281,
          0.016529, 0.483471],
         [0.466942, 0.475207, 0.442149, 0.475207, 0.549587, 0.789256,
          0.954545, 0.991736, 0.971074, 0.004132, 0.929752, 0.01