In [1]:
### This notebook takes the Hannah et al and Martin et al data and computes the hit rates presented in their papers
import os
from os.path import join

import numpy as np
import pandas as pd
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

import MetaAnalysis_utils as mu



# Input tables are read from data_path; every CSV this notebook writes goes to output_path.
data_path = '../../data/MetaAnlysis'
output_path = './processed'
# Create the output folder up front so the later to_csv calls also work on a
# fresh checkout (to_csv does not create missing parent directories).
os.makedirs(output_path, exist_ok=True)

In [2]:
### Shared category vocabulary: canonical name -> every spelling seen in the source tables
dict_rename = {
    'Missense': ['missense', 'MIS', 'Missense'],
    'Synonymous': ['synonymous', 'SYN', 'Synonymous'],
    'No-Edits': ['empty-window', 'ETY', 'No-Edits', 'No edits'],
    'Non-Targeting': ['Non-targeting', 'Non-Targeting'],
    'Nonsense': ['Nonsense', 'nonsense'],
    'Splice site': ['Splice site', 'splice-donor', 'splice-acceptor'],
    'Non-coding': ['Intron', 'non-coding', 'UTR'],
}

# Invert the table so any spelling can be looked up directly: spelling -> canonical name
reverse_dict = {}
for canonical, aliases in dict_rename.items():
    for alias in aliases:
        reverse_dict[alias] = canonical

### 1. Hannah et al

In [3]:
# Load the three Hanna 2021 tables (BE3.9 data, BE4max data, guide annotation) from data_path
hannah_be39, hannah_be4, hannah_annot = (
    pd.read_csv(join(data_path, file_name))
    for file_name in (
        'Ess_Hanna_2021_BE3.9.csv',
        'Ess_Hanna_2021_BE4max.csv',
        'Ess_Hanna_2021_annotation.csv',
    )
)

# Gene symbols used to select the essential-gene rows (pan_lethal)
# and the rows reported as 'Non-essential' (control_genes)
pan_lethal = [
    'EEF2', 'HNRNPU', 'KPNB1', 'PELP1', 'POLR1C',
    'PSMA6', 'RPS20', 'SF3B1', 'SNRPD1', 'TFRC',
]
control_genes = ['ICAM1', 'FAS', 'CD81', 'CD33']

In [4]:
# Show the pan-lethal gene symbols as a single comma-separated string
','.join(pan_lethal)

'EEF2,HNRNPU,KPNB1,PELP1,POLR1C,PSMA6,RPS20,SF3B1,SNRPD1,TFRC'

In [6]:
# For every cell line: compute the per-guide statistics on the BE3.9 data, save the
# pan-lethal-gene rows, and count significant guides (padj cut-off 0.01) per category.
sig_kwargs = dict(cut_off = 0.01, category_col = 'Category')

dict_res = {}
for cell_line in ['A375', 'OVCAR8', 'HAP1', 'HA1E', 'MELJUSO']:
    print(f'PROCESSING CELL LINE {cell_line}......')

    # Per-cell-line table; each guide is labelled via mu.sgRNA_categ, then
    # negative-control rows override whatever label they were given.
    sub_df = mu.make_subdf(hannah_be39, hannah_annot, cell_line)
    sub_df['Category'] = sub_df['Mutation category'].apply(mu.sgRNA_categ)
    is_neg_ctrl = sub_df['Gene symbol'] == 'NEGATIVE CONTROL'
    sub_df.loc[is_neg_ctrl, 'Category'] = 'NEGATIVE CONTROL'

    # Statistics from mu.analyze_DESeq, placed side by side with the annotation columns.
    # NOTE(review): the column-wise concat assumes df_res and sub_df share an index - confirm.
    df_res = mu.analyze_DESeq(cell_line, sub_df)
    df_res_ = pd.concat([df_res, sub_df], axis = 1)

    # Guides in the pan-lethal genes; written out once per cell line
    df_res_ess = df_res_.loc[df_res_['Gene symbol'].isin(pan_lethal)]
    df_res_ess.to_csv(join(output_path, 'Hannah'+cell_line+'.csv'))

    # Counts for this cell line, keyed by category label
    dict_res[cell_line] = tallies = {}

    for categ_ in ['Missense', 'Synonymous', 'No edits', 'Intron', 'Nonsense',
                   'Splice site', 'UTR']:
        n_sig, n_tot = mu.sig_perc(df_res_ess, 'padj', categ_, **sig_kwargs)
        tallies[categ_] = {'sig_n':n_sig, 'tot_n':n_tot}

    # Two reference groups, each counted over all of its rows:
    # guides in the control genes and the negative-control guides.
    df_res_ct = df_res_[df_res_['Gene symbol'].isin(control_genes)]
    df_res_nct = df_res_[df_res_['Category'] == 'NEGATIVE CONTROL']
    for label, frame in (('Non-essential', df_res_ct), ('Non-Targeting', df_res_nct)):
        n_sig, n_tot = mu.sig_perc(frame, 'padj', 'all', **sig_kwargs)
        tallies[label] = {'sig_n':n_sig, 'tot_n':n_tot}

PROCESSING CELL LINE A375......


Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 4.49 seconds.

Fitting dispersion trend curve...
... done in 0.45 seconds.

  self.fit_dispersion_prior()
Fitting MAP dispersions...
... done in 4.63 seconds.

Fitting LFCs...
... done in 2.58 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 1.18 seconds.



Log2 fold change & Wald test p-value: Condition Assay vs Plasmid
                         baseMean  log2FoldChange     lfcSE      stat  \
CATAAAGCAGGAAACCCCCG  1111.934787        0.193017  0.335039  0.576103   
CAGGCGACGAGTTTGAACTG  1761.588409        0.434008  0.308204  1.408186   
CCATGTTGGACATTACCTCG  1055.937993       -0.753943  0.358433 -2.103444   
GTAGGCCACGCACGGCAGTG  1548.011800        0.055437  0.309505  0.179116   
GTGTGCTGAGAGTGTCAACA   646.341011        0.151257  0.363856  0.415705   
...                           ...             ...       ...       ...   
TTTGGTCAACGCATAGCTTG  1707.716428        0.026576  0.309554  0.085852   
TTTTACCTTGTTCACATGGA  1379.360651       -0.371971  0.308630 -1.205232   
TTTTGACTCTAATCACCGGT  1136.209801        0.144364  0.308207  0.468399   
TTTTTAATACAAGGTAATCT   198.962991       -1.175524  1.281155 -0.917550   
TTTTTCTCACCCGATGAATC  1061.656526       -0.871812  0.311703 -2.796934   

                        pvalue      padj  
CATAAAGCAGGAAAC

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 4.32 seconds.

Fitting dispersion trend curve...
... done in 0.50 seconds.

  self.fit_dispersion_prior()
Fitting MAP dispersions...
... done in 4.62 seconds.

Fitting LFCs...
... done in 2.42 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 1.19 seconds.



Log2 fold change & Wald test p-value: Condition Assay vs Plasmid
                         baseMean  log2FoldChange     lfcSE      stat  \
CATAAAGCAGGAAACCCCCG  1273.581683        0.052804  0.283677  0.186140   
CAGGCGACGAGTTTGAACTG  1922.045931        0.197856  0.196606  1.006361   
CCATGTTGGACATTACCTCG  1915.695785        0.160354  0.199325  0.804484   
GTAGGCCACGCACGGCAGTG  2252.248558        0.406993  0.182846  2.225875   
GTGTGCTGAGAGTGTCAACA   859.460478        0.318705  0.307049  1.037961   
...                           ...             ...       ...       ...   
TTTGGTCAACGCATAGCTTG  2116.864012        0.051844  0.198752  0.260848   
TTTTACCTTGTTCACATGGA  1608.964699       -0.491938  0.216481 -2.272431   
TTTTGACTCTAATCACCGGT  1328.245828        0.043903  0.239314  0.183453   
TTTTTAATACAAGGTAATCT   196.738444       -1.945771  0.644495 -3.019061   
TTTTTCTCACCCGATGAATC  1297.752983       -0.877764  0.258841 -3.391136   

                        pvalue      padj  
CATAAAGCAGGAAAC

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 4.88 seconds.

Fitting dispersion trend curve...
... done in 0.44 seconds.

  self.fit_dispersion_prior()
Fitting MAP dispersions...
... done in 4.61 seconds.

Fitting LFCs...
... done in 2.51 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 1.17 seconds.



Log2 fold change & Wald test p-value: Condition Assay vs Plasmid
                         baseMean  log2FoldChange     lfcSE      stat  \
CATAAAGCAGGAAACCCCCG  1486.270353       -0.516493  0.266337 -1.939244   
CAGGCGACGAGTTTGAACTG  3222.828976        0.431729  0.270146  1.598133   
CCATGTTGGACATTACCTCG  3620.632713        0.629727  0.397820  1.582945   
GTAGGCCACGCACGGCAGTG  3255.178517        0.341249  0.289054  1.180572   
GTGTGCTGAGAGTGTCAACA  1111.342423        0.014079  0.287068  0.049044   
...                           ...             ...       ...       ...   
TTTGGTCAACGCATAGCTTG  3511.123605        0.269037  0.259653  1.036140   
TTTTACCTTGTTCACATGGA  2377.770235       -0.518228  0.259031 -2.000641   
TTTTGACTCTAATCACCGGT  2013.704822        0.075901  0.260162  0.291745   
TTTTTAATACAAGGTAATCT   274.349491       -2.230728  0.305875 -7.292931   
TTTTTCTCACCCGATGAATC  1818.090730       -1.060958  0.282735 -3.752487   

                            pvalue          padj  
CATAAAG

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 4.60 seconds.

Fitting dispersion trend curve...
... done in 0.43 seconds.

  self.fit_dispersion_prior()
Fitting MAP dispersions...
... done in 4.58 seconds.

Fitting LFCs...
... done in 2.52 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 1.32 seconds.



Log2 fold change & Wald test p-value: Condition Assay vs Plasmid
                         baseMean  log2FoldChange     lfcSE      stat  \
CATAAAGCAGGAAACCCCCG  1846.577071        0.065910  0.207367  0.317843   
CAGGCGACGAGTTTGAACTG  2929.038263        0.315134  0.197948  1.592001   
CCATGTTGGACATTACCTCG  2699.427981        0.115012  0.195978  0.586862   
GTAGGCCACGCACGGCAGTG  2810.192289        0.115775  0.205327  0.563856   
GTGTGCTGAGAGTGTCAACA  1086.445176        0.047712  0.209932  0.227271   
...                           ...             ...       ...       ...   
TTTGGTCAACGCATAGCTTG  3211.232395        0.161357  0.194763  0.828481   
TTTTACCTTGTTCACATGGA  2668.829861       -0.159402  0.196003 -0.813260   
TTTTGACTCTAATCACCGGT  1946.203793        0.081691  0.196066  0.416653   
TTTTTAATACAAGGTAATCT   309.495430       -1.596200  0.731767 -2.181296   
TTTTTCTCACCCGATGAATC  2008.355762       -0.685664  0.196140 -3.495797   

                        pvalue      padj  
CATAAAGCAGGAAAC

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 4.96 seconds.

Fitting dispersion trend curve...
... done in 0.44 seconds.

  self.fit_dispersion_prior()
Fitting MAP dispersions...
... done in 4.73 seconds.

Fitting LFCs...
... done in 2.49 seconds.

Refitting 0 outliers.

Running Wald tests...


Log2 fold change & Wald test p-value: Condition Assay vs Plasmid
                         baseMean  log2FoldChange     lfcSE      stat  \
CATAAAGCAGGAAACCCCCG  2912.220597        0.189037  0.226153  0.835880   
CAGGCGACGAGTTTGAACTG  4403.836240        0.336997  0.246072  1.369504   
CCATGTTGGACATTACCTCG  3619.025327       -0.109515  0.227282 -0.481848   
GTAGGCCACGCACGGCAGTG  4798.739487        0.398384  0.226601  1.758082   
GTGTGCTGAGAGTGTCAACA  1617.163319        0.049188  0.227859  0.215869   
...                           ...             ...       ...       ...   
TTTGGTCAACGCATAGCTTG  5082.320218        0.289308  0.225861  1.280909   
TTTTACCTTGTTCACATGGA  3455.558386       -0.484452  0.238893 -2.027902   
TTTTGACTCTAATCACCGGT  3032.309814        0.179306  0.225659  0.794590   
TTTTTAATACAAGGTAATCT   447.353090       -1.701660  0.286090 -5.947993   
TTTTTCTCACCCGATGAATC  2980.661813       -0.691253  0.237960 -2.904917   

                            pvalue          padj  
CATAAAG

... done in 1.23 seconds.



In [5]:
# Organize the result dictionary into one long dataframe:
# one row per (cell line, category) with the columns sig_n and tot_n.
# The per-cell-line frames are collected first and concatenated once, instead of
# growing a frame inside the loop starting from an empty DataFrame.
per_cell_line = [
    pd.DataFrame.from_dict(counts, orient = 'index')
    .assign(cell_line = cell_lines)
    .reset_index(names = ['category'])
    for cell_lines, counts in dict_res.items()
]
df_res = pd.concat(per_cell_line)

# Map every category label onto the shared names, then pool the counts of labels
# that collapse into the same name. Only the two count columns are summed, so the
# string 'category' column is never aggregated and needs no separate drop.
df_res['mutation_category'] = df_res['category'].replace(reverse_dict)
df_res = df_res.groupby(['mutation_category', 'cell_line'], as_index=False)[['sig_n', 'tot_n']].sum()

# Hit rate as a fraction (sig_n / tot_n, not multiplied by 100)
df_res['perc'] = df_res['sig_n']/df_res['tot_n']
# Save dataframe
df_res.to_csv(join(output_path, 'DESeq_Hannah.csv'))

### 2. Martin et al

In [8]:
# Gene symbols of the essential-gene set (ess_genes), echoed as one comma-separated line
ess_genes = [
    'ATR', 'BARD1', 'BRCA1', 'BRCA2',
    'RAD51C', 'RAD51D', 'XRCC3',
]
print(*ess_genes, sep=',')

ATR,BARD1,BRCA1,BRCA2,RAD51C,RAD51D,XRCC3


In [25]:
# Gene symbols whose guides form the essential-gene set for this dataset
ess_genes = ['ATR','BARD1','BRCA1','BRCA2','RAD51C','RAD51D','XRCC3']

# Read-count columns used as samples, and the sample sheet assigning each one to a
# time point. Both are the same for every cell line, so they are built once here.
cols = ['T0_R1', 'T0_R2', 'T0_R3',
        'T18_UNT_R1', 'T18_UNT_R2', 'T18_UNT_R3']
clinical_df = pd.DataFrame(index=cols, columns=['Day'])
clinical_df['Day'] = ['D0','D0','D0','D18','D18','D18']

# cell line -> {category label -> {'sig_n': ..., 'tot_n': ...}}
dict_res_ddr = {}
for cell_line in ['HAP1','MCF10A', 'MCF7']:
    df = pd.read_csv(join(data_path, f'Raquel_{cell_line}_score.csv'))
    df_rc = pd.read_csv(join(data_path, f'{cell_line}_sub1_readcount.csv'))

    # Guides whose ID starts with 'Non-targeting' get their own Function label
    df.loc[df['sgRNA_ID'].str.startswith('Non-targeting'), 'Function'] = 'Non-targeting'

    try:
        # Samples as rows, guides as columns; compare D18 against D0
        dds = DeseqDataSet(
            counts=df_rc.T.loc[cols, :],
            metadata=clinical_df.loc[cols, :],
            design_factors='Day',
            refit_cooks=True
        )
        dds.deseq2()  # Fit dispersion and normalization

        # Perform DESeq analysis and summarize results
        stat_res = DeseqStats(dds, contrast=('Day', 'D18', 'D0'))
        stat_res.summary()
    except Exception as e:
        # Skip this cell line. Falling through would either raise a NameError
        # (first iteration) or silently reuse `stat_res` from the previous cell
        # line and save those results under the current cell line's name.
        print(f"Error in DESeq analysis: {e}")
        print(f"Skipping cell line {cell_line}")
        continue

    # Put the statistics next to the read counts row by row, then attach the
    # Function label by matching on sgRNA_ID.
    df_res = pd.concat([df_rc,stat_res.results_df.reset_index()], axis = 1)
    df_res = pd.concat([df_res.set_index('sgRNA_ID'), df[['sgRNA_ID','Function']].set_index('sgRNA_ID')], axis = 1)

    # Guides in the essential genes; written out once per cell line
    df_ess = df_res[df_res['Gene'].isin(ess_genes)]
    df_ess.to_csv(join(output_path, 'Martin'+cell_line+'.csv'))

    # Only cell lines whose analysis succeeded get an entry in the result dictionary
    dict_res_ddr[cell_line] = {}

    # One count per Function label present in the table (missing labels ignored)
    categ_list = [i for i in df_res['Function'].unique() if not pd.isna(i)]
    for categ_ in categ_list:
        n_sig, n_tot = mu.sig_perc(df_ess, 'padj', categ_, cut_off = 0.01, category_col = 'Function')
        dict_res_ddr[cell_line][categ_] = {'sig_n':n_sig, 'tot_n':n_tot}

    # Non-targeting guides are counted over the whole table, not only the essential genes
    df_res_nct = df_res[df_res['Function'] == 'Non-targeting']
    n_sig, n_tot = mu.sig_perc(df_res_nct, 'padj' ,'all', cut_off = 0.01)
    dict_res_ddr[cell_line]['Non-Targeting'] = {'sig_n':n_sig, 'tot_n':n_tot}

Fitting size factors...
... done in 0.01 seconds.

Fitting dispersions...
... done in 4.65 seconds.

Fitting dispersion trend curve...
... done in 0.46 seconds.

Fitting MAP dispersions...
... done in 5.21 seconds.

Fitting LFCs...
... done in 2.74 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 1.21 seconds.



Log2 fold change & Wald test p-value: Day D18 vs D0
          baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
0      1626.727828        0.583498  0.220218  2.649645  0.008058  0.075689
1       941.798500        0.176377  0.359398  0.490755  0.623600  0.825523
2       677.877454       -0.477415  0.312128 -1.529551  0.126128  0.380494
3       644.645563        0.250150  0.381862  0.655080  0.512416  0.756797
4       620.075332       -0.211889  0.272716 -0.776961  0.437182  0.705748
...            ...             ...       ...       ...       ...       ...
11393   466.738829       -0.304187  0.521412 -0.583391  0.559630  0.785095
11394   771.702454       -0.480240  0.386273 -1.243267  0.213770  0.499146
11395  1125.332420       -0.030882  0.241692 -0.127775  0.898327  0.963188
11396   765.407782       -0.091460  0.372356 -0.245624  0.805973  0.924010
11397   880.522620       -0.188182  0.323286 -0.582093  0.560504  0.785347

[11398 rows x 6 columns]


Fitting size factors...
... done in 0.01 seconds.

Fitting dispersions...
... done in 4.66 seconds.

Fitting dispersion trend curve...
... done in 0.65 seconds.

  ) - np.log(self[:, self.non_zero_genes].varm["fitted_dispersions"])
  result[n] = np.nan
Fitting MAP dispersions...
... done in 5.21 seconds.

  self.varm["_outlier_genes"] = np.log(self.varm["genewise_dispersions"]) > np.log(
Fitting LFCs...
... done in 2.64 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 1.16 seconds.



Log2 fold change & Wald test p-value: Day D18 vs D0
          baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
0      1825.401797       -0.362161  0.108843 -3.327381  0.000877  0.021439
1      1494.170364       -0.187353  0.151703 -1.234999  0.216831  0.689137
2       904.678049       -0.107530  0.252201 -0.426368  0.669839  0.927829
3      1080.798986        0.657165  0.266673  2.464311  0.013728  0.168225
4       866.437238       -0.108555  0.246603 -0.440202  0.659791  0.926932
...            ...             ...       ...       ...       ...       ...
11393   592.681452       -0.092397  0.383054 -0.241211  0.809391  0.965950
11394   963.758208       -0.180630  0.203616 -0.887112  0.375019  0.811330
11395  1602.301221       -0.048485  0.122119 -0.397030  0.691346  0.931692
11396   990.721299       -0.020621  0.227666 -0.090575  0.927830  0.986954
11397  1386.821154        0.179147  0.130609  1.371629  0.170179  0.635439

[11398 rows x 6 columns]


Fitting size factors...
... done in 0.01 seconds.

Fitting dispersions...
... done in 4.60 seconds.

Fitting dispersion trend curve...
... done in 0.46 seconds.

Fitting MAP dispersions...
... done in 5.53 seconds.

Fitting LFCs...
... done in 2.59 seconds.

Refitting 0 outliers.

Running Wald tests...


Log2 fold change & Wald test p-value: Day D18 vs D0
          baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
0      1873.315232        0.321279  0.216725  1.482428  0.138226  0.510968
1      1569.723016        0.533366  0.262996  2.028043  0.042556  0.300448
2       857.886004        0.233896  0.308432  0.758338  0.448249  0.775671
3       807.452021        0.104579  0.217398  0.481049  0.630482  0.869143
4       970.241614        0.070118  0.179717  0.390156  0.696421  0.895624
...            ...             ...       ...       ...       ...       ...
11393   645.722595        0.321099  0.271208  1.183959  0.236429  0.624789
11394   979.650542        0.038830  0.301723  0.128693  0.897600  0.969861
11395  1610.574960       -0.045912  0.260704 -0.176108  0.860209  0.957354
11396  1002.653870        0.236238  0.257274  0.918236  0.358495  0.719600
11397  1242.371313        0.191422  0.217480  0.880184  0.378759  0.733270

[11398 rows x 6 columns]


... done in 1.14 seconds.



In [16]:
# Inspect the per-cell-line counts collected in dict_res_ddr.
# NOTE(review): the saved output of this cell carries execution count 16, lower than
# the count of the loop cell that fills dict_res_ddr (25), so it may be stale.
dict_res_ddr

{'HAP1': {}}

In [7]:
# Organize the result dictionary into one long dataframe:
# one row per (cell line, category) with the columns sig_n and tot_n.
# The per-cell-line frames are collected first and concatenated once, instead of
# growing a frame inside the loop starting from an empty DataFrame.
per_cell_line_ddr = [
    pd.DataFrame.from_dict(counts, orient = 'index')
    .assign(cell_line = cell_lines)
    .reset_index(names = ['category'])
    for cell_lines, counts in dict_res_ddr.items()
]
df_res_ddr = pd.concat(per_cell_line_ddr)

# Map every category label onto the shared names, then pool the counts of labels
# that collapse into the same name (e.g. 'Non-targeting' and 'Non-Targeting').
# Only the two count columns are summed, so the string 'category' column is never
# aggregated and needs no separate drop.
df_res_ddr['mutation_category'] = df_res_ddr['category'].replace(reverse_dict)
df_res_ddr = df_res_ddr.groupby(['mutation_category', 'cell_line'], as_index=False)[['sig_n', 'tot_n']].sum()

# Hit rate as a fraction (sig_n / tot_n, not multiplied by 100)
df_res_ddr['perc'] = df_res_ddr['sig_n']/df_res_ddr['tot_n']
# Save dataframe
df_res_ddr.to_csv(join(output_path, 'DESeq_Martin.csv'))

### Intersection hits

In [73]:
def _martin_sig_synonymous(cell_line, alpha = 0.05):
    """Return the synonymous guides with padj below `alpha` for one cell line.

    Reads 'Martin<cell_line>.csv' from output_path (first column as index), keeps
    the rows whose 'Function' equals 'synonymous', and of those the rows with
    'padj' < alpha.
    """
    table = pd.read_csv(join(output_path, 'Martin' + cell_line + '.csv'), index_col = 0)
    synonymous = table[table['Function'] == 'synonymous']
    return synonymous[synonymous['padj'] < alpha]


# One filtered table per cell line. The `mis_` prefix is kept because later cells
# use these names; the frames hold synonymous guides.
# NOTE(review): the cut-off here is 0.05, while the hit-rate cells use 0.01 - confirm intended.
mis_sig_df1 = _martin_sig_synonymous('HAP1')
mis_sig_df2 = _martin_sig_synonymous('MCF10A')
mis_sig_df3 = _martin_sig_synonymous('MCF7')

In [74]:
# Index labels present in all three filtered tables
sets = [set(frame.index) for frame in (mis_sig_df1, mis_sig_df2, mis_sig_df3)]
intersection_result = sets[0].intersection(*sets[1:])

In [76]:
def _hannah_sig_synonymous(cell_line, alpha = 0.05):
    """Return the synonymous guides with padj below `alpha` for one cell line.

    Reads 'Hannah<cell_line>.csv' from output_path (first column as index), keeps
    the rows whose 'Category' equals 'Synonymous', and of those the rows with
    'padj' < alpha.
    """
    table = pd.read_csv(join(output_path, 'Hannah' + cell_line + '.csv'), index_col = 0)
    synonymous = table[table['Category'] == 'Synonymous']
    return synonymous[synonymous['padj'] < alpha]


# One filtered table per cell line. The `mis_` prefix is kept because later cells
# use these names; the frames hold synonymous guides.
# NOTE(review): the cut-off here is 0.05, while the hit-rate cells use 0.01 - confirm intended.
mis_sig_df1 = _hannah_sig_synonymous('HAP1')
mis_sig_df2 = _hannah_sig_synonymous('HA1E')
mis_sig_df3 = _hannah_sig_synonymous('A375')
mis_sig_df4 = _hannah_sig_synonymous('MELJUSO')
mis_sig_df5 = _hannah_sig_synonymous('OVCAR8')

In [77]:
# Index labels present in all five filtered tables
sets = [
    set(frame.index)
    for frame in (mis_sig_df1, mis_sig_df2, mis_sig_df3, mis_sig_df4, mis_sig_df5)
]
intersection_result = sets[0].intersection(*sets[1:])

In [78]:
# Number of index labels shared by every set in `sets`
len(intersection_result)

30

  log_alpha_hat = np.log(alpha_hat)
  x0=np.log(alpha_hat),
  sign, logdet = _umath_linalg.slogdet(a, signature=signature)


In [66]:
# Pairwise overlap: index labels found in both mis_sig_df3 and mis_sig_df4
mis_sig_df3.index.intersection(mis_sig_df4.index)

Index(['AAAACCGGCGCTGGAACCAC', 'AAACCAGTCATACCACCCAA', 'AACCCACAGTAGTAACCTGC',
       'ACACTAGGCCTGGGACAGAA', 'CAGTGGTCAAACACACACTG', 'CATTTCCTCATCACAGACAT',
       'CTAATTCCTCATCAGAAAGT', 'GATCCTCATCTCCAGCAATG', 'GATTCCTGAGTTGAACAAAG',
       'GCTCTTCTTTCCTCATCCCC',
       ...
       'TCTTCTTCAACAGGTGGCTG', 'TGAACACTTCGATGACACAG', 'TGCATCACCTCGTCACAGAA',
       'TGCGCCATCGGTGACTCGGA', 'TGCTTCAGCCAAGGCAGCAA', 'TGTATACCAGGCATATGACA',
       'TGTTCATCTTCTTCTTCAAC', 'TGTTCCGTTCTTCAATTCAA', 'TTCATGTCTGTTCTTTAAGA',
       'TTCTCAGATATGGTCTTAAA'],
      dtype='object', length=108)

In [64]:
# Pairwise overlap: index labels found in both mis_sig_df2 and mis_sig_df1
mis_sig_df2.index.intersection(mis_sig_df1.index)

Index(['AAAACCGGCGCTGGAACCAC', 'AAACCAGTCATACCACCCAA', 'ACACTAGGCCTGGGACAGAA',
       'ATATCATTAAGCAATGCCAC', 'CAGTGGTCAAACACACACTG', 'CATTTCCTCATCACAGACAT',
       'CCTCTGGAAAGAGATCAGCC', 'CTAATTCCTCATCAGAAAGT', 'CTCCACTTCCAAAAGCAGCC',
       'CTCCATGTACTGATAATATA',
       ...
       'TGTATCTCGAACAACTACAC', 'TGTTCATCTTCTTCTTCAAC', 'TGTTCCGTTCTTCAATTCAA',
       'TTACATCAGTAGCTGTCAGA', 'TTATATCCGCCACGATTCCC', 'TTCATGTCTGTTCTTTAAGA',
       'TTCTCAGATATGGTCTTAAA', 'TTCTGACTGCCATAATCCAG', 'TTGCACAGAGTAAAGACTGA',
       'TTTAATCCAATGCCCAGGAA'],
      dtype='object', length=155)

In [65]:
# Pairwise overlap: index labels found in both mis_sig_df2 and mis_sig_df3
mis_sig_df2.index.intersection(mis_sig_df3.index)

Index(['AAAACCGGCGCTGGAACCAC', 'AAACCAGTCATACCACCCAA', 'AACCCACAGTAGTAACCTGC',
       'ACACTAGGCCTGGGACAGAA', 'CAGTGGTCAAACACACACTG', 'CATTTCCTCATCACAGACAT',
       'CTAATTCCTCATCAGAAAGT', 'GATCCTCATCTCCAGCAATG', 'GATTCCTGAGTTGAACAAAG',
       'GCTCTTCTTTCCTCATCCCC',
       ...
       'TGAACACTTCGATGACACAG', 'TGAATCCTCCACGTCCTCTA', 'TGCATCACCTCGTCACAGAA',
       'TGCGCCATCGGTGACTCGGA', 'TGCTTCAGCCAAGGCAGCAA', 'TGTATACCAGGCATATGACA',
       'TGTTCATCTTCTTCTTCAAC', 'TGTTCCGTTCTTCAATTCAA', 'TTCATGTCTGTTCTTTAAGA',
       'TTCTCAGATATGGTCTTAAA'],
      dtype='object', length=106)