### Sweep gene finding

We have a table of sweep signals and, for each signal, the genes that lie underneath it. \
We also have a list of differentially expressed (DE) genes. 

We want to find which of our DE genes lie underneath a sweep and, for each of those genes, which sweeps it lies under.

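For each sweep signal, we record the DE genes that lie underneath it; we then invert that mapping to list, for each DE gene, the sweeps it falls under.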

In [2]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd

In [3]:
# Sample sheet: a tab-separated table read with its first line as the header.
df = pd.read_table("config/samples.tsv", sep="\t")

In [7]:
"resources/reads/" + df['samples']

0     resources/reads/BusRes1
1     resources/reads/BusRes2
2     resources/reads/BusRes3
3     resources/reads/BusRes4
4     resources/reads/BusRes5
5     resources/reads/BusRes6
6     resources/reads/BusSus1
7     resources/reads/BusSus2
8     resources/reads/BusSus3
9     resources/reads/BusSus4
10    resources/reads/BusSus5
11    resources/reads/BusSus6
12       resources/reads/Kis1
13       resources/reads/Kis2
14       resources/reads/Kis3
15       resources/reads/Kis4
Name: samples, dtype: object

In [5]:
# NOTE(review): assign() is called without any column arguments, so it adds
# nothing and this cell only displays a copy of df. The stored output below is
# a Series named 'samples', not a DataFrame, so it appears to come from an
# earlier version of this cell -- re-run it or remove the cell.
df.assign()

0     BusRes1
1     BusRes2
2     BusRes3
3     BusRes4
4     BusRes5
5     BusRes6
6     BusSus1
7     BusSus2
8     BusSus3
9     BusSus4
10    BusSus5
11    BusSus6
12       Kis1
13       Kis2
14       Kis3
15       Kis4
Name: samples, dtype: object

In [1]:
# Execute the project's helper script inside this kernel; %run leaves the
# script's top-level names available to the cells that follow.
# NOTE(review): later cells use `np` and `defaultdict`, which this notebook
# does not import itself -- presumably they are defined by tools.py; confirm.
%run workflow/scripts/tools.py

In [2]:
# Table of sweep signals; the columns used further down are 'uid',
# 'overlapping_genes', 'peak_end_seqid', 'epicenter_coord' and 'known_loci'.
signals = pd.read_csv("../rna-seq-busia/resources/signals.csv")

# Differential-expression results for one contrast.
DEgenes = pd.read_csv("results/genediff/BusiaSus_BusiaRes.csv")

# Keep the rows that pass both cut-offs: padj below 0.05 and FC above 1.
is_significant = DEgenes['padj'] < 0.05
is_up = DEgenes['FC'] > 1
sigup = DEgenes[is_significant & is_up]

In [3]:
# Map every sweep signal (keyed by its 'uid') to the genes underneath it:
#   sweep  -> only the genes that are also in the significant DE set (sigup)
#   nswept -> every gene listed for the signal
sweep = {}
nswept = {}

# Signals without an 'overlapping_genes' entry are left out of both dicts.
has_genes = signals['overlapping_genes'].notnull()
uid_and_genes = signals.loc[has_genes, ['uid', 'overlapping_genes']]

for uid, gene_string in uid_and_genes.itertuples(index=False, name=None):
    # 'overlapping_genes' is a single-space-separated list of gene IDs.
    candidates = np.array(gene_string.split(" "))
    is_de = np.isin(candidates, sigup['GeneID'])

    sweep[uid] = candidates[is_de]
    nswept[uid] = candidates

In [4]:
# Flatten the per-sweep arrays into one array of DE genes under any sweep
# (a gene appears once for every sweep it lies under).
genes = np.concatenate([*sweep.values()])

# Rows of the significant DE table whose gene is under at least one sweep.
is_swept = np.isin(sigup['GeneID'], genes)
swept = sigup.loc[is_swept]

In [5]:
# Collapse each sweep's array of DE genes into one space-separated string.
# The isinstance guard keeps the cell safe to re-run: joining an already
# joined string would put a space between every character.
for k, v in sweep.items():
    sweep[k] = v if isinstance(v, str) else ' '.join(v)

# One row per sweep signal: its uid and the DE genes underneath it.
sweptDE = pd.DataFrame.from_dict(sweep, orient='index', columns=['overlapping_DE_genes'])
sweptDE = sweptDE.reset_index().rename(columns={'index': 'Ag1000g_sweep'})

# Attach each sweep's annotation by looking it up by uid rather than trusting
# row positions to line up. keep='last' mirrors the dict above, where a
# repeated uid keeps the value of its last occurrence.
annotation = (
    signals[signals['overlapping_genes'].notnull()]
    .drop_duplicates(subset='uid', keep='last')
    .set_index('uid')
)
sweptDE['overlapping_genes'] = sweptDE['Ag1000g_sweep'].map(annotation['overlapping_genes'])
sweptDE['chromosome'] = sweptDE['Ag1000g_sweep'].map(annotation['peak_end_seqid'])
sweptDE['epicenter'] = sweptDE['Ag1000g_sweep'].map(annotation['epicenter_coord'])
sweptDE['known_loci'] = sweptDE['Ag1000g_sweep'].map(annotation['known_loci'])

# Invert the table: for every DE gene, where it is and which sweeps cover it.
wheresweep = defaultdict(dict)
whatsweep = defaultdict(list)

for row in sweptDE.itertuples(index=False):

    # dict.fromkeys de-duplicates while keeping order, so each (gene, sweep)
    # pair is recorded exactly once. Looping over the flattened `genes` array
    # instead revisits a gene once per sweep it lies under and appends every
    # sweep name again on each visit.
    for gene in dict.fromkeys(row.overlapping_DE_genes.split(" ")):

        if gene == '':
            # A sweep with no DE genes has an empty string here.
            continue

        # NOTE: for a gene under several sweeps these three fields end up
        # holding the values of the last sweep listed in sweptDE.
        wheresweep[gene]['chrom'] = row.chromosome
        wheresweep[gene]['epicenter'] = row.epicenter
        wheresweep[gene]['known_loci'] = row.known_loci

        # str() so that non-string uids can still be joined below.
        whatsweep[gene].append(str(row.Ag1000g_sweep))

for k, v in whatsweep.items():
    whatsweep[k] = ' '.join(v)

In [13]:
# Per-gene location fields (chrom / epicenter / known_loci), indexed by gene.
dfwhere = pd.DataFrame.from_dict(wheresweep, orient='index')
# Per-gene space-separated list of sweep uids, indexed by gene.
dfwhat = pd.DataFrame.from_dict(whatsweep, orient='index', columns=['Ag1000g_sweeps'])

# Put the two side by side and turn the gene index into a 'GeneID' column.
df = (
    pd.concat([dfwhat, dfwhere], axis=1)
    .reset_index()
    .rename(columns={'index': 'GeneID'})
)

# Annotate the swept DE genes with their sweep information and save them.
swept = pd.merge(swept, df)

swept.to_csv("swept.csv")

### Script

In [19]:
# Cut-offs for calling a gene significantly up-regulated:
# adjusted p-value must be below the first, FC above the second.
pval_threshold, fc_threshold = 0.05, 1.5

In [None]:
# When this code is executed by Snakemake, a `snakemake` object is injected
# and the thresholds are taken from the rule's params. In an interactive
# session that object does not exist, so keep the thresholds already defined
# instead of stopping the notebook with a NameError.
if 'snakemake' in globals():
    pval_threshold = snakemake.params['pval']
    fc_threshold = snakemake.params['fc']

In [37]:
# List of contrasts to process; the loop below reads its 'contrast' column.
comparisons = pd.read_csv("resources/DE.contrast.list", sep=",")

In [40]:
def _de_genes_by_sweep(signals, sigup):
    """Return one row per sweep signal with the significant DE genes under it.

    Parameters
    ----------
    signals : pandas.DataFrame
        Sweep signals with columns 'uid', 'overlapping_genes' (gene IDs
        separated by single spaces), 'peak_end_seqid', 'epicenter_coord'
        and 'known_loci'.
    sigup : pandas.DataFrame
        Significant DE genes; only its 'GeneID' column is read.

    Returns
    -------
    pandas.DataFrame
        Columns 'Ag1000g_sweep', 'overlapping_DE_genes', 'overlapping_genes',
        'chromosome', 'epicenter' and 'known_loci'. Signals without an
        'overlapping_genes' entry are left out; if a uid occurs more than
        once, its last occurrence is used.
    """
    # Every column below is taken from this one filtered frame, so the
    # annotation can never drift out of step with the sweep it belongs to.
    with_genes = (
        signals[signals['overlapping_genes'].notnull()]
        .drop_duplicates(subset='uid', keep='last')
    )
    de_ids = sigup['GeneID'].to_numpy()

    overlapping_de = []
    for gene_string in with_genes['overlapping_genes']:
        sweptgenes = np.array(gene_string.split(" "))
        overlapping_de.append(' '.join(sweptgenes[np.isin(sweptgenes, de_ids)]))

    return pd.DataFrame({
        'Ag1000g_sweep': with_genes['uid'].to_numpy(),
        'overlapping_DE_genes': overlapping_de,
        'overlapping_genes': with_genes['overlapping_genes'].to_numpy(),
        'chromosome': with_genes['peak_end_seqid'].to_numpy(),
        'epicenter': with_genes['epicenter_coord'].to_numpy(),
        'known_loci': with_genes['known_loci'].to_numpy(),
    })


def _sweeps_by_gene(sweptDE):
    """Invert the per-sweep table into one row per DE gene.

    Returns a DataFrame with columns 'GeneID', 'Ag1000g_sweeps' (the uids of
    all sweeps covering the gene, space-separated), 'chrom', 'epicenter' and
    'known_loci'. For a gene under several sweeps the last three hold the
    values of the last sweep listed in `sweptDE`.
    """
    per_gene = {}

    for row in sweptDE.itertuples(index=False):
        # dict.fromkeys de-duplicates while keeping order, so every
        # (gene, sweep) pair is recorded exactly once.
        for gene in dict.fromkeys(row.overlapping_DE_genes.split(" ")):
            if gene == '':
                # A sweep with no DE genes has an empty string here.
                continue

            entry = per_gene.setdefault(gene, {'GeneID': gene, 'Ag1000g_sweeps': []})
            # str() so that non-string uids can still be joined below.
            entry['Ag1000g_sweeps'].append(str(row.Ag1000g_sweep))
            entry['chrom'] = row.chromosome
            entry['epicenter'] = row.epicenter
            entry['known_loci'] = row.known_loci

    for entry in per_gene.values():
        entry['Ag1000g_sweeps'] = ' '.join(entry['Ag1000g_sweeps'])

    # Explicit columns keep the header stable even when no gene is swept.
    return pd.DataFrame(
        list(per_gene.values()),
        columns=['GeneID', 'Ag1000g_sweeps', 'chrom', 'epicenter', 'known_loci'],
    )


def find_swept_de_genes(signals, sigup):
    """Return the rows of `sigup` whose gene lies under a sweep, annotated.

    The result holds every column of `sigup` plus 'Ag1000g_sweeps', 'chrom',
    'epicenter' and 'known_loci'. Genes that are not under any sweep are
    dropped by the inner merge on 'GeneID'.
    """
    sweptDE = _de_genes_by_sweep(signals, sigup)
    return sigup.merge(_sweeps_by_gene(sweptDE), on='GeneID')


# to_csv does not create missing directories, so make sure the output
# directory exists before the first contrast is written.
os.makedirs("results/genediff/ag1000gSweeps", exist_ok=True)

for comp in comparisons['contrast']:

    DEgenes = pd.read_csv(f"results/genediff/{comp}.csv")
    # Significant genes: padj below and FC above the configured thresholds.
    sigup = DEgenes[(DEgenes['padj'] < pval_threshold) & (DEgenes['FC'] > fc_threshold)]

    swept = find_swept_de_genes(signals, sigup)
    swept.to_csv(f"results/genediff/ag1000gSweeps/{comp}_swept.tsv", sep="\t")

In [42]:
ls results/genediff/

ag1000g_BusiaSus_BusiaRes_swept.tsv
ag1000g_Kisumu_BusiaRes_swept.tsv
ag1000g_Kisumu_BusiaSus_swept.tsv
[0m[01;32mBusiaSus_BusiaRes.csv[0m*
hits.csv
[01;32mKisumu_BusiaRes.csv[0m*
Kisumu_BusiaSus_BusiaRes.down.progressive.tsv
Kisumu_BusiaSus_BusiaRes.up.progressive.tsv
[01;32mKisumu_BusiaSus.csv[0m*
[01;32mRNA-Seq_diff.xlsx[0m*
[01;32mVolcano_plot_BusiaSus_BusiaRes.pdf[0m*
[01;32mVolcano_plot_Kisumu_BusiaRes.pdf[0m*
[01;32mVolcano_plot_Kisumu_BusiaSus.pdf[0m*
