In [163]:
import time 
import numpy as np
import pandas as pd
import csv
from collections import defaultdict
import random 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.signal as signal
import os
from sklearn.cluster import AgglomerativeClustering

def random_color():
    rgbl=[random.randint(0,255),random.randint(0,255),random.randint(0,255)]
    return tuple(rgbl)

def meanFilterBinarization(introSignal, window, threshold):
    introSignal = np.array(introSignal)
    mf = signal.medfilt(introSignal, window)
    mfbin = np.where(mf<=threshold, -10, 10)
    return mfbin


def masker(df, sample):
    out = list()
    for i, sdf in df.groupby(['CHROM']):
        mask = meanFilterBinarization(sdf[sample], 7, 5)
        out.extend(list(mask))
    return out

def plotSample(mw,sample):
    colors = [random_color() for x in range(2)]
    fig = make_subplots(rows=6, cols=2,subplot_titles=(["CHROM %s" %x for x in range(1,12)]), shared_xaxes=True, shared_yaxes=False)
    row = 1
    col = 1
    for chrom in mw['CHROM'].unique():
        fig.append_trace(go.Line(
                    x=mw[mw['CHROM'] == chrom]['GENPOS'],
                    y=mw[mw['CHROM'] == chrom]['mask'],
                    name = 'Median filter (k=7, t=-5)',
                    line = dict(
                        color = "rgb(%s, %s, %s)"%(colors[1][0],colors[1][1],colors[1][2]) 
                    )

                ), row=row, col=col, )
        
        fig.append_trace(go.Line(
                    x=mw[mw['CHROM'] == chrom]['GENPOS'],
                    y=mw[mw['CHROM'] == chrom][sample],
                    name = 'Max Winwindow 10',
                    line = dict(
                        color = "rgb(%s, %s, %s)"%(colors[0][0],colors[0][1],colors[0][2]) 
                    )

                ), row=row, col=col, )

        col += 1
        if col>2:
            col = 1
            row += 1


    fig.update_layout(height=1500, width=1000,
                      title_text=sample[:-4] + 'P. acutifolius Max Window(threshold=10, s: 10)',
                      showlegend=False,
                         yaxis=dict( zeroline=True))
    fig.update_yaxes(range=[-25, 25])
    #sample = sample.replace('_GT', ':GT')
    
    #fig.write_image("./GCDTPLOTS/%s.pdf"%(sample))
    fig.show()
    

def getSummaryByIntrogression(df,sample,distance_threshold=5000000):
    #sample = sample.replace('_GT', ':GT')
    x = masker(df, sample+'_PAc')
    df['mask'] = x
    #plotSample(df, sample+'_PAc')
    data = df[df['mask']==10].copy()
    if data.shape[0]>1:
        clustering = AgglomerativeClustering(n_clusters=None, linkage='single',distance_threshold=distance_threshold)
        X = np.array(data['pi']).reshape(-1, 1)
        out = clustering.fit(X)
        data['label'] = out.labels_
        introgressions = pd.DataFrame(columns=['sample','CHROM','pi', 'pf', 'lp', 'gpi', 'gpf', 'lgp', 'nSNP'])

        for i, sdf in data.groupby(['label']):
            chrom = sdf['CHROM'].unique()
            pi = sdf['pi'].min()
            pf = sdf['pf'].max()
            lp = pf - pi
            gpi = sdf['GENPOS'].min()
            gpf = sdf['GENPOS'].max()
            lgp = gpf - gpi
            nSNP = (sdf.shape[0]-1) * 10 + sdf.iloc[-1]['nSNP']
            row = pd.Series({'sample':sample,'CHROM':chrom[0],'pi':pi, 'pf':pf, 'lp':lp, 'gpi':gpi, 'gpf':gpf, 'lgp':lgp, 'nSNP':nSNP})
            introgressions = introgressions.append(row, ignore_index=True)
        return introgressions
    else:
        return False


In [181]:
df = pd.read_csv('./salidas2/ALB_034-p21__GBS.csv',sep='\t')
out = getSummaryByIntrogression(df, 'ALB_034-p21__GBS',)
plotSample(df, 'ALB_034-p21__GBS_PAc',)


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [157]:

sdf.insert(loc=0, column='cluster', value=clustering.labels_)
fig = px.scatter(sdf, x="pi", y="AM_010-p13__GBS_PAc", color='cluster')
fig.show()

In [199]:
def getIntrogressionProfile(df, sample):
    clustering = AgglomerativeClustering(affinity='euclidean', compute_full_tree=True,
                            connectivity=None, distance_threshold=2,
                            linkage='single', memory=None, n_clusters=None,
                            pooling_func='deprecated')
    outDf = list()
    for CHROM, sdf in df.groupby('CHROM'):
        sdf = sdf[sdf[sample] >1]
        if sdf.shape[0] > 1:
            clustering.fit(sdf[['GENPOS', sample]])
            sdf.insert(loc=0, column='cluster', value=clustering.labels_)
            print(CHROM)
            out = pd.DataFrame(columns=['CHROM','n','minBound2','maxBound2','minBound','maxBound','nwindows','length','length2'])
            for n,posIntro in sdf.groupby('cluster'):
                        minBound = posIntro.pi.min()
                        maxBound = posIntro.pf.max()
                        minBound2 = posIntro.GENPOS.min()
                        maxBound2 = posIntro.GENPOS.max()
                        length = maxBound-minBound
                        length2 = maxBound2-minBound2
                        nwindows = posIntro.shape[0]
                        if nwindows > 7:
                            out.loc[n] = [CHROM, n, minBound2, maxBound2,minBound, maxBound, nwindows,length,length2]
            outDf.append(out)
    return pd.concat(outDf)
                        
                        
    