<a href="https://colab.research.google.com/github/sueannmok/tau_digest_and_kinetics_tools/blob/main/upload_copy_spectramulti_multi_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We start by loading the necessary libraries.

In [None]:
import pandas as pd
from scipy.spatial.distance import pdist, jaccard
from scipy.stats import pearsonr,spearmanr
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.interpolate import interp1d
from scipy.spatial.distance import correlation,cosine
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from matplotlib import pyplot as plt
import numpy as np
from seaborn import heatmap
%matplotlib inline

We load the CSV files into dataframes.

In [None]:
# Read both input CSV files (comma-separated, '.' as the decimal mark).
df1, df2 = (
    pd.read_csv(path, sep=",", decimal=".")
    for path in ('/example.csv', '/example2.csv')
)


From each dataframe associated with a file, we only keep the rows whose kDa value is between 1.9 and 41 (both limits included).

In [None]:
# Keep only the rows whose 'kDa' value lies in the closed interval [1.9, 41].
df1 = df1[df1['kDa'].between(1.9, 41)]
df2 = df2[df2['kDa'].between(1.9, 41)]



In [None]:
# Show the current contents of df1 as the cell output.
df1

The next function is used to combine a list of dataframes. Since different kDa values might appear in different files, we perform a linear interpolation and obtain values with a step of 0.1 in the range from 6 up to (but not including) 24.

In [None]:
def buildDataframe(dfList, dfNames, start=6.0, stop=24, step=0.1):
    """
    Combine several dataframes into one by resampling them on a common kDa grid.

    Parameters
    ----------
    dfList : list of dataframe
        Each dataframe must have a 'kDa' column; every column after the first
        one is treated as a sample and is linearly interpolated over 'kDa'.
    dfNames : list of string
        One name per dataframe, used as a prefix for its sample columns.
        Pairs are formed with zip, so extra entries in the longer list are ignored.
    start : float
        First value of the common grid (default 6.0).
    stop : float
        End of the common grid, not included (default 24).
    step : float
        Spacing of the common grid (default 0.1).

    Returns
    ----------
    dataframe
        The first column, 'points', holds the grid; each following column is
        named '<name>-<column>' and holds the interpolated sample.

    Raises
    ----------
    ValueError
        Raised by interp1d when the grid extends beyond the 'kDa' values
        available in one of the dataframes.
    """
    # Build the grid once so that every sample is evaluated on exactly the
    # same points as the 'points' column.
    grid = np.arange(start, stop, step)

    samples = [grid]
    names = ['points']
    for df, name in zip(dfList, dfNames):
        for c in df.columns[1:]:
            f = interp1d(df['kDa'], df[c])
            samples.append(f(grid))
            names.append(f"{name}-{c}")

    # samples is a list of rows (one per sample); transpose to get columns.
    return pd.DataFrame(data=np.transpose(samples), columns=names, index=None)

Combine the loaded dataframes.

In [None]:
# Merge both filtered dataframes, prefixing their sample columns with a label.
newdf = buildDataframe(dfList=[df1, df2], dfNames=['Date1', 'Date2'])

In [None]:
# Show the combined dataframe as the cell output.
newdf

We now define a set of auxiliary functions.

In [None]:
def generate_spectra(df):
    """
    Split a dataframe into its spectra matrix and the sample names.

    Parameters
    ----------
    df : dataframe
         The first column is skipped; every remaining column is taken as one
         sample, with the column label as the sample name.

    Returns
    ----------
    matrix : narray
             One row per sample (the transposed values of the sample columns)
    names  : Index
             The labels of the sample columns
    """
    sample_columns = df.columns[1:]
    # Drop the first column, then flip so that each row is one sample.
    spectra = df.values[:, 1:].T
    return spectra, sample_columns

In [None]:
def compute_distance(vector1, vector2, metric='correlation'):
    """
    Compute the distance of two spectra vectors using either the correlation or cosine distance.

    Parameters
    ----------
    vector1 : array
    vector2 : array
       Must have the same size as vector1
    metric  : string
       Either 'correlation' or 'cosine'

    Returns
    ----------
    The distance between vector1 and vector2 using the given metric.

    Raises
    ----------
    ValueError
        If metric is neither 'correlation' nor 'cosine'.
    """
    if metric == 'correlation':
        return correlation(vector1, vector2)
    if metric == 'cosine':
        return cosine(vector1, vector2)
    # Fail loudly instead of silently returning None for an unknown metric.
    raise ValueError(
        f"Unsupported metric {metric!r}: expected 'correlation' or 'cosine'"
    )

In [None]:
def generate_similarity_matrix(df,metric='correlation'):
    """
    Build the symmetric matrix of pairwise distances between the samples of a dataframe

    Parameters
    ----------
    df : dataframe
        Passed to generate_spectra to obtain one spectrum per sample
    metric  : string
       Either correlation or cosine; forwarded to compute_distance

    Returns
    ----------
    matrix : narray (float32)
        Square matrix whose entry (i, j) is the distance between samples i and j;
        the diagonal is left at zero
    names :
        The sample names returned by generate_spectra
    """
    spectra, names = generate_spectra(df)
    size = len(names)

    matrix = np.zeros((size, size), dtype='float32')
    # Only the upper triangle is computed; each value is mirrored below the diagonal.
    for row in range(size):
        for col in range(row + 1, size):
            distance = compute_distance(spectra[row], spectra[col], metric)
            matrix[row, col] = distance
            matrix[col, row] = matrix[row, col]
    return matrix, names

In [None]:
def plot_dendrogram(model, **kwargs):
    """
    Build a linkage matrix from a fitted clustering model and draw its dendrogram.

    Parameters
    ----------
    model : object
        Fitted model exposing the attributes children_, distances_ and labels_
    **kwargs :
        Forwarded to scipy's dendrogram. color_threshold defaults to 0.10 but
        can be overridden by the caller.

    Returns
    ----------
    dict
        The dictionary returned by scipy's dendrogram.
    """
    # Create the counts of samples under each node.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                # Internal node: reuse the count of the earlier merge it refers to.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([model.children_, model.distances_,
                                      counts]).astype(float)

    # Keep 0.10 as the default while letting callers pass their own value;
    # passing it twice to dendrogram would raise a TypeError.
    kwargs.setdefault('color_threshold', 0.10)

    # Plot the corresponding dendrogram
    return dendrogram(linkage_matrix, **kwargs)

In [None]:
def generate_dendogram(df,linkage='average',metric='correlation',threshold=0.2,name='dendogram.svg'):
    """
    Generates dendogram

    Parameters
    ----------
    df : dataframe
        A dataframe whose first column is skipped and whose remaining columns
        each hold the spectrum of one sample (column label = sample name)
    linkage: string
        The linkage method for building the dendogram, as accepted by
        AgglomerativeClustering: 'complete', 'average', 'single'.
        ('ward' requires Euclidean distances in scikit-learn, so it is not
        expected to work with the precomputed matrix used here.)
    metric: string
        One of the following metrics to compute the distance matrix:
        'correlation' or 'cosine'
    threshold: float
        Distance threshold passed to AgglomerativeClustering; clusters further
        apart than this value are not merged.
    name: string
        Path where the dendogram will be saved

    Returns
    ----------
    This method shows the associated dendogram and saves the image.

    Raises
    ----------
    ValueError
        If metric is neither 'correlation' nor 'cosine'.
    """
    # Validate first so that a bad metric fails before any computation.
    if metric not in ('correlation', 'cosine'):
        raise ValueError("Only 'correlation' and 'cosine' are supported")

    m, names = generate_similarity_matrix(df, metric)

    cluster_kwargs = dict(n_clusters=None, linkage=linkage,
                          distance_threshold=threshold)
    try:
        # Newer scikit-learn releases name this parameter 'metric'.
        clustering = AgglomerativeClustering(metric='precomputed', **cluster_kwargs)
    except TypeError:
        # Older releases only know the former name 'affinity'.
        clustering = AgglomerativeClustering(affinity='precomputed', **cluster_kwargs)
    clustering.fit(m)

    plt.figure(figsize=(55, 2*len(names)))
    # Pass the labels as a plain list rather than a pandas Index.
    plot_dendrogram(clustering, leaf_font_size=0.5*len(names), orientation='left',
                    labels=list(names))
    plt.savefig(name)
    

In [None]:
# Draw the dendrogram for the combined dataframe and save it as an SVG file.
generate_dendogram(df=newdf, name='dendogram_multi.svg')

In [None]:
def generate_heatmap(df,metric='correlation',name='heatmap.svg'):
    """
    Draw and save an annotated heatmap of one minus the pairwise distance matrix

    Parameters
    ----------
    df : dataframe
        Passed to generate_similarity_matrix
    metric : string
        Either correlation or cosine
    name : string
        Path where the heatmap will be saved
    """
    distances, labels = generate_similarity_matrix(df, metric)
    n_samples = len(labels)

    plt.clf()
    # The figure grows with the number of samples.
    plt.figure(figsize=(n_samples + 1, n_samples))
    similarity = 1 - distances
    heatmap(similarity, xticklabels=labels, yticklabels=labels, annot=True)
    plt.savefig(name)

In [None]:
# Draw the heatmap for the combined dataframe and save it as an SVG file.
generate_heatmap(df=newdf, name='heatmap_multi.svg')