In [None]:
%load_ext watermark

In [None]:
%watermark -a Schmelling,Nicolas -u -d -v -p matplotlib,numpy,pandas,scipy,biopython

---
Any comments and suggestions or questions?     
Please feel free to contact me via [twitter](https://twitter.com/DerSchmelling) or [email](mailto:Nicolas.Schmelling@hhu.de).

---

## Co-occurrence of circadian clock proteins in cyanobacteria ##

In the previous [IPython notebook](1_KaiABC_BLAST_Data_Collection_and_Perprocessing.ipynb) we collected the data and performed some preprocessing steps. In this notebook the co-occurrence of circadian clock proteins in cyanobacteria will be analyzed. For this purpose multiple pairwise Fisher's exact tests will be performed on the `clock_counts.csv` file produced in [this previous IPython notebook](2_KaiABC_BLAST_Heatmap.ipynb), and the resulting p-values are then clustered for better visualization.

### Fisher's exact test ###

Fisher's exact test is a statistical significance test used in the analysis of contingency tables, in most cases a 2 × 2 contingency table. It belongs to the class of exact tests, because the significance of the deviation from a null hypothesis (e.g., P-value) can be calculated exactly, rather than relying on an approximation that becomes exact in the limit as the sample size grows to infinity. Therefore this test is valid for all sample sizes [[Wikipedia](https://en.wikipedia.org/wiki/Fisher%27s_exact_test)].

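The probability $p$ of observing one particular table with fixed row and column totals is given by the hypergeometric distribution shown below; the p-value of the test is the sum of these probabilities over all tables that are at least as extreme as the observed one: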

||Group 1|Group 2|Row Total|
|---|---|---|---|
|Case 1|a|b|a+b|
|Case 2|c|d|c+d|
|Column Total|a+c|b+d|a+b+c+d=n|

${\displaystyle p=\frac{\binom{a+b}{a}\,\binom{c+d}{c}}{\binom{n}{a+c}} = \frac{(a+b)!\;(c+d)!\;(a+c)!\;(b+d)!}{a!\;b!\;c!\;d!\;n!}}$

In [None]:
import string
import warnings

#import matplotlib as mpl
from matplotlib import cm
#from matplotlib import ticker
from matplotlib.colors import LogNorm
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as s
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist

import networkx as nx
from statsmodels.sandbox.stats import multicomp

%matplotlib inline

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and runtime
# warnings raised by the libraries used below -- consider narrowing it.
warnings.filterwarnings("ignore")
# Turn off pandas' chained-assignment (SettingWithCopy) check.
pd.set_option('mode.chained_assignment',None)

In [None]:
def fisher_test_all_proteins(df):
    '''
    Run right-sided Fisher's exact tests for every pair of protein columns.

    The first column of ``df`` is skipped; every remaining column is treated
    as one protein. A protein counts as present in a row when its cell is
    not null. For each ordered pair of columns (i, j) the 2 x 2 table

                      i present    i absent
        j present       both        only j
        j absent       only i      neither

    is built from the row counts, so that its four cells always add up to
    ``len(df)``, and tested with ``alternative='greater'`` (right-sided:
    the two proteins occur together more often than expected by chance).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per organism/clock; null cells mark absent proteins.

    Returns
    -------
    numpy.ndarray
        Square float array with one row and one column per protein column;
        element [i, j] is the p-value for columns i and j. The array has
        shape (0, 0) when ``df`` has no protein columns.
    '''
    # Boolean presence matrix (rows x proteins). Working on positions
    # instead of index labels keeps the counts right even when index
    # labels or column names repeat.
    presence = df.iloc[:, 1:].notnull().values
    n_rows, n_proteins = presence.shape
    present_counts = presence.sum(axis=0)

    pvalues = np.empty((n_proteins, n_proteins), dtype=float)

    for i in range(n_proteins):
        for j in range(n_proteins):
            both = int(np.count_nonzero(presence[:, i] & presence[:, j]))
            only_i = int(present_counts[i]) - both
            only_j = int(present_counts[j]) - both
            # Rows that carry neither protein: everything not counted above.
            neither = n_rows - both - only_i - only_j

            pvalues[i, j] = s.fisher_exact([[both, only_j],
                                            [only_i, neither]],
                                           alternative='greater')[1]

    return pvalues

def sort_df(df,sorter):
    '''
    Reorder the rows and columns of ``df`` to follow ``sorter``.

    ``df`` must have an unnamed index, because the row labels are read from
    the 'index' column created by ``reset_index()``. ``sorter`` must be a
    list of labels that are all columns of ``df``. The returned frame has
    the columns of ``sorter`` followed by a helper column 'Rank' (position
    of the row label in ``sorter``), is sorted by that rank and carries an
    index named 'index'. Rows whose label is missing from ``sorter`` get a
    NaN rank.
    '''
    # Position of every label in the requested order.
    rank_of = {label: position for position, label in enumerate(sorter)}

    frame = df.reset_index()
    frame['Rank'] = frame['index'].map(rank_of)

    # Restore the row labels as index and drop the helper 'index' column,
    # which reset_index() placed first.
    frame.index = frame['index']
    frame = frame.iloc[:, 1:]

    column_order = sorter + ['Rank']
    return frame[column_order].sort_values(by='Rank', ascending=True)

In [None]:
# Table of clock-protein columns per organism; the CSV column
# 'Unnamed: 0' becomes the row index.
fisher_df = pd.read_csv('../clock_counts.csv', index_col='Unnamed: 0')

# Column layout used for the analysis: 'genus' first, then one column per
# protein.
order = ['genus','kaiA_count','kaiB1_count','kaiC1_count','kaiB2_count',
         'kaiC2_count', 'kaiB3_count','kaiC3_count','cpmA_count',
         'ircA_count','pex_count','nhtA_count', 'prkE_count','cdpA_count',
         'ldpA_count','cikA_count','labA_count','sasA_count', 'lalA_count',
         'rpaA_count','rpaB_count','crm_count']

# Keep only the rows whose numeric columns add up to more than five.
# numeric_only=True leaves the text column 'name' out of the sum
# explicitly instead of relying on pandas dropping it silently.
fisher_df = fisher_df[fisher_df.sum(axis=1, numeric_only=True) > 5]

fisher_df.info()

# single_clocks keeps one row per distinct combination of genus and
# protein columns. The index is renumbered from 0.
single_clocks = fisher_df.reset_index(drop=True)

# The genus is the first whitespace-separated word of the 'name' column.
# Vectorised assignment replaces the element-wise chained assignment,
# which only worked with the chained-assignment warning switched off.
single_clocks['genus'] = single_clocks['name'].str.split(' ', n=1).str[0]

# Selecting `order` drops every other column (including 'name').
single_clocks = single_clocks[order].drop_duplicates()

single_clocks.info()

In [None]:
# Display names of the 21 proteins; they label the rows and columns of the
# p-value table and the leaves of the dendrogram.
PCC7942 = ('KaiA KaiB1 KaiC1 KaiB2 KaiC2 KaiB3 KaiC3 '
           'CpmA IrcA Pex NhtA PrkE CdpA LdpA CikA '
           'LabA SasA LalA RpaA RpaB Crm').split()

In [None]:
# Pair-wise Fisher tests on the de-duplicated table (single_clocks).
# Alternative input, kept for reference: the table with every organism.
#res = fisher_test_all_proteins(fisher_df)
res = fisher_test_all_proteins(single_clocks)

### Multitest Correction with Benjamini Hochberg ###

In [None]:
# Flatten the square p-value matrix row by row. The shape is taken from
# `res` instead of being hard-coded, so the cell keeps working when the
# number of proteins changes.
p_values = res.ravel().tolist()

# Benjamini-Hochberg FDR correction over all pair-wise tests.
corrected_res = multicomp.multipletests(p_values, alpha=0.01,
                                        method='fdr_bh')

# Element 1 of the result is read as the corrected p-values; put them back
# into the matrix layout of `res`.
cor_p = corrected_res[1].reshape(res.shape)

# The frame gets its own copy of the values so that in-place changes to
# cor_p cannot leak into the table.
data = pd.DataFrame(cor_p.copy(), index=PCC7942, columns=PCC7942)
data.to_csv('../fisher.csv')

### Clustering ###

__Complete-linkage clustering__ is one of several methods of agglomerative _hierarchical clustering_. At the beginning of the process, each element is a singleton cluster, i.e. each element represents a single cluster. These clusters are then combined into larger clusters, resulting in one cluster in the end. The members in the cluster and the distance between each other can be visualized in a dendrogram [[Wikipedia](https://en.wikipedia.org/wiki/Complete-linkage_clustering)].

Mathematically, the complete linkage function — the distance $D(X,Y)$ between clusters $X$ and $Y$ — is described by the following expression: 

${\displaystyle D(X,Y)= \max_{x\in X\,,\, y\in Y} d(x,y)}$

where

+ $d(x,y)$ is the distance between elements $x \in X$ and $y \in Y$;
+ $X$ and $Y$ are two sets of elements (clusters)

The clustering is, in this case, used to better visualize possible co-occurrence of circadian clock proteins.

For more information about the clustering, look at the [notebook](http://nbviewer.ipython.org/github/rasbt/matplotlib-gallery/blob/master/ipynb/clust_complete_linkage.ipynb) of Sebastian Raschka.

In [None]:
# linkage() returns the linkage matrix with one row per merge step.
# Columns one and two hold the indices of the two clusters that are
# merged, column three the distance between them and column four the
# number of original observations in the newly formed cluster.
condensed_distances = pdist(data, metric='euclidean')
row_clusters = linkage(condensed_distances, method='complete')

# Show the linkage matrix as a labelled table (cell output).
merge_names = ['cluster %d' % (step + 1)
               for step in range(row_clusters.shape[0])]
pd.DataFrame(row_clusters,
             columns=['row label 1', 'row label 2', 'distance',
                      'no. of items in clust.'],
             index=merge_names)

In [None]:
# Preview of the clustering tree. A one-colour palette together with an
# infinite colour threshold draws every link in black.
hierarchy.set_link_color_palette(['black'])

fig_preview, ax_preview = plt.subplots(figsize=(12, 2))
row_dendr = dendrogram(row_clusters, labels=PCC7942,
                       color_threshold=np.inf, ax=ax_preview)

In [None]:
# Order the proteins exactly like the leaves of the clustering dendrogram.
# row_dendr['leaves'] lists the row positions of `data` in leaf order, so
# the order is derived from the clustering itself and stays in sync when
# the input table changes (e.g. all organisms instead of single clocks),
# with no hand-typed list to keep up to date.
sorter = [data.index[leaf] for leaf in row_dendr['leaves']]

# Rows and columns of the corrected p-values in dendrogram order; the last
# column of data1 is the helper column 'Rank' and is left out of the
# preview.
data1 = sort_df(data, sorter)
data1.iloc[:,:-1].head()

In [None]:
# Canvas of the combined figure (width, height in inches).
f = plt.figure(figsize=(7.5, 3.55))

# 2 x 4 grid. Top row: dendrogram (column 0), heatmap (column 1), an empty
# spacer (column 2) and the network (column 3). The thin bottom row holds
# the colorbars below the heatmap and the network.
plots = gridspec.GridSpec(nrows=2, ncols=4,
                          width_ratios=[0.2, 1, 0.4, 1],
                          height_ratios=[1, 0.05],
                          wspace=0.0, hspace=0.12)

def clean_axis(ax):
    '''Hide all spines (the frame lines) of the axes ``ax``.'''
    for side in ax.spines:
        ax.spines[side].set_visible(False)
        
# Font used for all text in the figure.
hfont = {'fontname':'Arial'}

############ Cluster Dendrogram ############

tree = f.add_subplot(plots[0,0])

# Remove the tick marks and tick labels. The flags must be real booleans:
# the strings "off"/"on" are no longer translated by matplotlib and a
# non-empty string counts as True.
tree.tick_params(axis="both", which="both", bottom=False, top=False,
                 left=False, right=False, labeltop=False, labelbottom=False,
                 labelleft=False, labelright=False)

clean_axis(tree)

# Draw the complete-linkage clustering tree computed above, growing to the
# left, explicitly into this panel instead of the implicit current axes.
row_dendr = dendrogram(row_clusters, orientation='left',
                       color_threshold=np.inf, ax=tree)

# dendrogram() adds its own leaf ticks; remove them again.
tree.set_xticks([])
tree.set_yticks([])

# Flip the tree so that the first leaf is at the top, matching the row
# order of the heatmap next to it.
tree.invert_yaxis()

############ Co-Occurrence Heatmap ############

heat = f.add_subplot(plots[0,1])

# No tick marks; tick labels only at the top and on the right. The flags
# must be real booleans: the strings "off"/"on" are no longer translated
# by matplotlib and a non-empty string counts as True.
heat.tick_params(axis="both", which="both", bottom=False, top=False,
                 left=False, right=False, labeltop=True, labelbottom=False,
                 labelleft=False, labelright=True)

clean_axis(heat)

# One labelled tick at the centre of every cell.
heat.set_xlim(0, len(sorter))
heat.set_xticks(np.arange(len(sorter))+0.5, minor=False)
heat.set_xticklabels(sorter, fontsize=7.5, rotation=90, **hfont)

heat.set_ylim(0, len(sorter))
heat.set_yticks(np.arange(len(sorter))+0.5, minor=False)
heat.set_yticklabels(sorter, fontsize=7.5, rotation=0, **hfont)

# Corrected p-values on a logarithmic colour scale from 0.01 to 1. The
# last column of data1 is the helper column 'Rank', not a p-value, so it
# is left out of the plot.
heatmap = heat.pcolor(data1.iloc[:, :-1].values, cmap='GnBu_r',
                      norm=LogNorm(vmax=1, vmin=1e-2),
                      lw=0.75, edgecolors='w')

# Put the first row at the top.
heat.invert_yaxis()

############ Colorbar Heatmap ############

bar = f.add_subplot(plots[1,1])

# Hide all ticks and tick labels of the host axes. The flags must be real
# booleans: the strings 'off'/'on' are no longer translated by matplotlib
# and a non-empty string counts as True.
bar.tick_params(axis='both', which='both', bottom=False, top=False,
                left=False, right=False, labeltop=False, labelbottom=False,
                labelleft=False, labelright=False)

clean_axis(bar)

# ax=bar states explicitly which axes give up space for the colorbar.
# Without it the choice depends on the matplotlib version (current axes in
# older releases, the heatmap's own axes in newer ones).
cbar = f.colorbar(heatmap, ax=bar, orientation='horizontal',
                  pad=0.1, fraction=0.9)

cbar.ax.tick_params(labelsize=8)

cbar.set_label('p-values', fontsize=10, **hfont)
cbar.outline.set_linewidth(0)

############ Co-Occurrence Network ############

net = f.add_subplot(plots[0,3])

# Hide all ticks and tick labels. The flags must be real booleans: the
# strings 'off'/'on' are no longer translated by matplotlib and a
# non-empty string counts as True.
net.tick_params(axis='both', which='both', bottom=False, top=False,
                left=False, right=False, labelbottom=False, labeltop=False,
                labelleft=False, labelright=False)

clean_axis(net)

# Hand-placed node coordinates, keyed by the row/column position of the
# protein in cor_p.
pos = { 0: [0.65, 0.8],
        1: [0.67, 0.957],
        2: [0.5, 0.975],
        3: [-0.04, 0.72],
        4: [-0.095, 0.58],
        5: [-0.035, 0.28244981],
        6: [0.08, 0.15895997],
        7: [0.81, 0.908],
        8: [1.09,  0.6],
        9: [0.89,  0.14],
       10: [0.12, 0.87],
       11: [0.48,  0.022],
       12: [0.73, 0.055],
       13: [1.1,  0.45],
       14: [0.24510083,  0.06577308],
       15: [0.3,   4.25269783e-01],
       16: [0.938,  0.825],
       17: [0.85, 0.35],
       18: [1.035, 0.72],
       19: [0.32,  0.95],
       20: [-0.095, 4.25269783e-01]}

# Hand-placed coordinates of the text labels, offset from the nodes.
pos_labels = { 0: [ 0.52, 0.805],
               1: [ 0.75, 1.01],
               2: [ 0.53, 1.03],
               3: [ 0.0, 0.77],
               4: [ 0.05, 0.58],
               5: [ 0.1, 0.33],
               6: [ 0.05, 0.09],
               7: [ 0.95, 0.93],
               8: [ 1.19, 0.54],
               9: [ 1.0, 0.11],
              10: [ 0.09, 0.93],
              11: [ 0.47, -0.04],
              12: [ 0.75, -0.01],
              13: [ 1.15, 0.38],
              14: [ 0.242, 0.007],
              15: [ 0.3, 0.475],
              16: [ 1.08, 0.83],
              17: [ 0.95, 0.3],
              18: [ 1.17, 0.72],
              19: [ 0.28, 1.01],
              20: [ 0.02, 0.425]}

# Text shown for every node.
node_names = { 0: 'KaiA',  1: 'KaiB1', 2: 'KaiC1',
               3: 'KaiB2', 4: 'KaiC2', 5: 'KaiB3',
               6: 'KaiC3', 7: 'CpmA',  8: 'IrcA',
               9: 'Pex',  10: 'NhtA', 11: 'PrkE',
              12: 'CdpA', 13: 'LdpA', 14: 'CikA',
              15: 'LabA', 16: 'SasA', 17: 'LalA',
              18: 'RpaA', 19: 'RpaB', 20: 'Crm'}

# Only corrected p-values of at most 0.01 become edges; everything above
# is set to 0, which the graph constructor reads as "no edge". The
# threshold is applied to a copy so that cor_p itself stays untouched.
significant_p = np.where(cor_p > 1e-2, 0, cor_p)

# from_numpy_array is the current networkx constructor; fall back to
# from_numpy_matrix where only the old name exists.
graph_from_matrix = (getattr(nx, 'from_numpy_array', None)
                     or nx.from_numpy_matrix)
G = graph_from_matrix(significant_p)

# Edge weight = corrected p-value of the pair.
weights = [attrs['weight'] for _, _, attrs in G.edges(data=True)]

# Node size grows with the node degree. dict(G.degree()) works with both
# the dict returned by old networkx and the view returned by new releases.
degrees = dict(G.degree())
node_weights = [degrees[node]**1.7+20 for node in G.nodes()]

nodes = nx.draw_networkx_nodes(G, pos=pos, nodelist=list(G.nodes()),
                               node_size=node_weights, node_shape='h',
                               node_color='white', linewidths=0.5,
                               ax=net)

# NOTE(review): the edge colours appear to be scaled linearly between the
# smallest and largest weight, while the colorbar below is logarithmic --
# confirm that this is intended.
edges = nx.draw_networkx_edges(G, pos=pos, edgelist=list(G.edges()),
                               edge_color=weights, edge_cmap=cm.plasma,
                               width=1, alpha=1, ax=net)

labels = nx.draw_networkx_labels(G, pos=pos_labels, labels=node_names,
                                 font_size=7.5, font_family='Arial',
                                 ax=net)

net.set_xlim(-0.2, 1.25)
net.set_ylim(-0.2, 1.1)

# Panel letters, positioned in the data coordinates of the network axes
# ('A' lands above the left-hand panels).
net.text(-0.4, 1.2, 'B', fontsize=15, **hfont)
net.text(-2.6, 1.2, 'A', fontsize=15, **hfont)

############ Colorbar Network ############

barnet = f.add_subplot(plots[1,3])

barnet.tick_params(axis='both', which='both', bottom=False, top=False,
                   left=False, right=False, labeltop=False,
                   labelbottom=False, labelleft=False, labelright=False)

clean_axis(barnet)

# Range of the significant p-values (the zeros only mark "no edge").
nonzero_p = significant_p[np.nonzero(significant_p)]
if nonzero_p.size == 0:
    raise ValueError('no corrected p-value is at or below 0.01; '
                     'there is nothing to show in the network colorbar')
minval = np.min(nonzero_p)
maxval = np.max(nonzero_p)

# Invisible helper plot whose only purpose is to provide the colour scale
# for the colorbar.
heati = barnet.pcolor(significant_p, cmap='plasma',
                      norm=LogNorm(vmax=maxval, vmin=minval),
                      lw=1, edgecolors='w')

heati.set_visible(False)

# Mappable and host axes are passed explicitly instead of relying on the
# current image and the current axes.
cb1 = f.colorbar(heati, ax=barnet, orientation='horizontal',
                 pad=0.1, fraction=0.9)

cb1.update_ticks()

# Show only the tick labels at positions 0, 7 and 14. Positions that do
# not exist are skipped because the number of ticks depends on the data
# range and on the matplotlib version.
tick_labels = cb1.ax.xaxis.get_ticklabels()
for tick_label in tick_labels:
    tick_label.set_visible(False)

for shown in (0, 7, 14):
    if shown < len(tick_labels):
        tick_labels[shown].set_visible(True)

cb1.ax.tick_params(labelsize=8)
cb1.set_label('p-values', fontsize=10, **hfont)

cb1.outline.set_linewidth(0)

plt.savefig('../Correlation+Network.eps', bbox_inches='tight', dpi=1000)

---
### Previous ###

+ [Data Collection and Processing](1_KaiABC_BLAST_Data_Collection_and_Perprocessing.ipynb)
+ [Distribution of Circadian Clock proteins](2_KaiABC_BLAST_Heatmap.ipynb)
+ [Length distribution of KaiA, KaiB, KaiC](3_KaiABC_BLAST_Scatterplot.ipynb)

### Next ###

+ [Additional Analyses](5_KaiABC_BLAST_Other.ipynb)

---