# Co-occurrence analysis

### code for Fig. 1c etc.

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import *

import networkx as nx

% matplotlib inline

In [None]:
# Load the pattern table from CSV. Later cells read the 'family', 'genus' and
# 'species' columns and treat each row as one image (see the summary printout).
df0 = pd.read_csv('fish-patterns-db-img.csv')

In [None]:
# Report how many distinct taxa appear at each rank, then the number of rows.
for _rank_col, _rank_label in (('family', " families"),
                               ('genus', " genera"),
                               ('species', " species")):
    print(len(set(df0[_rank_col])), _rank_label)
print(len(df0), "images")

In [None]:
# Collapse the image-level table to presence/absence (1/0) per species.
# numeric_only=True restricts the sum to numeric columns: pandas >= 2.0 no
# longer drops non-numeric columns silently, and comparing their summed
# (concatenated) strings with 0 would raise a TypeError.
df_sp = (df0.groupby(['genus', 'species']).sum(numeric_only=True) > 0).astype(int)

# Number of species rows available for each genus.
spnum = df_sp.reset_index().groupby('genus').count()['species'].rename('sp_num')

# Presence/absence per genus: 1 if at least one species of the genus is 1.
df_gen = (df_sp.groupby(['genus']).sum() > 0).astype(int)

# Species count joined to the genus-level table; used to filter genera below.
df_gen_spnum = pd.merge(spnum.to_frame(), df_gen, on='genus')

df_gen_sp2 = df_gen[df_gen_spnum['sp_num'] >= 2]
print(len(df_gen_sp2), " genera (incl. >=2 sp.)")

df_gen_sp3 = df_gen[df_gen_spnum['sp_num'] >= 3]
print(len(df_gen_sp3), " genera (incl. >=3 sp.)")


In [None]:
# Co-occurrence counts over all genera: with a 0/1 genus-by-column table X,
# entry (A, B) of X^T X is the number of genera in which A and B are both 1.
df_gen_comat = df_gen.T @ df_gen
df_gen_comat

In [None]:
# Same co-occurrence count matrix, restricted to genera with >= 2 species.
df_gen_sp2_comat = df_gen_sp2.T @ df_gen_sp2
df_gen_sp2_comat

In [None]:
# Same co-occurrence count matrix, restricted to genera with >= 3 species.
df_gen_sp3_comat = df_gen_sp3.T @ df_gen_sp3
df_gen_sp3_comat

### contingency table

| patterns      |B absent (0)  |B present (1) | total |
|:-------------:|:------------:|:------------:|:-----:|
| A absent  (0) |O<sub>00</sub>|O<sub>01</sub>| A0    |
| A present (1) |O<sub>10</sub>|O<sub>11</sub>| A1    |
| total         |B0            |B1            | N     |

In [None]:
# Pattern column names, in the order used by every table and plot below.
ptns = [
    "Mono", "Bltc", "Sp_D", "Sp_L", "Maze", "St_H",
    "St_D", "St_V", "Sddl", "Eyes", "Area",
]

# Display labels: the column names with underscores written as hyphens.
labels = [name.replace("_", "-") for name in ptns]

# One colour per pattern (used as node outline colours in the network plots).
colors = [
    "silver", "dimgray", "#35A16B", "orange", "#FAF500", "turquoise",
    "royalblue", "darkslateblue", "mediumorchid", "pink", "sienna",
]



In [None]:
def contingency_mat(ptnA, ptnB, comat=None, n=None):
    """Return the 2x2 contingency table of two patterns.

    Layout of the result (rows: A absent / A present, columns: B absent /
    B present)::

        [[neither, B only],
         [A only,  both  ]]

    Parameters
    ----------
    ptnA, ptnB : str
        Row/column labels of the two patterns in the co-occurrence matrix.
    comat : pandas.DataFrame, optional
        Co-occurrence count matrix; its diagonal holds each pattern's own
        count. Defaults to the module-level ``df_gen_sp2_comat``.
    n : int, optional
        Total number of observations the matrix was built from. Defaults to
        ``len(df_gen_sp2)`` when ``comat`` is not given.

    Returns
    -------
    numpy.ndarray
        The 2x2 array of counts described above.

    Raises
    ------
    ValueError
        If ``comat`` is supplied without ``n`` (the default ``n`` belongs to
        the default matrix and would silently mismatch another one).
    """
    if comat is None:
        comat = df_gen_sp2_comat
        if n is None:
            n = len(df_gen_sp2)
    elif n is None:
        raise ValueError("n must be given together with comat")

    nA = comat.loc[ptnA, ptnA]
    nB = comat.loc[ptnB, ptnB]
    kAB = comat.loc[ptnA, ptnB]

    only_A = nA - kAB
    only_B = nB - kAB
    neither = n - kAB - only_A - only_B

    return np.array([[neither, only_B], [only_A, kAB]])


In [None]:
# Pairwise association measures between patterns, for genera with >= 2 species.
#
# The result frames start as float copies of the co-occurrence matrix taken in
# `ptns` order: the measures are floats, and writing floats into an integer
# frame relies on an implicit upcast that newer pandas deprecates. Selecting
# `.loc[ptns, ptns]` also guarantees that positional (`iloc`) access in the
# plotting cells lines up with `labels`.
_measure_template = df_gen_sp2_comat.loc[ptns, ptns].astype(float)

# Jaccard index
df_gen_Jaccard = _measure_template.copy()

# Sørensen-Dice coefficient
df_gen_Dice = _measure_template.copy()

# log-likelihood
df_gen_LL = _measure_template.copy()

# Z-score (Dennis1965)
df_gen_Z = _measure_template.copy()

for ptn_a in ptns:
    for ptn_b in ptns:
        obs = contingency_mat(ptn_a, ptn_b)
        both, only_a, only_b = obs[1, 1], obs[1, 0], obs[0, 1]

        df_gen_Jaccard.loc[ptn_a, ptn_b] = both / (both + only_a + only_b)
        df_gen_Dice.loc[ptn_a, ptn_b] = 2 * both / ((both + only_a) + (both + only_b))

        # lambda_='log-likelihood' makes scipy return the log-likelihood-ratio
        # statistic instead of Pearson's chi-squared; `ex` is the table of
        # expected counts.
        LL, p, dof, ex = chi2_contingency(obs, correction=False, lambda_='log-likelihood')
        df_gen_LL.loc[ptn_a, ptn_b] = LL

        # Deviation of the observed joint count from its expectation, scaled
        # by sqrt(expected). NOTE: a variant that additionally multiplies the
        # expected count under the root by (1 - A1*B1/N**2), with A1/B1 the
        # "present" totals and N the grand total, was left disabled in the
        # original notebook.
        df_gen_Z.loc[ptn_a, ptn_b] = (both - ex[1, 1]) / np.sqrt(ex[1, 1])

        

### Jaccard index

In [None]:
# Display the matrix of pairwise Jaccard indices.
df_gen_Jaccard

In [None]:
sns.set_context('talk')
plt.figure(figsize=(15, 12))

# Mask the diagonal (each pattern paired with itself).
# The builtin `bool` replaces `np.bool`, which was removed in NumPy 1.24.
mask = np.eye(df_gen_Jaccard.shape[0], dtype=bool)
ax = sns.heatmap(df_gen_Jaccard, mask=mask, vmax=0.40, square=True, fmt='.2f',
                 annot=True, linewidths=1.0, cmap='Purples', rasterized=True)
plt.title('Jaccard index', fontsize=24)

ax.set_xticklabels(labels, rotation=0, fontsize=22)
ax.set_yticklabels(labels, rotation=0, fontsize=22)

plt.show()

In [None]:
sns.set(style="white", context="talk")
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)

# One node per pattern; two patterns are linked when their Jaccard index
# reaches 0.12, and the index is stored as the edge weight.
G = nx.Graph()
G.add_nodes_from(labels)

for i, label_i in enumerate(labels):
    for j, label_j in enumerate(labels):
        if i == j:
            continue
        weight_ij = df_gen_Jaccard.iloc[i, j]
        if weight_ij >= 0.12:
            G.add_edge(label_i, label_j, weight=weight_ij)

edge_Ws = np.array([w for _, _, w in G.edges(data='weight')])
strongest = max(edge_Ws)

pos = nx.circular_layout(G, scale=0.8)

nx.draw_networkx_nodes(G, pos,
                       node_shape='o',
                       node_size=500,
                       node_color='white',
                       linewidths=5.0,
                       edgecolors=colors)
# Edge width and colour both scale with the edge weight.
nx.draw_networkx_edges(G, pos,
                       width=15*edge_Ws/strongest,
                       alpha=0.8,
                       edge_cmap=plt.cm.Purples,
                       edge_vmin=0.0,
                       edge_vmax=strongest,
                       edge_color=edge_Ws)

# nx.draw_networkx_labels(G, pos, font_size=16)

plt.axis('off')
plt.show()
# plt.savefig('temp_Jaccard.svg', transparent=True)

### Sørensen–Dice coefficient

In [None]:
# Display the matrix of pairwise Sørensen–Dice coefficients.
df_gen_Dice

In [None]:
sns.set_context('talk')
plt.figure(figsize=(15, 12))

# Mask the diagonal (each pattern paired with itself).
# The builtin `bool` replaces `np.bool`, which was removed in NumPy 1.24.
mask = np.eye(df_gen_Dice.shape[0], dtype=bool)
ax = sns.heatmap(df_gen_Dice, mask=mask, vmax=0.6, square=True, fmt='.2f',
                 annot=True, linewidths=1.0, cmap='Blues', rasterized=True)
plt.title('Sørensen–Dice coefficient', fontsize=24)

ax.set_xticklabels(labels, rotation=0, fontsize=22)
ax.set_yticklabels(labels, rotation=0, fontsize=22)

plt.show()

In [None]:
sns.set(style="white", context="talk")
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)

# One node per pattern; two patterns are linked when their Dice coefficient
# reaches 0.25, and the coefficient is stored as the edge weight.
G = nx.Graph()
G.add_nodes_from(labels)

for i, label_i in enumerate(labels):
    for j, label_j in enumerate(labels):
        if i == j:
            continue
        weight_ij = df_gen_Dice.iloc[i, j]
        if weight_ij >= 0.25:
            G.add_edge(label_i, label_j, weight=weight_ij)

edge_Ws = np.array([w for _, _, w in G.edges(data='weight')])
strongest = max(edge_Ws)

pos = nx.circular_layout(G, scale=0.8)

nx.draw_networkx_nodes(G, pos,
                       node_shape='o',
                       node_size=500,
                       node_color='white',
                       linewidths=5.0,
                       edgecolors=colors)
# Edge width and colour both scale with the edge weight.
nx.draw_networkx_edges(G, pos,
                       width=15*edge_Ws/strongest,
                       alpha=0.8,
                       edge_cmap=plt.cm.Blues,
                       edge_vmin=0.0,
                       edge_vmax=strongest,
                       edge_color=edge_Ws)

# nx.draw_networkx_labels(G, pos, font_size=16)

plt.axis('off')
plt.show()
# plt.savefig('temp_Dice.svg', transparent=True)

### Log-likelihood

In [None]:
# Display the matrix of pairwise log-likelihood statistics.
df_gen_LL

In [None]:
sns.set_context('talk')
plt.figure(figsize=(15, 12))

# Mask the diagonal (each pattern paired with itself).
# The builtin `bool` replaces `np.bool`, which was removed in NumPy 1.24.
mask = np.eye(df_gen_LL.shape[0], dtype=bool)
ax = sns.heatmap(df_gen_LL, mask=mask, vmax=270, square=True, fmt='.1f',
                 annot=True, linewidths=1.0, cmap='Reds', rasterized=True)
plt.title('Log-likelihood', fontsize=24)

ax.set_xticklabels(labels, rotation=0, fontsize=22)
ax.set_yticklabels(labels, rotation=0, fontsize=22)

plt.show()

In [None]:
sns.set(style="white", context="talk")
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)

# One node per pattern; two patterns are linked when their log-likelihood
# statistic reaches the chosen cut-off, and the statistic is the edge weight.
G = nx.Graph()
G.add_nodes_from(labels)

for i, label_i in enumerate(labels):
    for j, label_j in enumerate(labels):
        if i == j:
            continue
        weight_ij = df_gen_LL.iloc[i, j]
        # Alternative cut-off: 6.63 (p<=0.01)
        if weight_ij >= 3.84:      # p<=0.05
            G.add_edge(label_i, label_j, weight=weight_ij)

edge_Ws = np.array([w for _, _, w in G.edges(data='weight')])
strongest = max(edge_Ws)

pos = nx.circular_layout(G, scale=0.8)

nx.draw_networkx_nodes(G, pos,
                       node_shape='o',
                       node_size=500,
                       node_color='white',
                       linewidths=5.0,
                       edgecolors=colors)
# Edge width and colour both scale with the edge weight.
nx.draw_networkx_edges(G, pos,
                       width=15*edge_Ws/strongest,
                       alpha=0.8,
                       edge_cmap=plt.cm.Reds,
                       edge_vmin=0.0,
                       edge_vmax=strongest,
                       edge_color=edge_Ws)

# nx.draw_networkx_labels(G, pos, font_size=16)

plt.axis('off')
plt.show()
# plt.savefig('temp_LL.svg', transparent=True)

### Z-score (Dennis1965)

In [None]:
# Display the matrix of pairwise Z-scores.
df_gen_Z

In [None]:
sns.set_context('talk')
plt.figure(figsize=(15, 12))

# Mask the diagonal (each pattern paired with itself).
# The builtin `bool` replaces `np.bool`, which was removed in NumPy 1.24.
mask = np.eye(df_gen_Z.shape[0], dtype=bool)
ax = sns.heatmap(df_gen_Z, mask=mask, vmax=12.5, square=True, fmt='.1f',
                 annot=True, linewidths=1.0, cmap='Greens', rasterized=True)
plt.title('Z-score (Dennis1965)', fontsize=24)

ax.set_xticklabels(labels, rotation=0, fontsize=22)
ax.set_yticklabels(labels, rotation=0, fontsize=22)

plt.show()

In [None]:
sns.set(style="white", context="talk")
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)

# One node per pattern; two patterns are linked when their Z-score reaches
# the chosen cut-off, and the Z-score is stored as the edge weight.
G = nx.Graph()
G.add_nodes_from(labels)

for i, label_i in enumerate(labels):
    for j, label_j in enumerate(labels):
        if i == j:
            continue
        weight_ij = df_gen_Z.iloc[i, j]
        # Alternative cut-off: 2.58 (p<0.01)
        if weight_ij >= 1.96:     # p<0.05
            G.add_edge(label_i, label_j, weight=weight_ij)

edge_Ws = np.array([w for _, _, w in G.edges(data='weight')])
strongest = max(edge_Ws)

pos = nx.circular_layout(G, scale=0.8)

nx.draw_networkx_nodes(G, pos,
                       node_shape='o',
                       node_size=500,
                       node_color='white',
                       linewidths=5.0,
                       edgecolors=colors)
# Edge width and colour both scale with the edge weight.
nx.draw_networkx_edges(G, pos,
                       width=15*edge_Ws/strongest,
                       alpha=0.8,
                       edge_cmap=plt.cm.Greens,
                       edge_vmin=0.0,
                       edge_vmax=strongest,
                       edge_color=edge_Ws)

# nx.draw_networkx_labels(G, pos, font_size=16)

plt.axis('off')
plt.show()
# plt.savefig('temp_Z.svg', transparent=True)

In [None]:
from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable

# Combined figure: Jaccard index in the lower triangle, log-likelihood in the
# upper triangle, each with its own colour bar.
#
# Changes from the original cell:
#   * masks use the builtin `bool` (`np.bool` was removed in NumPy 1.24);
#   * colour bars come from `Figure.colorbar`, because the
#     `mpl_toolkits.axes_grid1.colorbar` module no longer exists in current
#     Matplotlib releases;
#   * the two heatmap meshes are taken from `ax.collections` instead of
#     relying on the order of `ax.get_children()`.

sns.set_context('talk')
plt.figure(figsize=(15, 12))

n_ptn = df_gen_LL.shape[0]
# mask_u hides the diagonal and everything above it; mask_l hides the
# diagonal and everything below it.
mask_u = np.triu(np.ones((n_ptn, n_ptn), dtype=bool))
mask_l = np.tril(np.ones((n_ptn, n_ptn), dtype=bool))

ax = sns.heatmap(df_gen_Jaccard,
                 mask=mask_u,
                 vmax=0.35,
                 square=True,
                 linewidths=0.0,
                 cmap='Purples',
                 cbar=False)

sns.heatmap(df_gen_LL,
            mask=mask_l,
            vmax=270,
            square=True,
            linewidths=0.0,
            ax=ax,
            cmap='Reds',
            cbar=False)

ax_divider = make_axes_locatable(ax)
cax1 = ax_divider.append_axes('bottom', size='5%', pad='5%')
cax2 = ax_divider.append_axes('right', size='5%', pad='5%')

# collections[0] is the first heatmap drawn (Jaccard), collections[1] the second.
cbar1 = ax.figure.colorbar(ax.collections[0], cax=cax1, orientation='horizontal')
cbar1.solids.set_rasterized(True)
cax1.set_xlabel('Jaccard', fontsize=28)
cax1.yaxis.set_label_position('right')
cax1.tick_params(length=0)

cbar2 = ax.figure.colorbar(ax.collections[1], cax=cax2, orientation='vertical')
cbar2.solids.set_rasterized(True)
cax2.set_ylabel('Log-likelihood', fontsize=28)
cax2.xaxis.set_label_position('bottom')
cax2.tick_params(length=0)

# Black grid lines at the cell boundaries 1 .. n_ptn-1 in both directions.
for k in range(1, n_ptn):
    ax.axhline(k, linewidth=2.0, color='black')
    ax.axvline(k, linewidth=2.0, color='black')

ax.xaxis.set_ticks_position('top')
ax.set_xticklabels(labels, fontsize=24, rotation=60)
ax.set_yticklabels(labels, fontsize=24, rotation=0)

plt.show()
# plt.savefig("co-occ_JI-LL.svg")


In [None]:
from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable

# Combined figure: Dice coefficient in the lower triangle, Z-score in the
# upper triangle, each with its own colour bar.
#
# Changes from the original cell:
#   * masks use the builtin `bool` (`np.bool` was removed in NumPy 1.24);
#   * colour bars come from `Figure.colorbar`, because the
#     `mpl_toolkits.axes_grid1.colorbar` module no longer exists in current
#     Matplotlib releases;
#   * the two heatmap meshes are taken from `ax.collections` instead of
#     relying on the order of `ax.get_children()`.

sns.set_context('talk')
plt.figure(figsize=(15, 12))

n_ptn = df_gen_Dice.shape[0]
# mask_u hides the diagonal and everything above it; mask_l hides the
# diagonal and everything below it.
mask_u = np.triu(np.ones((n_ptn, n_ptn), dtype=bool))
mask_l = np.tril(np.ones((n_ptn, n_ptn), dtype=bool))

ax = sns.heatmap(df_gen_Dice,
                 mask=mask_u,
                 vmax=0.50,
                 square=True,
                 linewidths=0.0,
                 cmap='Blues',
                 cbar=False)

sns.heatmap(df_gen_Z,
            mask=mask_l,
            vmax=12.5,
            square=True,
            linewidths=0.0,
            ax=ax,
            cmap='Greens',
            cbar=False)

ax_divider = make_axes_locatable(ax)
cax1 = ax_divider.append_axes('bottom', size='5%', pad='5%')
cax2 = ax_divider.append_axes('right', size='5%', pad='5%')

# collections[0] is the first heatmap drawn (Dice), collections[1] the second.
cbar1 = ax.figure.colorbar(ax.collections[0], cax=cax1, orientation='horizontal')
cbar1.solids.set_rasterized(True)
cax1.set_xlabel('Dice', fontsize=28)
cax1.yaxis.set_label_position('right')
cax1.tick_params(length=0)

cbar2 = ax.figure.colorbar(ax.collections[1], cax=cax2, orientation='vertical')
cbar2.solids.set_rasterized(True)
cax2.set_ylabel('Z-score', fontsize=28)
cax2.xaxis.set_label_position('bottom')
cax2.tick_params(length=0)

# Black grid lines at the cell boundaries 1 .. n_ptn-1 in both directions.
for k in range(1, n_ptn):
    ax.axhline(k, linewidth=2.0, color='black')
    ax.axvline(k, linewidth=2.0, color='black')

ax.xaxis.set_ticks_position('top')
ax.set_xticklabels(labels, fontsize=24, rotation=60)
ax.set_yticklabels(labels, fontsize=24, rotation=0)

plt.show()
# plt.savefig("co-occ_Dice-Z.svg")


## Triple co-occurrence analysis
### (Sp_L & Sp_D) vs pattern C

In [None]:
def co_occ_sp3_measures_vs_AB(A='Sp_L', B='Sp_D', C='Maze', df=None):
    """Measure how pattern C co-occurs with the joint presence of A and B.

    Rows of the table are split by "A and B both equal 1" versus "A or B
    equals 0", and by "C equals 1" versus "C equals 0".

    Parameters
    ----------
    A, B, C : str
        Column names of 0/1 presence indicators.
    df : pandas.DataFrame, optional
        Table to analyse. Defaults to the module-level ``df_gen_sp3``.

    Returns
    -------
    cntmat : numpy.ndarray
        2x2 counts ``[[ABC, abC], [ABc, abc]]``: first row C present, second
        row C absent; first column A and B both present, second column not.
    jaccard : float
        ``ABC / (ABC + abC + ABc)``.
    dice : float
        ``2*ABC / ((ABC + abC) + (ABC + ABc))``.
    LL : float
        Log-likelihood-ratio statistic from ``chi2_contingency``.
    zscore : float
        ``(ABC - expected) / sqrt(expected)``, with ``expected`` the expected
        count of the ABC cell returned by ``chi2_contingency``.
    """
    if df is None:
        df = df_gen_sp3

    # Build each boolean mask once instead of filtering the frame four times.
    both_AB = (df[A] == 1) & (df[B] == 1)
    not_both_AB = (df[A] == 0) | (df[B] == 0)
    with_C = df[C] == 1
    without_C = df[C] == 0

    ABC = int((both_AB & with_C).sum())
    abC = int((not_both_AB & with_C).sum())
    ABc = int((both_AB & without_C).sum())
    abc = int((not_both_AB & without_C).sum())

    cntmat = np.array([[ABC, abC], [ABc, abc]])

    jaccard = ABC/(ABC + abC + ABc)
    dice = 2*ABC/((ABC+abC) + (ABC+ABc))

    LL, p, dof, ex = chi2_contingency(cntmat, correction=False, lambda_='log-likelihood')
    zscore = (ABC - ex[0, 0]) / np.sqrt(ex[0, 0])

    return cntmat, jaccard, dice, LL, zscore

In [None]:
A = 'Sp_L'
B = 'Sp_D'

# Every pattern other than A and B is tested as the third pattern.
_third_ptns = ['Mono', 'Bltc', 'Maze', 'St_H', 'St_D', 'St_V', 'Sddl', 'Eyes', 'Area']

# One row per third pattern; the leading contingency table returned by
# co_occ_sp3_measures_vs_AB is dropped and only the four measures are kept.
df_coocc_sp3_vs_Sp_LD = pd.DataFrame(
    [co_occ_sp3_measures_vs_AB(A, B, third)[1:] for third in _third_ptns],
    index=[third.replace('_', '-') for third in _third_ptns],
    columns=['Jaccard', 'Dice', 'Log-likelihood', 'Z-score'],
)


In [None]:
# Display the table of triple co-occurrence measures.
df_coocc_sp3_vs_Sp_LD

In [None]:
sns.set_context("talk")
sns.set_style("ticks")

fig, axes = plt.subplots(1, 4, figsize=(10, 4), sharey=True)

# One horizontal bar panel per measure. The table rows are passed in reversed
# order, exactly as in the original four copy-pasted calls.
_rows_reversed = df_coocc_sp3_vs_Sp_LD.iloc[::-1]
_panels = [('Jaccard', 'Purples'),
           ('Dice', 'Blues'),
           ('Z-score', 'Greens'),
           ('Log-likelihood', 'Reds')]

for _panel_ax, (_measure, _palette) in zip(axes, _panels):
    sns.barplot(x=_rows_reversed[_measure],
                y=_rows_reversed.index,
                color=sns.color_palette(_palette)[3],
                ax=_panel_ax)
    _panel_ax.set_xlabel(_measure)

# The y axis is shared, so setting the limits on the last panel applies to all.
axes[3].set_ylim(-0.6, 8.6)
# The original cell also ran `plt.yticks=True`. That assignment replaces the
# pyplot function `plt.yticks` with a bool for the rest of the session and
# configures nothing, so it is dropped.
plt.setp(axes[3].yaxis.get_majorticklabels(), rotation=90)

plt.show()
# plt.savefig('co-occ_triple_Jaccard-Dice-Z-LL.svg')