In [None]:
import numpy as np
import pandas as pd
from BeyondBlooms2024.config import name_dict
from BeyondBlooms2024.config_file import (ABUNDANCES_FILE, CCMN_CON_MAP_PATH, CON_LOUVAIN_META_PATH,CON_LOUVAIN_NETWORK_PATH, ENRICH
,NUM_PERMUTATIONS, NUM_SAMPLES, NUM_CORES, METADATA_FILE, PRUNED_PVAL_CCMN_PATH,PVAL_CCMN_PATH,ENRICHED_META_PATH, RANDOM_PVAL_CCMN_PATH)

In [None]:
# Load the environmental metadata table and index it by sampling date.
# NOTE(review): the path is hard-coded although METADATA_FILE is imported from the
# project config -- confirm whether both refer to the same file.
df = pd.read_csv("data/F4_euk_meta_data.csv", sep=";")

# Use the "date" column as a DatetimeIndex so later joins align on timestamps.
df_env_F4 = df.set_index("date")
df_env_F4.index = pd.to_datetime(df_env_F4.index)

# Quick sanity output: number of samples, then the table itself.
# (An optional date restriction such as .loc['2016-08-01':'2020-07-31'] is not applied.)
print(len(df_env_F4))
print(df_env_F4)

In [None]:
# Load the taxa table from the path given by the ENRICH config constant.
# The code below reads its "Nodes" and "cluster_names" columns.
df_taxa = pd.read_csv(ENRICH, sep=',', engine="python")

# Every node identifier listed in the table, in file order.
selected_asv = df_taxa["Nodes"].to_list()

In [None]:
def get_dict(col = "Class", taxa=None):
    """Map each node identifier to its value in one column of the taxa table.

    Parameters
    ----------
    col : str, default "Class"
        Name of the column whose values become the dictionary values.
    taxa : pandas.DataFrame, optional
        Table containing a "Nodes" column and the ``col`` column. When omitted,
        the notebook-level ``df_taxa`` is used, as before.

    Returns
    -------
    dict
        ``{node: value}``. If a node appears in several rows, the last row wins.

    Raises
    ------
    KeyError
        If "Nodes" or ``col`` is not a column of the table.
    """
    table = df_taxa if taxa is None else taxa
    # set_index returns a new frame, so the source table is never modified and
    # no in-place operation is performed on a column slice (the pattern that can
    # trigger pandas' SettingWithCopyWarning).
    return table.set_index("Nodes")[col].to_dict()

In [None]:
# Load the abundance table; its first (unnamed) column holds the sampling dates
# and becomes the DatetimeIndex. The index keeps the name "Unnamed: 0", which
# the cluster aggregation further down relies on.
df_abundance = (
    pd.read_csv(ABUNDANCES_FILE, sep=";")
    .assign(**{"Unnamed: 0": lambda frame: pd.to_datetime(frame["Unnamed: 0"])})
    .set_index("Unnamed: 0")
)

# Square-root transform, then scale every row (sample) to unit Euclidean length.
# For non-negative inputs this equals the square root of the row-relative abundance.
# A row consisting only of zeros has norm 0 and therefore becomes NaN.
df_sqrt = np.sqrt(df_abundance)
row_norms = np.linalg.norm(df_sqrt, axis=1)
df_normalized = df_sqrt.div(row_norms, axis=0)

# From here on `df_abundance` refers to the normalized table.
df_abundance = df_normalized
print(df_abundance.head())

In [None]:
# Collapse the per-ASV abundances into one column per cluster: for every value
# of "cluster_names" (in sorted order) sum the abundances of its member nodes.
l_cus = sorted(df_taxa["cluster_names"].unique().tolist())

cluster_totals = {}
for clu in l_cus:
    member_asvs = df_taxa.loc[df_taxa["cluster_names"] == clu, "Nodes"].tolist()
    # Column labels are the string form of the cluster identifier.
    cluster_totals[f"{clu}"] = df_abundance[member_asvs].sum(axis=1)

# One column per cluster, rows are sampling dates; name the index "date" so it
# lines up with the environmental table (rename_axis returns a new object and
# leaves the abundance table's index untouched).
df_clus = pd.concat(cluster_totals, axis=1).rename_axis("date")
df_clus.head()

In [None]:
# Display the column labels of the aggregated cluster table (one per cluster).
df_clus.columns

In [None]:
# TODO (original author, "Namedict"): optionally restrict to a cluster subset first,
# e.g. df_clus[["0","1","2","4","6","7","8","9","10","11"]] -- currently not applied.

# Relabel the cluster columns via the project's `name_dict`; labels that are
# not covered by the mapping stay unchanged.
df_clus = df_clus.rename(columns=name_dict)

# Final cluster column labels, reused when computing p-values.
clu_cols = df_clus.columns.to_list()

In [None]:
# Display the available environmental columns before selecting a subset.
df_env_F4.columns

In [None]:
# Environmental parameters that are correlated against the clusters
# (a hand-picked subset instead of all of df_env_F4.columns).
cols = ['MLD', 'PAR', 'temp', "sal", "PW_frac", "O2_conc", "depth"]

# The source table calls the light column 'PAR_satellite'; expose it as 'PAR'.
df_env_F4 = df_env_F4.rename(columns={'PAR_satellite': 'PAR'})

# Keep only the selected parameters (raises KeyError if one is missing).
df_env = df_env_F4.loc[:, cols]


In [None]:
# Fill gaps in "O2_conc" with the mean of the surrounding samples.
window_size = 8  # neighbourhood spans window_size // 2 rows on each side
half_window = window_size // 2

# Work on a positional (0..n-1) index so row arithmetic is straightforward.
df_env = df_env.reset_index()
n_rows = len(df_env)

for i in range(n_rows):
    if not pd.isna(df_env.at[i, 'O2_conc']):
        continue
    # Window is clipped at the table boundaries. Rows are processed in order,
    # so values filled earlier in this loop contribute to later means; if the
    # whole window is NaN the value stays NaN.
    start = max(0, i - half_window)
    end = min(n_rows, i + half_window + 1)
    df_env.at[i, 'O2_conc'] = df_env['O2_conc'].iloc[start:end].mean()

# Show the table before and after restoring the date index.
print(df_env)
df_env.set_index("date", inplace=True)
print(df_env)

In [None]:
# Combine cluster abundances with the environmental parameters, keeping only
# dates present in both tables (original author's note: "only the 94 events (here 93!)").
df_corr_gen = df_clus.join(df_env, how="inner")

# Full Pearson correlation matrix over all joined columns.
correlation_gen = df_corr_gen.corr(method='pearson')

# Keep just the cluster-vs-environment block: rows = clusters, columns = parameters.
target_df_gen = correlation_gen.loc[df_clus.columns, cols]
print(target_df_gen)

# Sorted list of the cluster labels, shown as the cell output.
a = sorted(target_df_gen.index.tolist())
a

In [None]:
# Alias for the cluster-by-environment correlation block; the plotting cells
# below read it under this name. This is the same object, not a copy.
df_corr = target_df_gen

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Font-size constants for the figures in this notebook.
SMALL_SIZE = 20
MEDIUM_SIZE = 20
BIGGER_SIZE = 20
MEDIUM = 20
TICK = 20
I_SIZE = 8
linewidth = 4

# Apply the sizes to matplotlib's rc settings, one rc group at a time.
rc_groups = {
    'font': {'size': SMALL_SIZE},                                # default text size
    'axes': {'titlesize': SMALL_SIZE, 'labelsize': MEDIUM},      # axes title, x/y labels
    'xtick': {'labelsize': TICK},                                # x tick labels
    'ytick': {'labelsize': TICK},                                # y tick labels
    'legend': {'fontsize': MEDIUM},                              # legend text
    'figure': {'titlesize': BIGGER_SIZE},                        # figure title
}
for rc_group, rc_values in rc_groups.items():
    plt.rc(rc_group, **rc_values)

# Correlations that could not be computed (NaN) are drawn as 0.
heatmap_data = df_corr.fillna(0)

# No further normalization is applied; the name is kept as a plain alias.
heatmap_data_normalized = heatmap_data

# Annotated heatmap: rows are clusters, columns are environmental parameters.
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(
    heatmap_data_normalized,
    cmap='coolwarm',
    cbar=True,
    annot=True,
    linewidths=1.5,
    ax=ax,
)
ax.set_xlabel('Environment parameter')
ax.set_ylabel('Cluster')

# Target path for the figure; saving is currently disabled.
save_path_temp =f'figures/Main_Figure_4_b__2023_08_08_Env_heatmap_Sec_Filter.png'
#plt.savefig(save_path_temp, dpi=200, bbox_inches='tight')

plt.show()

In [None]:
import scipy.stats as stats

# Pearson r and two-sided p-value for every (cluster, environmental parameter)
# pair. Rows follow `gen_col`, columns follow `env_col`, i.e. the same layout
# as the correlation block shown in the heatmaps.
# NOTE: the p-values are not corrected for multiple testing (gen_no * env_no tests).
gen_col = clu_cols
env_col = ['MLD', 'PAR', 'temp', "sal", "PW_frac", "O2_conc","depth"]
env_no = len(env_col)
gen_no = len(gen_col)

# Start from NaN so that pairs which cannot be tested are clearly marked
# instead of silently carrying p = 0 / r = 0.
m_1 = np.full((gen_no, env_no), np.nan)  # correlation coefficients
m = np.full((gen_no, env_no), np.nan)    # p-values

for i in range(gen_no):
    for j in range(env_no):
        # Use pairwise-complete observations, consistent with DataFrame.corr
        # used for the displayed coefficients. Feeding NaN to pearsonr would
        # yield p = NaN for the whole pair.
        pair = df_corr_gen[[gen_col[i], env_col[j]]].dropna()
        if len(pair) < 2:
            # pearsonr needs at least two observations; leave the entry as NaN.
            continue
        r_val, p_val = stats.pearsonr(pair[gen_col[i]], pair[env_col[j]])
        m[i, j] = p_val
        m_1[i, j] = r_val

p_values = m
cor_values = m_1

# True = hide the cell in the plot. Written as "not significant" so that NaN
# p-values (untestable or constant series) are hidden too; `p_values > 0.05`
# would evaluate to False for NaN and display them as if they were significant.
mask = ~(p_values <= 0.05)

df_p_values = pd.DataFrame(p_values)
index_val = gen_col

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy

# Font-size constants and rc settings (repeated here so the cell also works
# when run on its own).
SMALL_SIZE = 20
MEDIUM_SIZE = 20
BIGGER_SIZE = 20
MEDIUM= 20
TICK = 20
I_SIZE = 8
linewidth=4
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=TICK)    # fontsize of the tick labels
plt.rc('ytick', labelsize=TICK)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM)   # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)

sns.set_style('white')

# Hierarchical clustering cannot handle NaN, so missing correlations become 0.
heatmap_data = df_corr.fillna(0)

# sns.clustermap creates and owns its figure. The former
# `plt.figure(figsize=(20, 30))` call only produced an extra, empty figure and
# never affected this plot, so it was removed. Pass `figsize=` to clustermap
# if a different canvas size is wanted.
# Rows (clusters) are reordered by the dendrogram; seaborn reorders `mask`
# together with the data, so the mask is given in the original row order.
clustermap = sns.clustermap(
    heatmap_data,
    cmap='coolwarm',
    annot=True,
    linewidths=2.5,
    col_cluster=False,
    row_cluster=True,
    mask=mask,  # True hides a cell (non-significant correlations)
    rasterized=False,
    cbar_kws={'drawedges': False},
)

# Address the axes explicitly instead of relying on pyplot's "current axes".
colorbar_ax = clustermap.cax
colorbar_ax.grid(False)
clustermap.ax_heatmap.set(ylabel="Cluster", xlabel="Environment Parameter")
clustermap.ax_heatmap.grid(False)

# Save through the ClusterGrid so the file always contains this figure,
# independent of which figure pyplot currently considers active.
save_path_temp =f'figures/Main_Figure_4_b__Significants_Env_Dendogram.png'
clustermap.savefig(save_path_temp, dpi=600, bbox_inches='tight')