This notebook generates Figure 14 in Echols, Rocap, and Riser (). To do so, it performs a curve-fitting routine on individual profiles from the original dataset. Additional cells are included here to allow the user to test out curve-fitting on the group-average profiles presented in the paper. This code also produces Supplemental Figures 3 and 7.

In [None]:
#Import packages
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chisquare
from scipy.stats import norm
import xarray as xr
from scipy.optimize import curve_fit
from lmfit import Model
import pandas as pd

from scipy import optimize

In [None]:
#Define fit functions: note that this is a restricted list, from which composite functions involving 2 or more
#of these basic shapes can be constructed. 

def gaussian(depth, amplitude, mean_depth, width):
    """Gaussian-shaped profile evaluated at ``depth``.

    Parameters
    ----------
    depth : scalar or array
        Depth(s) at which to evaluate the curve.
    amplitude : float
        Peak value, reached where ``depth == mean_depth``.
    mean_depth : float
        Depth of the peak.
    width : float
        Denominator of the exponent. Note this is not a standard deviation:
        the exponent is ``-(depth - mean_depth)**2 / width``, so ``width``
        plays the role of ``2*sigma**2``.

    Returns
    -------
    Same shape as ``depth``.
    """
    offset = depth - mean_depth
    squared_offset = offset**2
    return amplitude * np.exp(-squared_offset / width)

def sigmoid(depth, surface, zhalf, slope):
    """Logistic (sigmoid) profile evaluated at ``depth``.

    Parameters
    ----------
    depth : scalar or array
        Depth(s) at which to evaluate the curve.
    surface : float
        Asymptotic value of the curve (the numerator of the logistic).
    zhalf : float
        Depth at which the curve equals ``surface / 2``.
    slope : float
        Steepness of the transition; multiplies ``(zhalf - depth)`` inside
        the exponential.

    Returns
    -------
    Same shape as ``depth``.
    """
    exponent = (zhalf - depth) * slope
    denominator = 1 + np.exp(exponent)
    return surface / denominator

def exponential(depth, surface, zhalf):
    """Exponential decay profile evaluated at ``depth``.

    Parameters
    ----------
    depth : scalar or array
        Depth(s) at which to evaluate the curve.
    surface : float
        Value of the curve at ``depth == 0``.
    zhalf : float
        Depth over which the curve halves (the decay rate is
        ``ln(2) / zhalf``).

    Returns
    -------
    Same shape as ``depth``.
    """
    decay_rate = -np.log(2)/(zhalf)
    return surface*np.exp(decay_rate*depth)

def line(depth, slope, intercept):
    """Straight-line profile evaluated at ``depth``.

    Parameters
    ----------
    depth : scalar or array
        Depth(s) at which to evaluate the line.
    slope : float
        Change in value per unit depth.
    intercept : float
        Value of the line at ``depth == 0``.

    Returns
    -------
    Same shape as ``depth``.
    """
    # Use the ``depth`` argument: the previous body read a global ``x``, which
    # only worked when a notebook-level ``x`` happened to equal the depths
    # being fitted.
    return slope*depth + intercept

In [None]:
#Data paths
#NOTE: these are absolute, machine-specific paths; edit them to point at your own copies before running.
#file with all of the individual profiles used in the study:
all_data_file='/Users/rosalindechols/Documents/Generals/Self_Shading_Research/Data/Argo_Merged/CHL_Data/all_chla_argo_250_5m.nc'

#file with the weighted average cluster profiles developed in the study:
gmm_file='/Users/rosalindechols/Documents/Generals/Self_Shading_Research/Analysis/PCA_GMM/FINAL/17pcs_21clusters/gmm_results_17pcs_21clusters.nc' 

#path for saving calculations and figures, if desired
sfpath='/Users/rosalindechols/Documents/Generals/Self_Shading_Research/Analysis/PCA_GMM/FINAL/17pcs_21clusters/Curve Fits/'


#Options (these must be assignments; '==' only compares and would raise NameError on a fresh kernel)
#set whether figures and calculations are written to sfpath:
save_figs=False
save_calcs=False
#set whether you wish to find curve fits on raw profiles or weighted average profiles:
averages=False

#import data:
print('Import data')

#individual profiles when averages is False, cluster-average profiles otherwise
if averages==False:
    data=xr.open_dataset(all_data_file)
else:
    data=xr.open_dataset(gmm_file)

In [None]:
#cell for calculating profiles based on individual profiles:
#For every profile, six candidate curves are fitted with lmfit and the fitted parameters,
#AIC and chi-square are stored. Columns of AIC/chisq (and panels of the supplemental figure):
#  0 Gaussian, 1 Sigmoid, 2 Exponential, 3 Gaussian+Line, 4 Gaussian+Sigmoid, 5 Gaussian+Exponential
#A fit that raises ValueError is skipped and its entries stay NaN.
make_supp_fig=True
#0-based indices of the profiles drawn for Supplemental Figure 3, mapped to the panel letter
supp_fig_panels={10000:'a',12000:'b'}

if averages==False:
    #pull the arrays out once rather than on every loop iteration
    chla=data['CHLA'].values
    depths=data['DEPTH'].values
    length=len(data['CHLA'])
    params=np.nan*np.ones((length,24))
    AIC=np.nan*np.ones((length,6))
    chisq=np.nan*np.ones((length,6))
    #for loop to cycle through each profile:
    for n in range(0,length):
        y=chla[n]
        x=depths[n]
        
        #initial guesses for the fit parameters
        #surface value: mean of the first seven depth bins
        fsurf=np.nanmean(y[0:7])
        #half depth: first depth where the profile drops below half its surface value (200 if it never does)
        below_half=np.flatnonzero(y<y[0]/2)
        zhalf=x[below_half[0]] if below_half.size else 200
        #depth and magnitude of the profile maximum; the depth is read from this
        #profile's own depth axis (row n), consistent with zhalf and x
        peak=x[np.argmax(y)]
        amp=np.max(y)
        width=200
        
        #reset the optional results so a failed fit can never be confused with
        #the result left over from a previous profile
        result2=result3=result4=result5=result6=None
        
        #option 1: Gaussian
        mod1 = Model(gaussian)
        mod1.set_param_hint('amplitude', value=amp, min=0, max=50)
        mod1.set_param_hint('mean_depth', value=peak, min=0, max=200)
        mod1.set_param_hint('width', value=width,min=1, max=5000)
        pars = mod1.make_params()
        result1 = mod1.fit(y, pars, depth=x)
        
        params[n,0]=result1.params['amplitude'].value
        params[n,1]=result1.params['mean_depth'].value
        params[n,2]=result1.params['width'].value
        
        # print(result1.fit_report())
        AIC[n,0]=result1.aic
        chisq[n,0]=result1.chisqr
        
        # #option 2: Sigmoid
        try:
            mod2 = Model(sigmoid)
            mod2.set_param_hint('surface',value=fsurf,min=0,max=50)
            mod2.set_param_hint('zhalf',value=zhalf, min=10,max=250)
            mod2.set_param_hint('slope',value=-1)
            pars = mod2.make_params()
            result2 = mod2.fit(y, pars, depth=x)
            
            params[n,3]=result2.params['surface'].value
            params[n,4]=result2.params['zhalf'].value
            params[n,5]=result2.params['slope'].value
            
            # print(result2.fit_report())
            AIC[n,1]=result2.aic
            chisq[n,1]=result2.chisqr
        except ValueError:
            #fit failed: leave this model's entries as NaN
            result2=None
        
        #option 3: Exponential
        try:
            mod3 = Model(exponential)
            mod3.set_param_hint('surface',value=amp,min=0.04,max=50)
            mod3.set_param_hint('zhalf',value=zhalf,min=0,max=250)
            pars=mod3.make_params()
            result3 = mod3.fit(y, pars, depth=x)
            
            params[n,6]=result3.params['surface'].value
            params[n,7]=result3.params['zhalf'].value
            AIC[n,2]=result3.aic
            chisq[n,2]=result3.chisqr
        except ValueError:
            result3=None
        
        
        #can just combine individual models:
        #Gaussian + Line
        try:
            mod4 = Model(gaussian) + Model(line)
            mod4.set_param_hint('amplitude', value=amp, min=0, max=50)
            mod4.set_param_hint('mean_depth', value=peak, min=0, max=200)
            mod4.set_param_hint('width', value=width, min=1, max=5000)
            mod4.set_param_hint('slope',value=-0.0001,min=-0.25,max=0.1)
            mod4.set_param_hint('intercept',value=fsurf,min=0,max=50)
            pars=mod4.make_params()
            result4 = mod4.fit(y, pars, depth=x)
            
            params[n,8]=result4.params['amplitude'].value
            params[n,9]=result4.params['mean_depth'].value
            params[n,10]=result4.params['width'].value
            params[n,11]=result4.params['slope'].value
            params[n,12]=result4.params['intercept'].value
            AIC[n,3]=result4.aic
            chisq[n,3]=result4.chisqr
        except ValueError:
            result4=None
        
        #Gaussian + Sigmoid
        try:
            mod5 = Model(gaussian) + Model(sigmoid)
            mod5.set_param_hint('amplitude', value=amp/2, min=0, max=50)
            mod5.set_param_hint('mean_depth', value=peak, min=0, max=200)
            mod5.set_param_hint('width', value=width, min=1, max=5000)
            mod5.set_param_hint('surface',value=fsurf,min=0,max=50)
            mod5.set_param_hint('zhalf',value=zhalf, min=10,max=250)
            mod5.set_param_hint('slope',value=-1)
            pars=mod5.make_params()
            result5 = mod5.fit(y, pars, depth=x)
            
            params[n,13]=result5.params['amplitude'].value
            params[n,14]=result5.params['mean_depth'].value
            params[n,15]=result5.params['width'].value
            params[n,16]=result5.params['surface'].value
            params[n,17]=result5.params['zhalf'].value
            params[n,18]=result5.params['slope'].value
            
            AIC[n,4]=result5.aic
            chisq[n,4]=result5.chisqr
        except ValueError:
            result5=None
        
        #Gaussian + Exponential
        try:
            mod6 = Model(gaussian) + Model(exponential)
            mod6.set_param_hint('amplitude', value=amp/2, min=0, max=50)
            mod6.set_param_hint('mean_depth', value=peak, min=0, max=200)
            mod6.set_param_hint('width', value=width, min=1, max=5000)
            mod6.set_param_hint('surface',value=fsurf,min=0,max=50)
            mod6.set_param_hint('zhalf',value=zhalf,min=0,max=100)
            pars=mod6.make_params()
            result6 = mod6.fit(y, pars,depth=x)
            
            #store the parameters of THIS fit (result6), not the Gaussian+Sigmoid one
            params[n,19]=result6.params['amplitude'].value
            params[n,20]=result6.params['mean_depth'].value
            params[n,21]=result6.params['width'].value
            params[n,22]=result6.params['surface'].value
            params[n,23]=result6.params['zhalf'].value
            
            AIC[n,5]=result6.aic
            chisq[n,5]=result6.chisqr
        except ValueError:
            result6=None
        
        #Supplemental Figure 3: data, initial guess and best fit for all six curves
        if make_supp_fig==True and n in supp_fig_panels:
            print('Profile #%d' %(n+1))
            print(AIC[n])
            #index of the lowest (non-NaN) AIC / chi-square for this profile
            best_fit1=int(np.nanargmin(AIC[n]))
            print('Best choice #1: ', best_fit1+1,AIC[n,best_fit1])
            print(chisq[n])
            best_fit2=int(np.nanargmin(chisq[n]))
            print('Best choice #2: ', best_fit2+1,chisq[n,best_fit2])
            
            panels=[('Gaussian',result1),('Sigmoid',result2),('Exponential',result3),
                    ('Gaussian + Line',result4),('Gaussian + Sigmoid',result5),
                    ('Gaussian + Exponential',result6)]
            
            fig=plt.figure(figsize=(16,8))
            for k,(title,result) in enumerate(panels):
                ax=fig.add_subplot(2,3,k+1)
                ax.set_title(title,fontsize=16)
                ax.plot(y, x, 'bo')
                #a model whose fit failed has no curves to draw; show the data only
                if result is not None:
                    ax.plot(result.init_fit, x, 'k--', label='initial fit')
                    ax.plot(result.best_fit, x, 'r-', label='best fit')
                    ax.legend(loc='best')
                #depth increases downward
                ax.set_ylim(250,0)
            
            if save_figs==True:
                fig.savefig(sfpath+'Figure_S3%s.pdf'%(supp_fig_panels[n]),bbox_inches='tight',format='pdf')
                plt.close(fig)
            else:
                plt.show()

In [None]:
#cell for calculating curve fits on the weighted-average profile of each group:
#Columns of AIC/chisq:
#  0 Gaussian, 1 Sigmoid, 2 Exponential, 3 Gaussian+Line, 4 Gaussian+Sigmoid, 5 Gaussian+Exponential
number_of_groups=21

if averages==True:
    new_groups=data['gmm_new_groups'].values
    #pull the arrays out once rather than on every loop iteration
    chla_ave=data['CHLA_ave'].values
    #the first row of DEPTH is used as the depth axis for every average profile
    x=data['DEPTH'].values[0]
    
    params=np.nan*np.ones((number_of_groups,24))
    #NaN (rather than 0) marks an entry that was never filled in
    AIC=np.nan*np.ones((number_of_groups,6))
    chisq=np.nan*np.ones((number_of_groups,6))
    #for loop to cycle through each group:
    for n in range(0,number_of_groups):
        
        y=chla_ave[new_groups[n]]
        
        #initial guesses for the fit parameters
        #surface value: mean of the first seven depth bins
        fsurf=np.nanmean(y[0:7])
        #half depth: first depth where the profile drops below half its surface value (200 if it never does)
        below_half=np.flatnonzero(y<y[0]/2)
        zhalf=x[below_half[0]] if below_half.size else 200
        #depth and magnitude of the profile maximum
        peak=x[np.argmax(y)]
        amp=np.max(y)
        width=200
        
        #option 1: Gaussian
        mod1 = Model(gaussian)
        mod1.set_param_hint('amplitude', value=amp, min=0, max=50)
        mod1.set_param_hint('mean_depth', value=peak, min=0, max=200)
        mod1.set_param_hint('width', value=width,min=1, max=5000)
        pars = mod1.make_params()
        result1 = mod1.fit(y, pars, depth=x)
        
        params[n,0]=result1.params['amplitude'].value
        params[n,1]=result1.params['mean_depth'].value
        params[n,2]=result1.params['width'].value
        
        AIC[n,0]=result1.aic
        chisq[n,0]=result1.chisqr
        
        # #option 2: Sigmoid
        mod2 = Model(sigmoid)
        mod2.set_param_hint('surface',value=fsurf,min=0,max=50)
        mod2.set_param_hint('zhalf',value=zhalf, min=10,max=250)
        mod2.set_param_hint('slope',value=-1)
        pars = mod2.make_params()
        result2 = mod2.fit(y, pars, depth=x)
        
        params[n,3]=result2.params['surface'].value
        params[n,4]=result2.params['zhalf'].value
        params[n,5]=result2.params['slope'].value
        
        AIC[n,1]=result2.aic
        chisq[n,1]=result2.chisqr
        
        #option 3: Exponential
        mod3 = Model(exponential)
        mod3.set_param_hint('surface',value=fsurf,min=0,max=50)
        mod3.set_param_hint('zhalf',value=zhalf,min=0,max=100)
        pars=mod3.make_params()
        result3 = mod3.fit(y, pars, depth=x)
        
        params[n,6]=result3.params['surface'].value
        params[n,7]=result3.params['zhalf'].value
        
        AIC[n,2]=result3.aic
        chisq[n,2]=result3.chisqr
        
        #can also combine individual models:
        #Gaussian + Line
        mod4 = Model(gaussian) + Model(line)
        mod4.set_param_hint('amplitude', value=amp, min=0, max=50)
        mod4.set_param_hint('mean_depth', value=peak, min=0, max=200)
        mod4.set_param_hint('width', value=width, min=1, max=5000)
        mod4.set_param_hint('slope',value=-0.0001,min=-0.25,max=0.1)
        mod4.set_param_hint('intercept',value=fsurf,min=0,max=50)
        pars=mod4.make_params()
        result4 = mod4.fit(y, pars, depth=x)
        
        params[n,8]=result4.params['amplitude'].value
        params[n,9]=result4.params['mean_depth'].value
        params[n,10]=result4.params['width'].value
        params[n,11]=result4.params['slope'].value
        params[n,12]=result4.params['intercept'].value
        
        #record the goodness of fit for this model too (column 3 was previously never filled)
        AIC[n,3]=result4.aic
        chisq[n,3]=result4.chisqr
        
        #Gaussian + Sigmoid
        mod5 = Model(gaussian) + Model(sigmoid)
        mod5.set_param_hint('amplitude', value=amp/2, min=0, max=50)
        mod5.set_param_hint('mean_depth', value=peak, min=0, max=200)
        mod5.set_param_hint('width', value=width, min=1, max=5000)
        mod5.set_param_hint('surface',value=fsurf,min=0,max=50)
        mod5.set_param_hint('zhalf',value=zhalf, min=10,max=250)
        mod5.set_param_hint('slope',value=-1)
        pars=mod5.make_params()
        result5 = mod5.fit(y, pars, depth=x)
        
        params[n,13]=result5.params['amplitude'].value
        params[n,14]=result5.params['mean_depth'].value
        params[n,15]=result5.params['width'].value
        params[n,16]=result5.params['surface'].value
        params[n,17]=result5.params['zhalf'].value
        params[n,18]=result5.params['slope'].value
        
        AIC[n,4]=result5.aic
        chisq[n,4]=result5.chisqr
        
        #Gaussian + Exponential
        #(surface and zhalf are left unbounded here, unlike in the individual-profile fits)
        mod6 = Model(gaussian) + Model(exponential)
        mod6.set_param_hint('amplitude', value=amp/2, min=0, max=50)
        mod6.set_param_hint('mean_depth', value=peak, min=0, max=200)
        mod6.set_param_hint('width', value=width, min=1, max=5000)
        mod6.set_param_hint('surface',value=fsurf)#,min=0,max=50)
        mod6.set_param_hint('zhalf',value=zhalf)#,min=0,max=100)
        pars=mod6.make_params()
        result6 = mod6.fit(y, pars,depth=x)
        
        #store the parameters of THIS fit (result6), not the Gaussian+Sigmoid one
        params[n,19]=result6.params['amplitude'].value
        params[n,20]=result6.params['mean_depth'].value
        params[n,21]=result6.params['width'].value
        params[n,22]=result6.params['surface'].value
        params[n,23]=result6.params['zhalf'].value
        

        AIC[n,5]=result6.aic
        chisq[n,5]=result6.chisqr

In [None]:
#Optionally write the fit results to sfpath: the fitted parameters as a CSV (one row per
#profile/group) and the AIC and chi-square arrays as comma-delimited text.
if save_calcs==True:
    #column names for the 24 parameter columns; prefixes: g=Gaussian, s=Sigmoid, e=Exponential,
    #gl=Gaussian+Line, gs=Gaussian+Sigmoid, ge=Gaussian+Exponential
    param_labels=['g_amplitude','g_mean_depth','g_width','s_surface','s_zhalf','s_slope',
                  'e_surface','e_zhalf','gl_amplitude','gl_mean_depth','gl_width','gl_slope',
                  'gl_intercept','gs_amplitude','gs_mean_depth','gs_width','gs_surface',
                  'gs_zhalf','gs_slope','ge_amplitude','ge_mean_depth','ge_width',
                  'ge_surface','ge_zhalf']
    
    df2 = pd.DataFrame(params,columns=param_labels)
    
    df2.to_csv(sfpath+'fit_parameters.csv')
    
    np.savetxt(sfpath+'AIC_curve_fits.txt',AIC, delimiter=",")
    np.savetxt(sfpath+'chisq_curve_fits.txt',chisq, delimiter=",")

In [None]:
print('Specify subgroup of profiles of interest')
#Southern Ocean:
subset=np.flatnonzero(data['LATITUDE'].values<-40)
#all profiles:
#subset=range(len(AIC))

#Slice once, outside the loop; re-slicing inside the loop copies the whole subset on every iteration.
#AIC and chisq are the arrays computed by the curve-fitting cell.
aic_subset=AIC[subset]
chisq_subset=chisq[subset]
#NOTE(review): new_labels is not defined anywhere in the visible notebook. It is presumably the
#per-profile group label (0-20) from the clustering results - define/load it before running this cell.
labels_subset=new_labels[subset]

#index (0-5) of the best curve for each profile in the subset, by AIC and by chi-square
best_fit=np.zeros(len(aic_subset))
best_fit_chisq=np.zeros(len(aic_subset))

#number of profiles in each group (rows) for which each curve (columns) is the best fit
curve_distrib=np.zeros((21,6))

curve_distrib_day=np.zeros((21,6))
curve_distrib_night=np.zeros((21,6))
print('Process data')
#Find best fit
for n in range(0,len(aic_subset)):
    #if you want to track the progress, you can do so by printing out loop iterations:
#     if n%5000==0:
#         print(n)
    #we calculate the minimum in both AIC and Chi-Square, but the analysis was done using AIC
    #users should feel free to compare the results (best_fit vs best_fit_chisq)
    #nanargmin ignores models whose fit failed (NaN) and raises ValueError if every model failed
    min1=int(np.nanargmin(aic_subset[n]))
    min2=int(np.nanargmin(chisq_subset[n]))
    best_fit[n]=min1
    best_fit_chisq[n]=min2

    curve_distrib[int(labels_subset[n]),min1]+=1  

Figure 14: This figure shows the breakdown of optimal curve fits for individual profiles within each cluster

In [None]:
print('Create Figure 14')

#Stacked bar chart: for each group, the fraction of its profiles best fit by each curve.
#label and colour for each column of curve_distrib, in stacking order
fit_styles=[('Gaussian','#1b9e77'),('Sigmoid','#d95f02'),('Exponential','#7570b3'),
            ('Gaussian+Line','#e7298a'),('Gaussian+Sigmoid','#66a61e'),
            ('Gaussian+Exponential','#e6ab02')]

#Normalise each group by its own number of profiles so every non-empty bar sums to 1 (100%).
#sum3 was previously used without ever being defined.
group_totals=curve_distrib.sum(axis=1)
#a group with no profiles in the subset gets a denominator of 1, giving an empty bar instead of 0/0
sum3=np.where(group_totals>0,group_totals,1)
fractions=curve_distrib/sum3[:,np.newaxis]

fig=plt.figure(figsize=(16,8))

group_index=np.arange(0,curve_distrib.shape[0])
#running top of the stack for each group
bottom=np.zeros(len(group_index))
for k,(label,color) in enumerate(fit_styles):
    plt.bar(group_index,fractions[:,k],bottom=bottom,label=label,color=color)
    bottom=bottom+fractions[:,k]

plt.legend(loc='best')
#groups are stored 0-based but labelled 1-21
plt.xticks([0,5,10,15,20],labels=['1','6','11','16','21'])
plt.yticks([0,0.2,0.4,0.6,0.8,1],labels=['0','20','40','60','80','100'])
plt.xlabel('Group',fontsize=18)
plt.ylabel('Percentage of Profiles',fontsize=18)
plt.tick_params(labelsize=16)

if save_figs==True:
    plt.savefig(sfpath+'Figure_14.pdf',bbox_inches='tight',format='pdf')
    plt.close()
else:
    plt.show()

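Supplemental Figure 7: This figure shows a histogram of the number of profiles in the selected subset for which each curve-fit option was the best fit.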

In [None]:
print('Create Figure S7')

#Histogram of the best-fit curve index (0-5) over the selected profiles.
#one unit-wide bin per curve option, with the tick label centred in each bin
bin_edges=np.arange(0,7)
tick_positions=[0.5,1.5,2.5,3.5,4.5,5.5]
tick_names=['Gaussian','Sigmoid','Exponential',
            'Gaussian +\n Line','Gaussian +\n Sigmoid','Gaussian + \n Exponential']

fig=plt.figure(figsize=(12,6))
vals=plt.hist(best_fit,bins=bin_edges,facecolor='lightgray',edgecolor='k',lw=2)
plt.xticks(ticks=tick_positions,labels=tick_names,rotation = 45)
plt.tick_params(labelsize=16)
plt.xlabel('Best Curve Fit Option',fontsize=18)
plt.ylabel('Number of Profiles', fontsize=18)

#show the figure interactively unless saving was requested
if save_figs!=True:
    plt.show()
else:
    plt.savefig(sfpath+'Figure_S7.pdf',format='pdf',bbox_inches='tight')
    plt.close()