In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the data
file_path = '0_batch_corrected.csv'
data = pd.read_csv(file_path, usecols=['Sample', 'SurfactinC'])

# Extract batch, sample type, and replicate information
data[['Batch', 'SampleType', 'Replicate']] = data['Sample'].str.split('_', expand=True)

# Add a 'Batch_SampleType' column for grouping
data['Batch_SampleType'] = data['Batch'] + '_' + data['SampleType']
#data["SurfactinC"]=data["SurfactinC"].apply(np.log)


In [2]:
# Calculate mean SurfactinC concentration for each 'Batch_SampleType' and sort
mean_concentration = data.groupby('Batch_SampleType')['SurfactinC'].mean().reset_index()
mean_concentration_sorted = mean_concentration.sort_values(by='SurfactinC')['Batch_SampleType']

In [3]:
refs=mean_concentration[mean_concentration["Batch_SampleType"].apply(lambda x: "QC" in x)].copy()

In [4]:
refs["Batch_SampleType"]=refs["Batch_SampleType"].apply(lambda x: x[0])

In [5]:
ref_dictio=dict(zip(refs["Batch_SampleType"].tolist(),refs["SurfactinC"].tolist()))

In [6]:
ref_dictio

{'0': 463714.472613722,
 '1': 463663.4289331288,
 '2': 463650.4212741394,
 '3': 463703.6732557308,
 '4': 463659.4765523662,
 '5': 463790.9248384902}

In [7]:
# Correcting the column name and performing the division
data['SurfactinC'] = data.apply(lambda row: row['SurfactinC'] / ref_dictio[row['Batch']], axis=1)

In [8]:
#data.to_csv("test.csv",index=False)

In [9]:
# Calculate mean SurfactinC concentration for each 'Batch_SampleType' and sort
mean_concentration = data.groupby('Batch_SampleType')['SurfactinC'].mean().reset_index()
mean_concentration_sorted = mean_concentration.sort_values(by='SurfactinC')['Batch_SampleType']

In [10]:
# Calculate mean SurfactinC concentration for each 'Batch_SampleType' and sort
var_concentration = data.groupby('Batch_SampleType')['SurfactinC'].sem().reset_index()
var_concentration_sorted = var_concentration.sort_values(by='SurfactinC')['Batch_SampleType']

In [11]:
mean_concentration.to_csv("SurfactinC_mean_no_log.csv",index=False)
mean_concentration.to_csv("SurfactinC_sem_no_log.csv",index=False)
data.to_csv("SurfactinC_data_no_log.csv",index=False)

In [12]:
 #Define the color palette
palette = {
    '0': '#0000FF',  # Blue
    '1': '#FF0000',  # Red
    '2': '#77DD77',  # Pastel Green
    '3': '#FDFD96',  # Pastel Yellow
    '4': '#800080',  # Purple
    '5': '#ADD8E6',  # Light Blue
}

# Map each batch to its corresponding color
data['Color'] = data['Batch'].map(lambda x: palette[x])

# Create and adjust the figure for plotting
plt.figure(figsize=(14, 8))
# Plot using the order specified by 'mean_concentration_sorted'
sns.boxplot(x='Batch_SampleType', y='SurfactinC', data=data,
            palette=data.set_index('Batch_SampleType')['Color'].to_dict(),
            order=mean_concentration_sorted)

plt.xticks(rotation=90,fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('Batch_SampleType',fontsize=20)
plt.ylabel('Surfactin C Titre (log)',fontsize=20)
plt.title('Surfactin C Titre by Batch and Sample Type Ordered by Mean Titre',fontsize=20)
plt.tight_layout()

# Saving the plot
save_path = 'surfactinc_concentration_boxplot_ordered_no_log_QC.png'
plt.savefig(save_path, dpi=600, transparent=True)
plt.close()  # Close the plot


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Batch_SampleType', y='SurfactinC', data=data,
