In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import statistics
import os
###############################################
from peakutils import indexes
from peakutils import baseline
from scipy.signal import find_peaks as fp
from scipy.signal import savgol_filter 
###############################################
from bokeh.plotting import figure , show
from bokeh.models import Range1d
from pybaselines import whittaker as pl

from bokeh.io import output_notebook
output_notebook()

In [3]:
def load_data(folder_path):
    """
    Combine the per-sample CSV spectra found in a folder into a single DataFrame.

    Every file in ``folder_path`` whose name ends with '.csv' is read. Files are
    ordered by the integer that follows the first '-' in the file name, so that
    e.g. ``CR300LA-2.csv`` comes before ``CR300LA-10.csv``.

    Parameters:
        folder_path (str): Directory containing the CSV files.

    Returns:
        DataFrame: First column 'wavelength' (taken from the first column of the
        first file in sorted order), followed by one column per file holding that
        file's second column, named after the file without its extension.

    Raises:
        FileNotFoundError: If the folder contains no '.csv' file.
        IndexError, ValueError: If a file name has no '-' or no integer after it
            (raised by the sort key).
    """
    # Collect the CSV files in the folder
    csv_files = [file_name for file_name in os.listdir(folder_path) if file_name.endswith('.csv')]
    if not csv_files:
        # Fail early with a clear message instead of an IndexError further down
        raise FileNotFoundError(f"No '.csv' files found in {folder_path!r}")

    # Sort numerically on the number after the first '-' (plain string sort would put -10 before -2)
    csv_files.sort(key=lambda x: int(x.split('-')[1].split('.')[0]))

    columns = []                    # wavelength column followed by one intensity column per file
    column_names = ['wavelength']

    for file_name in csv_files:
        df = pd.read_csv(os.path.join(folder_path, file_name))

        # The wavelength axis is taken once, from the first file, without re-reading it from disk
        if not columns:
            columns.append(df.iloc[:, 0])

        # Assuming intensity is in the second column
        columns.append(df.iloc[:, 1])
        column_names.append(os.path.splitext(file_name)[0])

    # Place all columns side by side and label them
    result_df = pd.concat(columns, axis=1)
    result_df.columns = column_names

    return result_df



# LOADING THE DATAFRAME

Please note that the '-x' suffix after the sample name encodes the number of cleaning shots: -1 -> 0 cleaning shots, -2 -> 5 cleaning shots, -3 -> 10 cleaning shots, ..., -12 -> 55 cleaning shots (i.e. '-x' corresponds to 5·(x − 1) cleaning shots).

In [4]:
# Load every CSV spectrum of the CR300LA (unpolished) folder into one wide DataFrame:
# a 'wavelength' column followed by one intensity column per CSV file.
df_raw = load_data('hLIBS_Spectra/UNPOLISHED/CR300LA')
df_raw

Unnamed: 0,wavelength,CR300LA-1,CR300LA-2,CR300LA-3,CR300LA-4,CR300LA-5,CR300LA-6,CR300LA-7,CR300LA-8,CR300LA-9,CR300LA-10,CR300LA-11,CR300LA-12,CR300LA-13,CR300LA-14,CR300LA-15
0,180.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,180.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,180.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,180.100000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,180.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23426,960.866667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23427,960.900000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23428,960.933333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23429,960.966667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
def baseline_correction(df, lam=0.1):
    """
    Perform baseline correction on the intensity columns of the input DataFrame and create a new DataFrame with corrected values.

    The baseline of every intensity column is estimated with ``pl.airpls``
    and subtracted from that column. The input DataFrame is not modified.

    Parameters:
        df (DataFrame): Input DataFrame containing a 'wavelength' column first, followed by the intensity columns.
        lam (float): Value forwarded to ``pl.airpls`` as its ``lam`` argument. Defaults to 0.1,
            the value this function previously hard-coded.

    Returns:
        DataFrame: New DataFrame with baseline-corrected intensity columns and the same wavelength column as the input DataFrame.
    """
    # Copy the 'wavelength' column from the input DataFrame
    new_df = pd.DataFrame({'wavelength': df['wavelength']})

    # Perform baseline correction for each intensity column and add them to the new DataFrame
    for col in df.columns[1:]:  # Exclude the 'wavelength' column
        # Named 'fitted_baseline' so the 'baseline' function imported from peakutils is not shadowed
        fitted_baseline, _ = pl.airpls(df[col], lam=lam)
        new_df[col] = df[col] - fitted_baseline

    return new_df

In [6]:
# NOTE(review): baseline_correction() is NOT applied here; the raw spectra are carried
# forward under the name 'df_baselinecorrected'. Confirm this is intentional.
# A copy is taken (instead of a second name for the same object) so that any later
# in-place processing of df_baselinecorrected cannot silently modify df_raw.
df_baselinecorrected = df_raw.copy()
df_baselinecorrected

Unnamed: 0,wavelength,CR300LA-1,CR300LA-2,CR300LA-3,CR300LA-4,CR300LA-5,CR300LA-6,CR300LA-7,CR300LA-8,CR300LA-9,CR300LA-10,CR300LA-11,CR300LA-12,CR300LA-13,CR300LA-14,CR300LA-15
0,180.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,180.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,180.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,180.100000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,180.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23426,960.866667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23427,960.900000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23428,960.933333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23429,960.966667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Normalization of the Spectra

This section focuses on two main normalizations:

1) Normalization by the maximum intensity of the spectrum
2) Normalization by the matrix peak, i.e. an Fe peak: 259.90 - 259.8999999999988 , 238.16666666666336 , 263.10000000000133

In [7]:
# Normalization by the Maximum Intensity

def normalize_intensities(df):
    """
    Normalize every intensity column by its own maximum value.

    The first column is treated as the wavelength axis and is passed through
    unchanged; every other column is divided by that column's maximum. The input
    DataFrame is left untouched, so re-running the calling cell always gives the
    same result.

    Parameters:
        df (DataFrame): First column is the wavelength axis, remaining columns are intensities.

    Returns:
        DataFrame: New DataFrame with the same columns and index as ``df`` and
        max-normalized intensity columns. A column whose maximum is 0 yields
        NaN/inf values, as with any division by zero in pandas.
    """
    intensities = df.iloc[:, 1:]

    # DataFrame / Series aligns the per-column maxima on the column labels,
    # so each column is divided by its own maximum in one vectorized step
    normalized = intensities / intensities.max()

    # Re-attach the untouched first (wavelength) column in front
    return pd.concat([df.iloc[:, [0]], normalized], axis=1)


In [8]:
# Normalization by the matrix Fe peak, i.e. Fe peak 234.35

# First step: isolate the narrow wavelength window that brackets the matrix peak

Matrix_Wavelength_Min = 234.2
Matrix_Wavelength_Max = 234.6

# Row mask: wavelengths inside the closed interval [min, max]
above_lower_bound = df_baselinecorrected['wavelength'] >= Matrix_Wavelength_Min
below_upper_bound = df_baselinecorrected['wavelength'] <= Matrix_Wavelength_Max
Matrix_df = df_baselinecorrected[above_lower_bound & below_upper_bound]

Selected_Matrix_Plot = figure(
    title='Selected Matrix Peak',
    x_axis_label='Wavelength',
    y_axis_label='Intensity',
    width=600,
    height=500,
)

# Overlay one red trace per sample (every column after 'wavelength')
for col_name in Matrix_df.columns[1:]:
    Selected_Matrix_Plot.line(Matrix_df['wavelength'], Matrix_df[col_name], line_width=2, color="red")

show(Selected_Matrix_Plot)
Matrix_df

Unnamed: 0,wavelength,CR300LA-1,CR300LA-2,CR300LA-3,CR300LA-4,CR300LA-5,CR300LA-6,CR300LA-7,CR300LA-8,CR300LA-9,CR300LA-10,CR300LA-11,CR300LA-12,CR300LA-13,CR300LA-14,CR300LA-15
1627,234.233333,209.918472,221.522375,247.2683,1037.825632,804.21624,1527.622702,2376.228127,1040.012037,3168.930798,3928.254342,4204.611821,4588.312797,3093.01951,5190.568353,5908.738419
1628,234.266667,229.351345,228.067323,279.220444,1858.40348,1415.160143,2776.638726,4429.697257,1854.844232,5803.7153,6970.314343,7383.273918,7968.577488,5614.75865,8917.21283,9743.912308
1629,234.3,254.828902,240.768157,323.940039,2934.134323,2218.814786,4407.959076,7150.394198,2926.163493,9313.140313,11102.331335,11686.081005,12489.802368,9031.041712,13820.589025,14747.18695
1630,234.333333,275.62407,252.712493,359.244033,3757.887572,2836.810937,5660.808205,9279.033552,3748.363868,12096.944751,14486.823001,15208.435037,16170.418408,11795.903205,17781.721934,18809.658593
1631,234.366667,280.085333,256.098104,360.892523,3777.896324,2857.257247,5707.562063,9443.069053,3771.05201,12408.658025,15112.337087,15866.039858,16844.507385,12218.434469,18489.034973,19632.10965
1632,234.4,265.563229,248.848051,325.887507,2939.597033,2237.375519,4463.65917,7481.510333,2938.965525,10021.285586,12661.158368,13328.464598,14174.502629,10053.251011,15589.740252,16844.202679
1633,234.433333,240.998278,236.256605,278.579566,1828.566895,1409.614367,2801.218281,4793.224333,1834.314583,6664.13206,8996.22882,9523.349669,10176.80856,6916.055761,11252.956526,12559.000146
1634,234.466667,215.80156,223.893014,243.774342,1039.312649,813.454469,1606.120926,2802.024613,1048.307528,4100.487491,6024.75935,6424.375992,6915.332342,4457.325691,7698.416175,8941.995296
1635,234.5,195.228503,214.33971,227.197727,695.377086,544.44559,1066.311892,1829.218687,704.649857,2749.120539,4245.615883,4549.713315,4929.676597,3083.021917,5508.866289,6583.344024
1636,234.533333,181.453097,207.964385,220.427893,571.081158,443.111219,852.212813,1378.600984,580.391651,2031.831355,3113.370352,3338.263455,3629.165426,2276.291572,4053.22217,4905.082656


In [9]:
def normalize_data(matrix_df, baseline_df):
    """
    Normalize every spectrum by the intensity of its matrix peak.

    For each sample column of ``matrix_df`` (all columns after the first) a peak is
    searched with ``find_peaks(..., prominence=10)``; the intensity of the first
    peak found is used as the reference. When no peak satisfies the prominence
    criterion (e.g. the maximum sits on the edge of the window, which
    ``find_peaks`` never reports), the maximum of the window is used instead of
    failing with an IndexError.

    Parameters:
        matrix_df (DataFrame): Rows of the spectra restricted to the wavelength
            window around the matrix peak; first column is the wavelength.
        baseline_df (DataFrame): Full spectra to normalize; must contain a
            'wavelength' column and the same sample columns as ``matrix_df``.

    Returns:
        DataFrame: 'wavelength' column of ``baseline_df`` followed by one column
        per sample, each equal to the ``baseline_df`` column divided by that
        sample's reference intensity.

    Raises:
        ValueError: If ``matrix_df`` has no rows.
    """
    if len(matrix_df) == 0:
        raise ValueError("matrix_df has no rows: the matrix-peak wavelength window is empty")

    # Collect the columns first and build the DataFrame once at the end
    normalized_columns = {'wavelength': baseline_df['wavelength']}

    for sample_col in matrix_df.columns[1:]:
        window = matrix_df[sample_col].to_numpy()

        # Locate the matrix peak inside the window
        peaks, _ = fp(window, prominence=10)

        # First detected peak, or the window maximum when nothing was detected
        reference_intensity = window[peaks[0]] if len(peaks) else window.max()

        normalized_columns[sample_col] = baseline_df[sample_col] / reference_intensity

    return pd.DataFrame(normalized_columns)

In [10]:
#NORMALIZATION BY MATRIX PEAK
# Each sample column of df_baselinecorrected is divided by the intensity of the peak
# that normalize_data finds for that sample inside the Matrix_df wavelength window.
df_normalized = normalize_data(matrix_df=Matrix_df , baseline_df=df_baselinecorrected)
df_normalized


Unnamed: 0,wavelength,CR300LA-1,CR300LA-2,CR300LA-3,CR300LA-4,CR300LA-5,CR300LA-6,CR300LA-7,CR300LA-8,CR300LA-9,CR300LA-10,CR300LA-11,CR300LA-12,CR300LA-13,CR300LA-14,CR300LA-15
0,180.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,180.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,180.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,180.100000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,180.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23426,960.866667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23427,960.900000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23428,960.933333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23429,960.966667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# #NORMALIZATION BY MAXIMUM INTENSITY

# df_normalized = normalize_intensities(df_baselinecorrected)
# df_normalized

# Plotting the Zinc Peak
The Zn peaks for handheld LIBS are located at Zn 213.86 and Zn 202.56.

In [12]:
# Wavelength windows bracketing the two Zn lines (Zn 213.86 and Zn 202.56).
# The bounds are plain floats: a trailing comma after the value would turn each
# of them into a 1-element tuple, and the comparisons below would then only
# work through accidental broadcasting.
Zn1_Peak_Min = 213.7
Zn2_Peak_Min = 202.4
Zn1_Peak_Max = 214.1
Zn2_Peak_Max = 202.8

# Keep only the rows whose wavelength lies inside each (inclusive) window
Zn1_df = df_normalized[(df_normalized['wavelength'] >= Zn1_Peak_Min) & (df_normalized['wavelength'] <= Zn1_Peak_Max)]
Zn2_df = df_normalized[(df_normalized['wavelength'] >= Zn2_Peak_Min) & (df_normalized['wavelength'] <= Zn2_Peak_Max)]

Zn1_df

Unnamed: 0,wavelength,CR300LA-1,CR300LA-2,CR300LA-3,CR300LA-4,CR300LA-5,CR300LA-6,CR300LA-7,CR300LA-8,CR300LA-9,CR300LA-10,CR300LA-11,CR300LA-12,CR300LA-13,CR300LA-14,CR300LA-15
1012,213.733333,36.489226,46.75559,32.904422,2.966312,3.562786,2.106497,1.150181,3.066589,0.826795,0.563017,0.581105,0.49226,0.645846,0.529692,0.381217
1013,213.766667,55.37118,67.034787,48.045021,4.457066,5.440395,3.154619,1.725248,4.250655,1.249306,0.811187,0.865888,0.772594,1.006756,0.791164,0.614975
1014,213.8,73.338035,85.951211,62.22111,5.867052,7.21251,4.153056,2.266793,5.32158,1.648111,1.040973,1.132857,1.039376,1.347453,1.036815,0.838982
1015,213.833333,83.995871,97.429386,70.733996,6.705325,8.231346,4.770083,2.581561,5.941184,1.878823,1.170169,1.283824,1.187451,1.533243,1.175281,0.960697
1016,213.866667,84.770239,98.843464,71.589885,6.769461,8.24726,4.856682,2.593261,5.974401,1.885049,1.168506,1.282386,1.179323,1.51753,1.172705,0.948376
1017,213.9,77.289635,91.363577,65.771459,6.174519,7.448732,4.464891,2.3543,5.509671,1.707225,1.062862,1.16013,1.051956,1.350665,1.058505,0.837175
1018,213.933333,63.4952,76.494438,54.515195,5.060884,6.054664,3.666002,1.926491,4.652605,1.392688,0.883711,0.952955,0.846623,1.088313,0.866118,0.665798
1019,213.966667,46.63132,57.620462,40.416345,3.690356,4.390676,2.651529,1.409247,3.585451,1.014712,0.670433,0.707828,0.61119,0.790695,0.639315,0.474454
1020,214.0,30.875901,39.472496,27.043456,2.411704,2.858826,1.699102,0.928915,2.545352,0.665153,0.468786,0.479642,0.398243,0.521295,0.429271,0.304432
1021,214.033333,19.93031,26.278803,17.573237,1.533866,1.813955,1.054883,0.59729,1.745981,0.425203,0.319571,0.31706,0.253962,0.335905,0.281281,0.191791


In [13]:
# Selected_Zn2_Plot = figure(title='Selected Zn2 plot', x_axis_label='Wavelength', y_axis_label='Intensity', width=300, height=500)

# for col_num, col_name in enumerate(Zn2_df.columns[1:], start=1):
#     intensity_values = Zn2_df[col_name]
#     Selected_Zn2_Plot.line(Zn2_df['wavelength'], intensity_values, line_width=2, color="red")

# show(Selected_Zn2_Plot)

In [14]:

# Windowed, normalized spectra around the two Zn lines
df1 = Zn1_df
df2 = Zn2_df

# Peak intensity of each sample = maximum of its intensity column inside the window.
# Every column after 'wavelength' is used, so the number of samples is not hard-coded.
peak_intensities_df1 = df1.iloc[:, 1:].max().tolist()
peak_intensities_df2 = df2.iloc[:, 1:].max().tolist()

# x-axis: cleaning pulses, starting at 0 and increasing by 5 for each successive sample
PULSES_PER_STEP = 5
x_values = list(range(0, PULSES_PER_STEP * len(peak_intensities_df1), PULSES_PER_STEP))

# Plotting
plot = figure(title='CR300LA UNPOLISHED - Normalization by Matrix Peak i.e Fe 234.35', x_axis_label='Cleaning Pulses', y_axis_label='Normalized Peak Intensity')
plot.scatter(x_values, peak_intensities_df1, legend_label='Zn 213.86', marker='+', color='blue', size=4)
plot.scatter(x_values, peak_intensities_df2, legend_label='Zn 202.56', marker='+', color='red', size=4)
plot.legend.location = "top_right"
plot.title.align = 'center'

# Set axis ranges; the x range ends one step after the last sample
plot.y_range = Range1d(start=0, end=100)
plot.x_range = Range1d(start=(-1), end=PULSES_PER_STEP * len(x_values))

# Connect markers with lines: one line glyph per series instead of one glyph per segment
plot.line(x_values, peak_intensities_df1, line_color='blue')
plot.line(x_values, peak_intensities_df2, line_color='red')

# Show plot
show(plot)
