In [18]:
import numpy as np
import pandas as pd
import mne
import pathlib

from scipy.stats import pearsonr
from scipy.stats import wilcoxon

from mne.preprocessing import ICA

In [19]:
# Define the DataFrame with appropriate columns
columns = ["Segment_ID", "RMS_Noisy", "RMS_Cleaned", "RMS_diff", "Correlation", "P_Value"]
results_df = pd.DataFrame(columns=columns)

In [20]:
data = {}
for filename in pathlib.Path("./lee2019-artifacts/").glob("*Artifact*.fif"):
    data[filename] = mne.io.read_raw(filename, preload=True, verbose=False).pick(
        ["O1", "O2", "Fp1", "Fp2"]
    )

In [21]:
segment_length = 2
all_segments = []

for filename, raw in data.items():
    n_segments = int(np.floor(raw.times[-1] / segment_length))

    for i in range(n_segments):
        start_sec = i * segment_length
        stop_sec = start_sec + segment_length
        stop_sec = min(stop_sec, raw.times[-1])

        cropped_raw = raw.copy().crop(
            tmin=start_sec, tmax=stop_sec - 1 / raw.info["sfreq"]
        )
        all_segments.append(cropped_raw)

In [22]:
def rms(raw):
    """
    Computes the Root Mean Square of the signal, providing a measure of the overall power in the signal.
    """
    data = raw.get_data()
    rms = np.sqrt(np.mean(data**2))
    return np.round(rms, 4)

In [23]:
def correlation(noisy, clean):
    """
    Calculates the Pearson correlation coefficient for each channel between the
    noisy and cleaned signals to assess how well the overall structure of the signal
    is preserved post-cleaning.
    """
    correlation_coeffs = []
    for ch in range(4):
        coeff, _ = pearsonr(noisy.get_data(picks=ch)[0], clean.get_data(picks=ch)[0])
        correlation_coeffs.append(coeff)

    average_corr = np.mean(correlation_coeffs)

    return np.round(average_corr, 4)

In [24]:
def significance(noisy, clean):
    """
    Uses the Wilcoxon signed-rank test to determine if there is a
    statistically significant difference between the noisy and cleaned EEG signals.
    """
    data_original = noisy.get_data().flatten()
    data_cleaned = clean.get_data().flatten()

    _, p = wilcoxon(data_original, data_cleaned)

    return p

In [25]:
def ica_pipeline(raw):
    raw.filter(1, 40, fir_design="firwin")
    raw.resample(256) 

    reconstructed = raw.copy()

    ica = ICA(
        n_components=4,
        max_iter="auto",
        method="picard",
        random_state=97,
    ).fit(reconstructed)

    eog_indices, eog_scores = ica.find_bads_eog(
        reconstructed, ch_name=["Fp1", "Fp2"], threshold=0.7, measure="correlation"
    )
    ica.exclude = eog_indices

    ica.apply(reconstructed)

    return reconstructed

In [26]:
for i, noisy_segment in enumerate(all_segments):
    cleaned = ica_pipeline(noisy_segment)

    rms_noisy = rms(noisy_segment)
    rms_cleaned = rms(cleaned)
    corr = correlation(noisy_segment, cleaned)
    p_value = significance(noisy_segment, cleaned)

    results_df = results_df.append(
        {
            "Segment_ID": i,
            "RMS_Noisy": rms_noisy,
            "RMS_Cleaned": rms_cleaned,
            "RMS_diff": np.abs(rms_cleaned - rms_noisy),
            "Correlation": corr,
            "P_Value": p_value,
        },
        ignore_index=True,
    )

  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df =

In [27]:
results_df.to_csv("ica_analysis_results.csv", index=False)

In [28]:
# Basic descriptive statistics
print(results_df.describe())

# Filter segments with significant improvements
significant_improvements = results_df[results_df["P_Value"] < 0.05]
print("Significant Improvements Count:", significant_improvements.shape[0])

        Segment_ID    RMS_Noisy  RMS_Cleaned     RMS_diff  Correlation  \
count  3093.000000  3093.000000  3093.000000  3093.000000  3091.000000   
mean   1546.000000    69.402310    36.637752    32.764587     0.599947   
std     893.016517    51.475036    34.263644    29.479328     0.175131   
min       0.000000     5.348600     2.489400     0.000000     0.043100   
25%     773.000000    35.930400    15.433900    13.088800     0.480750   
50%    1546.000000    58.704300    26.660500    26.790700     0.600700   
75%    2319.000000    86.596100    45.417400    43.187000     0.715900   
max    3092.000000   495.824300   491.611700   326.471400     1.000000   

             P_Value  
count   3.093000e+03  
mean    1.573672e-01  
std     2.850999e-01  
min    3.997539e-151  
25%     1.689218e-33  
50%     3.433851e-08  
75%     1.711039e-01  
max     9.995230e-01  
Significant Improvements Count: 2139
