In [27]:
import numpy as np
import os
import re
import pandas as pd
import matplotlib.pyplot as plt 
import mne 
from mne.stats import permutation_cluster_test
import sklearn
import spacy
from fastcoref import spacy_component
nlp = spacy.load('en_core_web_lg')
import syllapy
import random

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder

1) Decimate by 12, then band-pass filter the data (1–30 Hz). Note: `read_meg` below resamples to 100 Hz and filters at 0.1–30 Hz.

In [28]:
# Turn off MNE's verbose logging for the rest of the notebook.
mne.set_log_level(verbose=False)

In [29]:
def read_meg(meg_path, event_path):
    """
    Load a CTF MEG recording together with its word-onset events.

    The recording is restricted to magnetometer channels, resampled to
    100 Hz and band-pass filtered (0.1-30 Hz, IIR). The events table is
    reduced to rows whose 'type' contains 'word_onset' and whose 'value'
    is not 'sp'.

    Parameters
    ----------
    meg_path : str
        Path handed to ``mne.io.read_raw_ctf``.
    event_path : str
        Path to a tab-separated events file containing at least the
        columns 'type' and 'value'.

    Returns
    -------
    raw
        The preprocessed raw object returned by ``mne.io.read_raw_ctf``.
    df_crop : pandas.DataFrame
        The filtered events table (original index is kept).
    """
    raw = mne.io.read_raw_ctf(meg_path, preload=False)
    # Select channels while the data is still on disk so that only the
    # magnetometers are read into memory by load_data().
    raw.pick(picks=['mag'])
    # Load explicitly before resampling/filtering, which modify the data
    # in memory.
    raw.load_data()
    raw.resample(100)
    raw.filter(0.1, 30, method='iir')

    df = pd.read_csv(event_path, delimiter='\t')
    # na=False: rows with a missing 'type' are treated as non-matching.
    df_crop = df[df['type'].str.contains('word_onset', na=False)]
    df_crop = df_crop.query("value != 'sp'")
    return raw, df_crop


In [30]:
def session_text(root_dir, session, pat_id):
    """
    Concatenate the stimulus text files belonging to one session.

    Files are looked up in ``<root_dir>/stimuli`` and must be named
    ``<session>_<digit>.txt``. For session "08" of patient "03" only the
    files numbered 1-6 are used. Matching files are read in sorted
    filename order, newlines are replaced by spaces, and the pieces are
    joined without any extra separator.

    Parameters
    ----------
    root_dir : str
        Directory that contains the ``stimuli`` sub-directory.
    session : str
        Session identifier used as the filename prefix (e.g. "01").
    pat_id : str
        Patient identifier; only used to select the special case above.

    Returns
    -------
    str
        The concatenated text, or "" when no file matches.
    """
    # Raw f-strings keep the regex escapes (\d, \.) out of Python's own
    # string-escape processing.
    if session == "08" and pat_id == "03":
        pattern = re.compile(rf'{session}_[1-6]\.txt')
    else:
        pattern = re.compile(rf'{session}_\d\.txt')

    stimuli_dir = os.path.join(root_dir, "stimuli")
    pieces = []
    # os.listdir gives no ordering guarantee; sort so the text is always
    # assembled in the same (part-number) order.
    for filename in sorted(os.listdir(stimuli_dir)):
        if pattern.match(filename):
            file_path = os.path.join(stimuli_dir, filename)
            # NOTE(review): the platform default encoding is used here, as
            # before; downstream token handling may depend on it.
            with open(file_path, 'r') as file:
                pieces.append(file.read().replace("\n", " "))
    return "".join(pieces)

In [31]:
def has_accented_characters(token):
    """
    Report whether ``token.text`` contains any non-ASCII character
    (code point above 127).
    """
    return any(ord(symbol) > 127 for symbol in token.text)

In [32]:
def find_accent(token):
    """
    Return the index of the first non-ASCII character (code point above
    127) in ``token.text``, or None when every character is ASCII.
    """
    non_ascii_positions = (
        position
        for position, symbol in enumerate(token.text)
        if ord(symbol) > 127
    )
    return next(non_ascii_positions, None)

In [33]:
def pos_tagger(text, ses_id, pat_id):
    """
    Tokenise ``text`` with the module-level spaCy pipeline ``nlp`` and
    return part-of-speech tags with the matching word strings.

    Rules applied to each spaCy token:
    - whitespace-only tokens and punctuation are dropped;
    - for session "01" of patient "03" the first 10 tokens are skipped;
    - a token containing a non-ASCII character keeps only the text that
      follows the first such character (dropped entirely if nothing
      follows). Does not work on words with more than 1 accent;
    - clitics ("n't", "'ll", "'ve", "'m", "'d", "'t", "'s", "'re") are
      glued onto the previous kept word and contribute no tag of their
      own;
    - every other token is kept unchanged.

    Parameters
    ----------
    text : str
        Text to tag.
    ses_id : str
        Session identifier (only used for the special case above).
    pat_id : str
        Patient identifier (only used for the special case above).

    Returns
    -------
    pos : list of str
        ``token.pos_`` for every kept word.
    tokens : list of str
        The kept word strings, aligned one-to-one with ``pos``.
    """
    clitics = {"n't", "'ll", "'ve", "'m", "'d", "'t", "'s", "'re"}
    skip_leading = ses_id == "01" and pat_id == "03"

    doc = nlp(text)
    pos = []
    tokens = []

    for token in doc:
        if skip_leading and token.i < 10:
            continue
        if token.text.strip() == "" or token.is_punct:
            continue

        if has_accented_characters(token):
            # Separate name so the `text` argument is not overwritten.
            remainder = token.text[find_accent(token) + 1:]
            if remainder != "":
                pos.append(token.pos_)
                tokens.append(remainder)
        elif token.text.lower() in clitics and tokens:
            # Attach the clitic to the preceding word. When there is no
            # preceding word yet, fall through and keep it as its own
            # word instead of raising IndexError.
            tokens[-1] += token.text
        else:
            pos.append(token.pos_)
            tokens.append(token.text)

    return pos, tokens

In [34]:
def create_epochs(df, raw):
    """
    Build one epoch per row of ``df`` around its onset.

    Parameters
    ----------
    df : pandas.DataFrame
        Events table with an 'onset' column in seconds; it is also
        attached to the epochs as metadata.
    raw
        MNE raw object providing the data and ``raw.info['sfreq']``.

    Returns
    -------
    mne.Epochs
        Epochs spanning -2.0 s to +2.0 s around each onset, with the
        baseline interval set to the whole window. Data is not preloaded.
    """
    sfreq = raw.info['sfreq']
    # Round to the nearest sample: plain integer casting truncates, so a
    # product such as 0.29 * 100 == 28.999999999999996 would land on
    # sample 28 instead of 29.
    # NOTE(review): onsets are converted without adding raw.first_samp -
    # confirm this matches how the onsets were measured.
    word_samples = np.round(np.asarray(df['onset'], dtype=float) * sfreq).astype(int)

    # MNE events array: column 0 is the sample index; the other two
    # columns are left at zero, so every event shares event id 0.
    word_events = np.zeros([len(word_samples), 3], dtype='int')
    word_events[:, 0] = word_samples

    epochs = mne.Epochs(raw, word_events, tmin=-2.0, tmax=2.0, baseline=(-2.0, 2.0), preload=False, metadata=df)
    return epochs

In [35]:
def run_LR_model(epochs, labels, pipeline, n_splits=5, random_state=None):
    """
    Score a classifier separately at every index of the last axis of
    ``epochs`` using stratified k-fold cross-validated ROC AUC.

    Parameters
    ----------
    epochs : numpy.ndarray
        3-D array; the first axis indexes observations and the model is
        fit independently on ``epochs[:, :, i]`` for every ``i``
        (presumably (n_epochs, n_channels, n_times) - confirm with caller).
    labels : pandas.DataFrame
        One binary target per column. Column labels may be plain names or
        tuples (as produced when the columns form a MultiIndex); for
        tuples the first element is used as the name.
    pipeline : estimator
        Object with ``fit`` and ``predict_proba``; it is refit on every
        fold, so it is left in the state of the last fit.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed for the shuffled fold assignment. None keeps the previous
        unseeded behaviour; pass an int for reproducible scores.

    Returns
    -------
    pandas.DataFrame
        One column per label and one row per index of the last axis of
        ``epochs``, holding the mean AUC over the folds.
    """
    # One splitter is enough: every split() call re-draws the folds when
    # random_state is None and reproduces them when it is an int.
    skf = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    n_steps = epochs.shape[2]

    scores_by_label = {}
    for column in labels.columns:
        name = column[0] if isinstance(column, tuple) else column
        y = labels[name].to_numpy().ravel()

        auc_score = []
        for i in range(n_steps):
            X = epochs[:, :, i]
            scores = np.zeros(n_splits)
            for j, (train_index, test_index) in enumerate(skf.split(X, y)):
                pipeline.fit(X[train_index], y[train_index])
                # Column 1 of predict_proba is the probability of the
                # second class in the estimator's class ordering.
                probabilities = pipeline.predict_proba(X[test_index])[:, 1]
                scores[j] = roc_auc_score(y[test_index], probabilities)
            auc_score.append(scores.mean())
        scores_by_label[name] = auc_score

    return pd.DataFrame(scores_by_label)

In [39]:
def analysis_1(root_dir, save_dir, ses_ids, pat_id, trial):
    """
    Decode pronouns versus character proper nouns for one patient and
    write the resulting scores to a CSV file.

    For every session in ``ses_ids`` the saved epochs are loaded, each
    epoch is tagged with a part of speech from the session's stimulus
    text, and the epochs tagged PRON, or tagged PROPN with a 'value'
    equal to one of the upper-cased character names, are kept. The kept
    epochs of all sessions are pooled, the POS labels are one-hot
    encoded, and ``run_LR_model`` is run with a standardise +
    logistic-regression pipeline.

    Parameters
    ----------
    root_dir : str
        Data root; epochs are read from
        ``{root_dir}/sub_0{pat_id}/ses_0{ses_id}/clean-epo.fif``.
    save_dir : str
        Directory for the output CSV; created if it does not exist.
    ses_ids : iterable of str
        Session identifiers to pool.
    pat_id : str
        Patient identifier.
    trial : int or str
        Tag that is embedded in the output filename.

    Returns
    -------
    None
        Scores are written to
        ``df_scores_pt_{pat_id}_trial_{trial}.csv`` inside ``save_dir``.
    """
    characters = ['Holmes', 'Watson', 'Doctor', 'McCarthy', 'Doran', 'Turner',
                  'Simon', 'Ryder', 'Stoner', 'Adler', 'Wilson', 'Angel', 'Lestrade']
    upper_chars = [name.upper() for name in characters]

    # Create the output directory up front so a missing folder is not
    # discovered only after the (long) model fitting has finished.
    os.makedirs(save_dir, exist_ok=True)
    out_path = os.path.join(save_dir, f"df_scores_pt_{pat_id}_trial_{trial}.csv")

    all_data = []
    all_labels = []
    for ses_id in ses_ids:
        text = session_text(root_dir, ses_id, pat_id)
        # Epochs come from the previously saved file rather than being
        # rebuilt with read_meg / create_epochs.
        epochs = mne.read_epochs(f"{root_dir}/sub_0{pat_id}/ses_0{ses_id}/clean-epo.fif", preload=False)
        df = epochs.metadata

        print(ses_id, pat_id)
        # The tag list must contain exactly one entry per metadata row.
        df['POS'], _ = pos_tagger(text, ses_id, pat_id)
        mask = df["POS"].isin(["PRON"]) | (df["POS"].isin(["PROPN"]) & df["value"].isin(upper_chars))
        noun_epochs = epochs[mask]
        all_data.append(noun_epochs.get_data())
        all_labels.append(noun_epochs.metadata[["POS"]])

    X = np.concatenate(all_data, axis=0)
    y = pd.concat(all_labels, axis=0)

    enc = OneHotEncoder()
    enc_y = enc.fit_transform(X=y)
    # enc.categories_ is passed as-is for the column labels; run_LR_model
    # takes the first element of each resulting column label as its name.
    label_df = pd.DataFrame(enc_y.toarray(), columns=enc.categories_)

    # C=10e-3 is 0.01.
    pipeline = make_pipeline(StandardScaler(), LogisticRegression(
        random_state=125, max_iter=10000, solver="lbfgs", C=10e-3))

    df_scores = run_LR_model(X, label_df, pipeline)
    df_scores.to_csv(out_path)

In [41]:
# Input data location and output folder for this analysis.
root_dir = r"C:\Users\ricky\OneDrive\Desktop\Datasci125\Data"
save_dir = r"C:\Users\ricky\OneDrive\Desktop\Datasci125\Code\Results\Analysis_1C"

# Zero-padded identifiers: sessions "01".."10", patients "01".."03".
sessions = [f"{number:02d}" for number in range(1, 11)]
patients = [f"{number:02d}" for number in range(1, 4)]

# Fixed seed so the shuffled session order (and hence the pairing) is
# the same on every run.
random.seed(125)
random.shuffle(sessions)

# Pair up neighbours of the shuffled list: (0, 1), (2, 3), ...
random_pairs = list(zip(sessions[0::2], sessions[1::2]))

# Reduced configuration for a quick run:
# patients = ["03"]
# random_pairs = [('01', '04'), ('05', '07')]

# One analysis (and one CSV) per patient and session pair.
for patient in patients:
    for trial, pair in enumerate(random_pairs):
        analysis_1(root_dir, save_dir, pair, patient, trial)

print(random_pairs)

09 01
07 01
01 01
03 01
02 01
06 01
08 01
05 01
10 01
04 01
09 02
07 02
01 02
03 02
02 02
06 02
08 02
05 02
10 02
04 02
09 03
07 03
01 03
03 03
02 03
06 03
08 03
05 03
10 03
04 03
[('09', '07'), ('01', '03'), ('02', '06'), ('08', '05'), ('10', '04')]


Runtime: approximately 6.5 hours per patient.

In [38]:
# NOTE(review): `noun_epochs` is only assigned inside analysis_1, so it
# does not exist at notebook scope and this cell raises NameError.
noun_epochs

NameError: name 'noun_epochs' is not defined

In [None]:
# Display the session pairs built in the driver cell.
random_pairs