<a href="https://colab.research.google.com/github/sina-salmanpour/False_memory_PTSD/blob/main/PTSD_Flase_Memory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Reviewing the entire flow

In [20]:
# Import libraries
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load and Clean Data (example for verbal PTSD)
df = pd.read_excel('ptsd_verbal.xlsx')
# ... (pivot, split response/RT, filter RT > 200 & < 5000, exclude negatives)
df['group'] = 'PTSD'  # Repeat for other files, concatenate

# Step 2: Aggregates — one row per subject x group x emotion x item_type
agg_df = df.groupby(['subject', 'group', 'emotion', 'item_type']).agg(
    rate=('response', 'mean'),  # Mean response (hit / false memory / false alarm, depending on item_type)
    rt_mean=('rt', 'mean')      # Mean RT over every trial in the cell (not restricted to "yes" responses)
).reset_index()

# Every test and plot below works on the critical-lure ('CL') rows only.
lure_df = agg_df[agg_df['item_type'] == 'CL']

# Step 3: Normality (Shapiro-Wilk) per group x emotion; needs at least 3 observations
for group in agg_df['group'].unique():
    for emotion in agg_df['emotion'].unique():
        rt_data = lure_df[(lure_df['group'] == group) & (lure_df['emotion'] == emotion)]['rt_mean']
        if len(rt_data) > 2:
            stat, p = stats.shapiro(rt_data)
            print(f'{group}-{emotion}: Shapiro p={p}')

# Step 4: ANOVA (example for false memory rate)
model = ols('rate ~ C(group) * C(emotion)', data=lure_df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

# Non-Parametric Alternative (Kruskal-Wallis)
# Build the samples from the groups actually present in the data instead of a
# hard-coded label list: a label that is missing from the frame would otherwise
# contribute an empty sample to the test.
nt_lures = lure_df[lure_df['emotion'] == 'NT']
groups = [nt_lures[nt_lures['group'] == g]['rate'] for g in nt_lures['group'].unique()]
groups = [sample for sample in groups if len(sample) > 0]
if len(groups) >= 2:
    stat, p = stats.kruskal(*groups)
    print(f'Kruskal NT: p={p}')
else:
    print(f'Kruskal NT: skipped, {len(groups)} group(s) with NT lure data (need at least 2)')

# Visuals
sns.barplot(data=lure_df, x='emotion', y='rate', hue='group')
plt.title('False Memory Rates')
plt.savefig('false_memory_bar.png')  # For thesis inclusion

FileNotFoundError: [Errno 2] No such file or directory: 'ptsd_verbal.xlsx'

## Stepping through in detail

### Version 1

In [21]:
# 1. Import Libraries
import pandas as pd  # Core for DataFrames, reading CSVs, grouping
import numpy as np   # Numerical operations, e.g., NaN handling
from scipy import stats  # For future normality (Shapiro-Wilk)
import matplotlib.pyplot as plt  # Basic plotting
import seaborn as sns  # Advanced visuals (e.g., boxplots for RT)
import os  # File path handling
import warnings  # Suppress warnings for clean output
# Notebook-wide settings: ignore all warnings, fix the legacy NumPy RNG seed,
# and lift pandas' display limits so frames print without truncation.
warnings.filterwarnings(action='ignore')
np.random.seed(seed=42)

# A value of None removes the column-count cap and lets pandas pick the width.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

print("Libraries imported successfully. Ready for data loading.")

Libraries imported successfully. Ready for data loading.


In [24]:
import pandas as pd
import numpy as np
import os

# Input workbooks, keyed as '<task>_<group>'. process_file() splits the key on
# '_' and writes the two parts into the 'task' and 'group' columns, so the key
# spelling decides the group label that ends up in the data.
DATA_DIR = '/content/drive/MyDrive/AmirFarhang'
files = {
    'verbal_PTSD': f'{DATA_DIR}/ptsd_verbal.xlsx',
    'verbal_Non-PTSD': f'{DATA_DIR}/non_ptsd_verbal.xlsx',
    'verbal_Control': f'{DATA_DIR}/control_verbal.xlsx',
    'video_PTSD': f'{DATA_DIR}/ptsd_video.xlsx',
    'video_Non-PTSD': f'{DATA_DIR}/non_ptsd_visual.xlsx',  # this workbook is named "visual", not "video"
    'video_Control': f'{DATA_DIR}/control_video.xlsx'
}

# Helper: decode one raw cell into its response flag and reaction time
def _extract_response_rt(val):
    """
    Split one raw cell into (response, rt).

    The first character of the cell is read as the response (0 or 1). The
    remaining characters before the decimal point, plus the decimal part, are
    read as the reaction time: 11.5 -> (1, 1.5), 0.734 -> (0, 0.734),
    '15,757' -> (1, 5757.0) because commas are stripped before parsing.

    Args:
        val: Raw cell value (number or string).
    Returns:
        tuple: (response, rt). Both are NaN for missing cells, negative numbers
        and cells whose first character is not 0 or 1. rt alone is NaN when the
        remainder cannot be parsed as a float.
    """
    if pd.isna(val) or (isinstance(val, (int, float)) and val < 0):
        return np.nan, np.nan

    val_str = str(val).replace(',', '')

    # partition() instead of split('.'): a cell containing several dots must
    # yield an unparsable RT, not an unpacking error that aborts the whole file.
    before, _, after = val_str.partition('.')
    after = after or '0'

    # Without a leading 0/1 there is no usable response, hence no usable RT either.
    if not before or before[0] not in '01':
        return np.nan, np.nan
    response = int(before[0])

    rt_str = (before[1:] if len(before) > 1 else '0') + '.' + after
    try:
        rt = float(rt_str)
    except ValueError:
        rt = np.nan

    return response, rt


# Function to process and save one file
def process_file(file_path, file_key):
    """
    Load one workbook, reshape it to long format, decode response/RT and save a '_modified' copy.

    Steps: drop the 'Subject' column if present, melt the per-subject columns
    into rows, label every row with task/group taken from `file_key`, copy the
    second remaining column into 'emotion', map 'answer' to 'item_type', decode
    each cell into 'response' and 'rt', then write '<name>_modified.xlsx' next
    to the input file.

    Args:
        file_path (str): Path to the workbook.
        file_key (str): '<task>_<group>', e.g. 'verbal_PTSD'. Only the first
            underscore separates task from group, so the group may itself
            contain underscores.
    Returns:
        pd.DataFrame | None: Long-format frame, or None when the file does not
        exist or contains no subject columns.
    Raises:
        ValueError: If `file_key` contains no underscore.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}. Skipping.")
        return None

    task_type, separator, group_name = file_key.partition('_')
    if not separator:
        raise ValueError(f"file_key must look like '<task>_<group>', got {file_key!r}")

    df = pd.read_excel(file_path)

    # Drop 'Subject' first: it would otherwise match the 'Sub' prefix below and
    # be treated as a participant column.
    if 'Subject' in df.columns:
        df = df.drop(columns=['Subject'])

    # Participant columns. 'Sub' already covers 'SubP'/'Subh'; the lowercase
    # variants are listed separately because startswith is case-sensitive.
    candidate_cols = [col for col in df.columns if str(col).startswith(('Sub', 'subh', 'subp'))]
    if not candidate_cols:
        print(f"No subject columns found in {file_path}. Skipping.")
        return None

    # Everything else ('answer', the emotion column, ...) is carried along as identifiers.
    id_vars = [col for col in df.columns if col not in candidate_cols]

    # Melt to long: one row per (item, participant)
    df_long = pd.melt(df, id_vars=id_vars, value_vars=candidate_cols,
                      var_name='candidate', value_name='value')

    df_long['group'] = group_name
    df_long['task'] = task_type

    # Verbal and video files alike take the emotion label from the second
    # remaining column. It must be an identifier column to exist in df_long.
    emotion_col = df.columns[1] if len(df.columns) > 1 else None
    if emotion_col is not None and emotion_col in id_vars:
        df_long['emotion'] = df_long[emotion_col]
    else:
        df_long['emotion'] = 'Unknown'  # Fallback

    # Map item_type from 'answer' (flag column)
    if 'answer' in df_long.columns:
        df_long['item_type'] = df_long['answer'].map({1: 'Target', 0: 'New', 'CL': 'Lure', np.nan: 'Unknown'})
    else:
        df_long['item_type'] = 'Unknown'

    # Decode every cell in one pass over the value column (no row-wise DataFrame.apply).
    decoded = pd.DataFrame(
        [_extract_response_rt(raw) for raw in df_long['value']],
        columns=['response', 'rt'],
        index=df_long.index,
        dtype='float64',
    )
    df_long['response'] = decoded['response']
    df_long['rt'] = decoded['rt']

    df_long = df_long.drop(columns=['value'])

    # Build the output name from the path stem so that a non-'.xlsx' input can
    # never resolve to the input path itself and overwrite the source workbook.
    path_stem, _ = os.path.splitext(file_path)
    modified_path = path_stem + '_modified.xlsx'
    df_long.to_excel(modified_path, index=False)
    print(f"Saved modified file: {modified_path}")

    return df_long

# Run every configured workbook through process_file; files that could not be
# loaded come back as None and are left out.
all_data = []
for file_key, path in files.items():
    processed = process_file(path, file_key)
    if processed is None:
        continue
    all_data.append(processed)

# Stack the per-file frames into one table and persist it for the analysis step.
if not all_data:
    print("No files processed.")
else:
    full_df = pd.concat(all_data, ignore_index=True)
    full_df.to_excel('/content/drive/MyDrive/AmirFarhang/full_processed.xlsx', index=False)
    print("All files processed and full_processed saved.")

Saved modified file: /content/drive/MyDrive/AmirFarhang/ptsd_verbal_modified.xlsx
Saved modified file: /content/drive/MyDrive/AmirFarhang/non_ptsd_verbal_modified.xlsx
Saved modified file: /content/drive/MyDrive/AmirFarhang/control_verbal_modified.xlsx
Saved modified file: /content/drive/MyDrive/AmirFarhang/ptsd_video_modified.xlsx
Saved modified file: /content/drive/MyDrive/AmirFarhang/non_ptsd_visual_modified.xlsx
Saved modified file: /content/drive/MyDrive/AmirFarhang/control_video_modified.xlsx
All files processed and full_processed saved.


In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Read the combined long-format table and keep only trials that have both a
# decoded response and a decoded RT.
full_path = '/content/drive/MyDrive/AmirFarhang/full_processed.xlsx'
df = pd.read_excel(full_path).dropna(subset=['response', 'rt'])

print(f"Loaded {len(df)} rows post-NaN filter.")

# Aggregate per subject/group/task/emotion/item_type.
# Each output row is one (candidate, group, task, emotion, item_type) cell.
cell_keys = ['candidate', 'group', 'task', 'emotion', 'item_type']
agg_df = df.groupby(cell_keys).agg(
    n_trials=('response', 'count'),  # Valid trials in the cell
    yes_rate=('response', 'mean'),   # Mean response within the cell
    rt_mean=('rt', 'mean'),          # Mean RT over all retained trials in the cell (responses 0 and 1 alike)
    rt_sd=('rt', 'std')              # Within-cell RT SD; NaN when the cell holds a single trial
).reset_index()
# NOTE(review): an earlier comment described rt_mean as "yes"-only; the
# computation has always covered every retained trial. Confirm which is intended.

# item_type is one of the grouping keys, so a cell's rate is meaningful for
# exactly one measure. The other two stay NaN on purpose: zero-filling them
# would enter as real observations into any later mean/SD over the column.
rate_columns = {'hit_rate': 'Target', 'false_memory_rate': 'Lure', 'false_alarm_rate': 'New'}
for rate_col, rate_item in rate_columns.items():
    agg_df[rate_col] = agg_df['yes_rate'].where(agg_df['item_type'] == rate_item)

# Same column layout as before: keys, n_trials, the three rates, then RT stats.
agg_df = agg_df[cell_keys + ['n_trials', *rate_columns, 'rt_mean', 'rt_sd']]

# Descriptives per group/task/emotion: mean and SD of each rate column and the
# mean of rt_mean. Note that sd_rt is the *mean* of the per-cell rt_sd values,
# not an SD computed across cells.
summary_spec = {
    'mean_hit': ('hit_rate', 'mean'),
    'sd_hit': ('hit_rate', 'std'),
    'mean_false_memory': ('false_memory_rate', 'mean'),
    'sd_false_memory': ('false_memory_rate', 'std'),
    'mean_false_alarm': ('false_alarm_rate', 'mean'),
    'sd_false_alarm': ('false_alarm_rate', 'std'),
    'mean_rt': ('rt_mean', 'mean'),
    'sd_rt': ('rt_sd', 'mean'),
}
descriptives = (
    agg_df
    .groupby(['group', 'task', 'emotion'])
    .agg(**summary_spec)
    .reset_index()
)

print("Descriptives Summary:")
print(descriptives)

# Persist the summary table
descriptives.to_excel('/content/drive/MyDrive/AmirFarhang/descriptives.xlsx', index=False)

# Normality: Shapiro-Wilk on rt_mean for every group x emotion combination,
# separately for Lure and Target rows. Combinations with fewer than three
# values are skipped.
norm_results = []
for group in agg_df['group'].unique():
    in_group = agg_df['group'] == group
    for emotion in agg_df['emotion'].unique():
        in_emotion = agg_df['emotion'] == emotion
        for item in ('Lure', 'Target'):
            selected = in_group & in_emotion & (agg_df['item_type'] == item)
            rt_data = agg_df.loc[selected, 'rt_mean'].dropna()
            if len(rt_data) <= 2:
                continue
            stat, p = stats.shapiro(rt_data)
            norm_results.append(dict(
                group=group,
                emotion=emotion,
                item_type=item,
                shapiro_stat=stat,
                p_value=p,
                normal=p > 0.05,
            ))

norm_df = pd.DataFrame(norm_results)
print("Normality Tests:")
print(norm_df)

norm_df.to_excel('/content/drive/MyDrive/AmirFarhang/normality_results.xlsx', index=False)

# Homogeneity: Levene's test on rt_mean across groups, run once per
# emotion x item_type. A combination is skipped unless every group
# contributes at least two values.
homog_results = []
for emotion in agg_df['emotion'].unique():
    for item in ('Lure', 'Target'):
        in_cell = (agg_df['emotion'] == emotion) & (agg_df['item_type'] == item)
        groups_rt = [
            agg_df.loc[in_cell & (agg_df['group'] == label), 'rt_mean'].dropna()
            for label in agg_df['group'].unique()
        ]
        if any(len(sample) <= 1 for sample in groups_rt):
            continue
        stat, p = stats.levene(*groups_rt)
        homog_results.append(dict(
            emotion=emotion,
            item_type=item,
            levene_stat=stat,
            p_value=p,
            homogeneous=p > 0.05,
        ))

homog_df = pd.DataFrame(homog_results)
print("Homogeneity Tests:")
print(homog_df)

homog_df.to_excel('/content/drive/MyDrive/AmirFarhang/homogeneity_results.xlsx', index=False)

# Both figures are drawn from the Lure rows only.
lure_rows = agg_df[agg_df['item_type'] == 'Lure']

# Bar chart: false memory rate per emotion, one bar per group, SD error bars
fig_rate, ax_rate = plt.subplots(figsize=(10, 6))
sns.barplot(data=lure_rows, x='emotion', y='false_memory_rate', hue='group', errorbar='sd', ax=ax_rate)
ax_rate.set_title('False Memory Rates by Group and Emotion')
ax_rate.set_ylabel('Mean False Memory Rate')
fig_rate.savefig('/content/drive/MyDrive/AmirFarhang/false_memory_bar.png', dpi=300)
plt.show()

# Boxplot: distribution of per-cell mean RT for lures, grouped by group and emotion
# NOTE(review): the parsing cell documents rt as seconds while this axis label
# says ms — confirm the unit before the figure goes into the thesis.
fig_rt, ax_rt = plt.subplots(figsize=(10, 6))
sns.boxplot(data=lure_rows, x='group', y='rt_mean', hue='emotion', ax=ax_rt)
ax_rt.set_title('RT Distribution for Lures by Group and Emotion')
ax_rt.set_ylabel('Mean RT (ms)')
fig_rt.savefig('/content/drive/MyDrive/AmirFarhang/rt_boxplot.png', dpi=300)
plt.show()

print("Initial Analysis complete. Files saved in Drive.")