# Project Notebook - Advanced
**Effect of Feedback on Cognitive & Affective Measures**

Group: Data Minions

This notebook builds a trial-level dataset from PSY, EEG, and GSR data, performs visualization and ANOVA, and packages results for submission.

In [None]:
# Step 1: Mount Google Drive and unzip dataset (overwrite if exists)
# google.colab is imported here, so this cell is expected to run inside Colab.
from google.colab import drive
# Mount Drive at /content/drive so the archive path used below is reachable.
drive.mount('/content/drive')

# Create the target folder (-p: no error if it already exists), then extract
# the archive into it; unzip's -o flag overwrites existing files without prompting.
!mkdir -p /content/data
!unzip -o "/content/drive/MyDrive/datasets.zip" -d /content/data

# NOTE(review): this message is printed unconditionally; it does not check
# the exit status of the unzip command above.
print('Files extracted to /content/data')


In [None]:
# Step 2: Imports and plotting settings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import os

# Apply seaborn's 'whitegrid' style to the figures drawn in this notebook.
sns.set(style='whitegrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


In [None]:
# Step 3: Load CSV files (update paths if needed)
base = "/content/data"

# List the data directory once, keeping regular files only, in sorted order.
# os.listdir() returns entries in arbitrary order, so without sorting the
# "first matching file" could differ between runs when several names match;
# directories are skipped because pd.read_csv cannot open them.
data_files = sorted(
    fn for fn in os.listdir(base) if os.path.isfile(os.path.join(base, fn))
)


def find_data_files(keyword):
    """Return full paths of files in `base` whose lowercased name contains `keyword`."""
    return [os.path.join(base, fn) for fn in data_files if keyword in fn.lower()]


# Try multiple possible filenames present in the dataset zip
psy_paths = find_data_files('psy')
gsr_paths = find_data_files('gsr')
eeg_paths = find_data_files('eeg')
# 'surv' also matches 'survey' and the misspelt 'pre-survery'
survey_paths = find_data_files('surv')

print('Found files:','\n PSY:', psy_paths, '\n GSR:', gsr_paths, '\n EEG:', eeg_paths, '\n Survey:', survey_paths)

# Load first matching file or raise helpful error
for label, paths in [('PSY', psy_paths), ('GSR', gsr_paths), ('EEG', eeg_paths)]:
    if len(paths) == 0:
        raise FileNotFoundError(f'{label} file not found in {base}. Please check filename.')
    if len(paths) > 1:
        # Make the choice visible instead of silently picking one of several candidates.
        print(f'Note: {len(paths)} {label} files matched; using {paths[0]}')

psy = pd.read_csv(psy_paths[0])
gsr = pd.read_csv(gsr_paths[0])
eeg = pd.read_csv(eeg_paths[0])
# The survey file is optional: fall back to an empty frame when none is found.
survey = pd.read_csv(survey_paths[0]) if len(survey_paths)>0 else pd.DataFrame()

print('\nPSY shape:', psy.shape)
print('GSR shape:', gsr.shape)
print('EEG shape:', eeg.shape)
print('Survey shape:', survey.shape)


In [None]:
# Step 4: Quick preview and info
pd.set_option('display.max_columns', 40)

# Column overview: every PSY column, but only the first 15 of GSR and EEG.
print('\nPSY columns:', list(psy.columns))
print('\nGSR columns (sample):', list(gsr.columns)[:15])
print('\nEEG columns (sample):', list(eeg.columns)[:15])

# Show the first rows of each table; the survey table is included only when it has data.
previews = [
    ('\n--- PSY head ---', psy),
    ('\n--- EEG head ---', eeg),
    ('\n--- GSR head ---', gsr),
]
if not survey.empty:
    previews.append(('\n--- Survey head ---', survey))

for banner, frame in previews:
    print(banner)
    display(frame.head())


In [None]:
# Step 5: Cleaning & standardizing column names
# Lowercase and strip
psy.columns = psy.columns.str.strip().str.lower()
gsr.columns = gsr.columns.str.strip().str.lower()
eeg.columns = eeg.columns.str.strip().str.lower()

# Common fixes if needed
# If PSY has no 'routinestart' column, adopt the first alternative name present.
# Names are compared in lowercase because the columns were lowercased above.
# 'routineend' is deliberately not in this list: it is the trial end, not an
# alias for the start time, and renaming it would destroy the end column.
# Because the rename only happens when 'routinestart' is missing, it can never
# produce two columns with the same name.
if 'routinestart' not in psy.columns:
    for alt_name in ('routinestamp', 'routinestartms'):
        if alt_name in psy.columns:
            psy.rename(columns={alt_name: 'routinestart'}, inplace=True)
            break


def rename_time_column(frame):
    """Rename a timestamp-like column of `frame` to 'unixtime', in place.

    Does nothing when 'unixtime' already exists. Otherwise columns are searched
    by keyword in priority order ('unix', then 'system timestamp', then any
    'timestamp'), so an explicit unix-time column wins over a generic timestamp
    column regardless of column order.

    Returns the original name of the renamed column, or None if nothing changed.
    """
    if 'unixtime' in frame.columns:
        return None
    for keyword in ('unix', 'system timestamp', 'timestamp'):
        for column in frame.columns:
            if keyword in str(column).lower():
                frame.rename(columns={column: 'unixtime'}, inplace=True)
                return column
    return None


# Ensure numeric unix time columns exist in eeg/gsr
# Try to find 'unixtime' or 'system timestamp cal' like columns
for label, frame in [('EEG', eeg), ('GSR', gsr)]:
    renamed_from = rename_time_column(frame)
    if renamed_from is not None:
        print(f"{label}: using column '{renamed_from}' as 'unixtime'")

print('Cleaned column names. Example PSY columns:', psy.columns.tolist()[:20])


In [None]:
# Step 6: Create feedback condition column
import numpy as np

if 'feedback_condition' not in psy.columns:
    # Prefer the explicit feedback timestamp column; otherwise fall back to the
    # first column whose name mentions 'feedback'.
    if 'cat2feedbacktime' in psy.columns:
        source_col = 'cat2feedbacktime'
    else:
        source_col = next((name for name in psy.columns if 'feedback' in name.lower()), None)

    if source_col is None:
        # No usable column: label every trial with the placeholder value.
        psy['feedback_condition'] = 'Unknown'
    else:
        # A missing value in the source column means no feedback on that trial.
        has_feedback = psy[source_col].notna()
        psy['feedback_condition'] = np.where(has_feedback, 'Feedback', 'No Feedback')

print('Feedback values:', psy['feedback_condition'].unique())


In [None]:
# Step 7: Build trial-level dataset by aligning EEG & GSR to PSY trials
# We will use routinestart and routineend if available; else fall back to alternative column names

# Fill in whichever of routinestart / routineend is missing from an alternative
# column. A column that already exists is never overwritten. When several
# alternatives are present, the one listed last wins (same precedence as before).
for target, alternatives in [
    ('routinestart', ['routinestartms', 'routinestartstamp', 'routinestartstampms']),
    ('routineend', ['routineendms', 'routineendstamp']),
]:
    if target in psy.columns:
        continue
    for alt_name in reversed(alternatives):
        if alt_name in psy.columns:
            psy[target] = psy[alt_name]
            break
    else:
        # Fail here with a clear message rather than with a bare KeyError later on.
        raise KeyError(
            f"PSY data has no '{target}' column and none of the alternatives "
            f"{alternatives}. Available columns: {psy.columns.tolist()}"
        )
# If timestamps are strings like '28/03/2023, 17:04:09' try parsing - else assume numeric unix
from dateutil import parser


def safe_to_unix(x):
    """Convert one timestamp value to a float unix time, or NaN if that fails.

    Parameters
    ----------
    x : number, str, or missing value
        * Missing values (None / NaN / NaT) give NaN.
        * Numbers are returned as float unchanged (no unit conversion).
        * Strings containing '/' or ':' are parsed as date-times. Slash-separated
          dates that do not start with a 4-digit year are read day-first, matching
          the '28/03/2023, 17:04:09' format, so '03/04/2023' is 3 April and not
          4 March. Other strings are passed to float().

    Returns
    -------
    float
        Unix time, or NaN when the value is missing or cannot be converted.

    Notes
    -----
    Parsed date-times without a UTC offset are interpreted in the local timezone
    of the machine running the notebook (datetime.timestamp() semantics).
    """
    try:
        if pd.isna(x):
            return np.nan
    except (TypeError, ValueError):
        # e.g. list-like input, where pd.isna() has no single truth value
        return np.nan

    if isinstance(x, (int, float, np.integer, np.floating)):
        return float(x)

    s = str(x).strip()
    try:
        # if contains '/' or ':' -> parse
        if '/' in s or ':' in s:
            # dayfirst is limited to slash dates without a leading year, so that
            # ISO-style 'YYYY-MM-DD' / 'YYYY/MM/DD' strings keep their month/day order.
            day_first = '/' in s and not s[:4].isdigit()
            return parser.parse(s, dayfirst=day_first).timestamp()
        return float(s)
    except (ValueError, OverflowError, TypeError):
        # dateutil's ParserError is a ValueError subclass
        return np.nan

# Create numeric start/end
psy['start_unix'] = psy['routinestart'].apply(safe_to_unix)
psy['end_unix'] = psy['routineend'].apply(safe_to_unix)


def looks_like_milliseconds(values):
    """Return True when the median of `values` is above 1e12, i.e. too large for unix seconds.

    Series.median() skips NaN; an empty or all-NaN series gives NaN and therefore False.
    """
    med = values.median()
    return bool(pd.notna(med) and med > 1e12)


# If unix times are in milliseconds (very large), convert to seconds.
# The same rule is applied to the PSY trial boundaries and to the EEG/GSR sample
# times: rescaling only one side would leave the two on different scales and no
# sample would ever fall inside a trial window.
for bound_col in ['start_unix', 'end_unix']:
    if looks_like_milliseconds(psy[bound_col]):
        psy[bound_col] = psy[bound_col] / 1000.0

for df in [eeg, gsr]:
    if 'unixtime' in df.columns and looks_like_milliseconds(df['unixtime']):
        df['unixtime'] = df['unixtime'] / 1000.0

# Identify EEG band columns
band_keywords = ['delta','theta','alpha','beta','gamma']
band_cols = [c for c in eeg.columns if any(k in c.lower() for k in band_keywords)]

print('Found EEG band columns (sample):', band_cols[:12])

# Build trials
# Column lookups do not change between trials, so resolve them once up front.
eeg_has_time = 'unixtime' in eeg.columns
gsr_has_time = 'unixtime' in gsr.columns

# Choose the GSR signal column: the expected name if present, otherwise the first
# numeric column whose name mentions conductance/GSR. Never fall back to an
# arbitrary first column, which could be a timestamp or an index.
if 'gsr conductance cal' in gsr.columns:
    gsr_value_col = 'gsr conductance cal'
else:
    gsr_value_col = next(
        (
            c for c in gsr.columns
            if c != 'unixtime'
            and ('conductance' in str(c).lower() or 'gsr' in str(c).lower())
            and pd.api.types.is_numeric_dtype(gsr[c])
        ),
        None,
    )


def window_between(frame, has_time, start, end):
    """Return the rows of `frame` whose 'unixtime' lies in [start, end] (inclusive).

    Returns an empty slice with the same columns when the frame has no
    'unixtime' column or when either bound is missing.
    """
    if not has_time or pd.isna(start) or pd.isna(end):
        return frame.iloc[0:0]
    return frame[frame['unixtime'].between(start, end)]


trial_rows = []
for idx, r in psy.iterrows():
    s = r.get('start_unix', np.nan)
    e = r.get('end_unix', np.nan)
    eeg_window = window_between(eeg, eeg_has_time, s, e)
    gsr_window = window_between(gsr, gsr_has_time, s, e)

    # Mean of every EEG band column over the samples inside the trial window.
    band_means = {
        col + '_mean': (eeg_window[col].mean() if len(eeg_window) > 0 else np.nan)
        for col in band_cols
    }

    if gsr_value_col is not None and len(gsr_window) > 0:
        gsr_mean = gsr_window[gsr_value_col].mean()
    else:
        gsr_mean = np.nan

    # A missing verdict stays missing (NaN) instead of being scored as incorrect.
    verdict = r.get('verdict', np.nan)
    if pd.isna(verdict):
        correct = np.nan
    else:
        correct = 1 if str(verdict).upper().strip()=='CORRECT' else 0

    trial_rows.append({
        'trial_index': idx,
        'trial_key': r.get('key', np.nan),
        'difficulty': r.get('difficulty', np.nan),
        'feedback_condition': r.get('feedback_condition', np.nan),
        'correct': correct,
        'response_time': r.get('responsetime', np.nan),
        'gsr_mean': gsr_mean,
        **band_means
    })

trial_df = pd.DataFrame(trial_rows)
print('Trial dataset shape:', trial_df.shape)
if len(trial_df) > 0:
    # Many empty windows usually mean the PSY and sensor clocks are misaligned.
    print('Trials without any GSR value:', int(trial_df['gsr_mean'].isna().sum()), 'of', len(trial_df))
trial_df.head()


In [None]:
# Step 8: Save trial-level features
# Write the per-trial table to CSV without the DataFrame index column.
out_path = '/content/trial_features.csv'
trial_df.to_csv(out_path, index=False)
print('Saved', out_path)


In [None]:
# Step 9: Plots - Accuracy and Response Time by Feedback
import matplotlib.pyplot as plt

# Drop trials whose feedback condition is missing or is the 'Unknown' placeholder.
# The comparison is done on the string form so it also works if the column is categorical.
known_feedback = (
    trial_df['feedback_condition'].notna()
    & (trial_df['feedback_condition'].astype(str) != 'Unknown')
)
plot_df = trial_df[known_feedback].copy()
plot_df['feedback_condition'] = plot_df['feedback_condition'].astype(str)

# Per-condition means; NaN values are skipped by groupby().mean().
acc = plot_df.groupby('feedback_condition')['correct'].mean().reset_index()
rt = plot_df.groupby('feedback_condition')['response_time'].mean().reset_index()


def plot_feedback_bars(table, value_col, title, ylabel, out_file):
    """Draw one bar per feedback condition for `value_col`, save it to `out_file`, and show it.

    When `table` has no rows, nothing is drawn or saved and a short message is
    printed instead.
    """
    if table.empty:
        print('Nothing to plot for:', title)
        return
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(data=table, x='feedback_condition', y=value_col, ax=ax)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    fig.savefig(out_file, bbox_inches='tight')
    plt.show()


plot_feedback_bars(acc, 'correct', 'Accuracy by Feedback', 'Mean Accuracy',
                   '/content/accuracy_feedback.png')
plot_feedback_bars(rt, 'response_time', 'Response Time by Feedback', 'Mean Response Time (s)',
                   '/content/rt_feedback.png')


In [None]:
# Step 10: EEG band means by feedback (if band columns exist)
# 'gsr_mean' also ends with '_mean' but is not an EEG band; without this
# exclusion it would be plotted as the first "band" in the chart below.
band_mean_cols = [c for c in trial_df.columns if c.endswith('_mean') and c != 'gsr_mean']
if len(band_mean_cols)>0:
    band_means_fb = trial_df.groupby('feedback_condition')[band_mean_cols].mean().reset_index()
    # plot only the first 6 band columns to keep figure readable
    to_plot = band_mean_cols[:6]
    # Transpose so each band is a group on the x-axis with one bar per feedback condition.
    ax = band_means_fb.set_index('feedback_condition')[to_plot].T.plot(kind='bar', figsize=(10,5))
    ax.set_title('EEG band means by feedback')
    ax.set_ylabel('Mean value')
    ax.figure.savefig('/content/eeg_feedback.png', bbox_inches='tight')
    plt.show()
else:
    print('No EEG band columns detected to plot.')


In [None]:
# Step 11: GSR mean by feedback
if 'gsr_mean' not in trial_df.columns:
    print('No GSR values available in trial dataset.')
else:
    # One row per feedback condition holding the mean of the per-trial GSR means.
    gsr_fb = trial_df.groupby('feedback_condition')['gsr_mean'].mean().reset_index()

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(data=gsr_fb, x='feedback_condition', y='gsr_mean', ax=ax)
    ax.set_title('GSR mean by feedback')
    ax.set_ylabel('Mean GSR')
    fig.savefig('/content/gsr_feedback.png', bbox_inches='tight')
    plt.show()


In [None]:
# Step 12: ANOVA tests
# Accuracy ANOVA (using logistic proportions may be better but we show group means via ANOVA for demo)
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Ensure feedback_condition is categorical
trial_df['feedback_condition'] = trial_df['feedback_condition'].astype('category')


def run_feedback_anova(outcome, label, out_file, skip_message):
    """Fit `outcome ~ C(feedback_condition)` by OLS and save the type-II ANOVA table.

    Rows with a missing outcome or feedback condition are dropped first. The
    model is only fitted when at least two feedback conditions still have data
    and there are more observations than groups; otherwise `skip_message` is
    printed and (None, None) is returned, because a one-group "comparison" has
    no between-group effect to test.

    Returns
    -------
    (model, table) : the fitted OLS results and the ANOVA DataFrame, or (None, None).
    """
    usable = trial_df[[outcome, 'feedback_condition']].dropna().copy()
    # Plain strings drop category levels that lost all their rows in dropna().
    usable['feedback_condition'] = usable['feedback_condition'].astype(str)
    n_groups = usable['feedback_condition'].nunique()
    if n_groups < 2 or len(usable) <= n_groups:
        print(skip_message)
        return None, None

    model = ols(f'{outcome} ~ C(feedback_condition)', data=usable).fit()
    table = sm.stats.anova_lm(model, typ=2)
    print('\nANOVA - ' + label + '\n', table)
    table.to_csv(out_file)
    return model, table


# ANOVA - Response Time
model_rt, anova_rt = run_feedback_anova(
    'response_time', 'Response Time', '/content/anova_rt.csv',
    'Not enough response_time values for ANOVA',
)

# ANOVA - Accuracy
model_acc, anova_acc = run_feedback_anova(
    'correct', 'Accuracy', '/content/anova_acc.csv',
    'Not enough accuracy values for ANOVA',
)


In [None]:
# Step 13: Final Summary
print('=== SUMMARY ===')
# `acc` and `rt` are created by the plotting step. Only a missing variable is
# tolerated here; any other error is allowed to surface instead of being hidden.
try:
    display(acc)
except NameError:
    print('No accuracy table')

try:
    display(rt)
except NameError:
    print('No response time table')

# Show trial_df head
print('\nTrial-level dataset preview:')
display(trial_df.head())


In [None]:
# Step 14: Package results into a zip for download
import zipfile

# Outputs that earlier steps are expected to have written
required_files = ['/content/trial_features.csv', '/content/accuracy_feedback.png', '/content/rt_feedback.png']
# optional files
optional_files = [
    '/content/eeg_feedback.png',
    '/content/gsr_feedback.png',
    '/content/anova_rt.csv',
    '/content/anova_acc.csv',
]

# Report missing required outputs instead of leaving them out silently.
for f in required_files:
    if not os.path.exists(f):
        print('Warning: expected output is missing and will not be packaged:', f)

files_to_zip = [f for f in required_files + optional_files if os.path.exists(f)]

zip_path = '/content/results.zip'
with zipfile.ZipFile(zip_path, 'w') as z:
    for f in files_to_zip:
        # Store each file under its base name so the archive has no /content/ prefix.
        z.write(f, arcname=os.path.basename(f))
print('Created', zip_path)

# If running in Colab, offer download link
try:
    from google.colab import files
    files.download(zip_path)
except Exception as err:
    # Best effort: outside Colab the import fails, and the download itself can
    # fail too. Neither should stop the notebook, but the reason is shown.
    print('Not in Colab or download not available. Please retrieve /content/results.zip manually.')
    print('Reason:', repr(err))
