<a href="https://colab.research.google.com/github/shane-downs/shane_portfolio/blob/main/DataAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Normal Analysis

### Configure Analysis
___
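This step configures the following options (stored in the `config` dictionary):
* Directory Path `dir_path`
  - path to the directory containing the participant `.csv` files
* Phase Count `phase_count`
  - number of phases in the experiment; the summary step divides each participant's bins evenly across this many phases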

In [None]:
from posixpath import splitext
import csv
import os
# Analysis settings: the directory holding the participant files and the number of phases.
config = dict(dir_path="./dir", phase_count=3)

dir_path = config['dir_path']

# Keep only regular files whose extension is exactly '.csv' (directories and other
# file types are dropped), then order them case-insensitively by file name.
unsorted_paths = [
    entry
    for entry in os.listdir(dir_path)
    if os.path.splitext(entry)[1] == '.csv' and os.path.isfile(os.path.join(dir_path, entry))
]
participant_files = sorted(unsorted_paths, key=str.lower)

print(f"Will perform analysis on {len(participant_files)} participants.")

Will perform analysis on 337 participants.


### Parse Events
___
This step parses each participant's event array.

In [None]:
# Header lines whose full (right-stripped) text is collected into participant['sr']
SR_PREFIXES = ("totalSR:", "srPhase1:", "srPhase2:", "srPhase3:")


def _exclude(participant, reason):
  """Flag `participant` as excluded, record `reason`, and return the participant."""
  participant['excluded'] = True
  participant['exclusion_reason'] = reason
  return participant


def _parse_participant_file(full_path, name):
  """Parse one participant file into a participant dict.

  The file is expected to look like this:
    1. a first line beginning with "Start:"
    2. header lines (``noPhase1:``, ``totalSR:``, ``srPhase1..3:`` are picked up)
    3. a line beginning with "LIST OF EVENTS", followed by one "<code>: <description>"
       line per event type, terminated by a blank line
    4. the events themselves, one "<code>) <time>" pair per line

  Args:
    full_path: path of the file to read.
    name: participant name stored under the "name" key.

  Returns:
    The participant dict. Files that do not follow the layout above (wrong first
    line, missing section, truncated file, no events) are returned with
    ``excluded`` set to True and an ``exclusion_reason``, instead of hanging or crashing.

  Raises:
    ValueError / IndexError: if an event line or the ``noPhase1:`` line is malformed.
  """
  participant = {
      "full_path": full_path,
      "name": name,
      "events": [],
      "event_types": [],
      "excluded": False,
      "no_phase_1": None,
      "num_events": 0,
      "sr": [],
      "summary": [],
  }

  # `with` closes the handle even when parsing raises. The whole remainder of the file
  # is read up front, so no scanning loop can spin forever at end-of-file and no
  # arithmetic on text-mode tell() positions is needed.
  with open(full_path, "r") as file:
    # Ensure proper formatting
    if file.readline(6) != "Start:":
      return _exclude(participant, "File does not begin with 'Start:'")
    lines = file.readlines()  # lines[0] is the rest of the "Start:" line

  # Header: everything up to the line that starts with 'LIST OF EVENTS'
  header_end = next(
      (idx for idx, line in enumerate(lines) if line.startswith("LIST OF EVENTS")), None
  )
  if header_end is None:
    return _exclude(participant, "No 'LIST OF EVENTS' section found")

  for line in lines[1:header_end]:
    if line.startswith("noPhase1:"):
      participant['no_phase_1'] = bool(int(line.rstrip().split()[1]))
    elif line.startswith(SR_PREFIXES):
      participant['sr'].append(line.rstrip())

  # Event types: from the line after 'LIST OF EVENTS' up to the first blank line
  types_start = header_end + 1
  try:
    types_end = lines.index("\n", types_start)
  except ValueError:
    return _exclude(participant, "Event type list is not terminated by a blank line")

  # maxsplit=1 keeps every entry a [code, description] pair even when the
  # description itself contains ': '
  participant['event_types'] = [
      line.rstrip().split(": ", 1) for line in lines[types_start:types_end]
  ]

  # Events: every remaining line containing ')'. Blank lines and any other lines
  # between the type list and the first event are skipped.
  for line in lines[types_end + 1:]:
    if ")" in line:
      fields = line.split()
      # '02)' -> '02'; the second field is the event time
      participant['events'].append((fields[0][:-1], int(fields[1])))

  participant['num_events'] = len(participant['events'])
  if not participant['events']:
    return _exclude(participant, "No events found")

  return participant


participants = []

for participant_file in participant_files:
  full_path = os.path.join(dir_path, participant_file)
  # Strip the 4-character '.csv' extension to get the participant name,
  # then add the participant dict to the master participants list
  participants.append(_parse_participant_file(full_path, participant_file[:-4]))

print(f"{len(participants)} participant files parsed into {sum(len(participant['events']) for participant in participants)} total events!")

337 participant files parsed into 525684 total events!


### Break into bins
___
This step will create a `participant_summary` for each participant.

In [None]:
def _first_event_time(events, event_type, default=None):
  """Return the time of the first event of type `event_type` in `events`, or `default`."""
  return next((time for (evt_type, time) in events if evt_type == event_type), default)


for participant in participants:
  # Sort by time first, so the marker lookups below always see the earliest
  # occurrence and re-running this cell gives the same result.
  participant['events'].sort(key=lambda event: event[1])
  events = participant['events']

  # Code of the event that closes a bin: looked up by its description, with "38" as
  # the fallback. Rows that lack a description field are ignored.
  eob_evt = next(
      (ev[0] for ev in participant['event_types'] if len(ev) > 1 and ev[1] == "End of bin"),
      "38",
  )

  # Time of the first "35" event (0 when there is none); events at or before it are not binned
  p1_start = _first_event_time(events, "35", 0)

  # Times of the first "30", "31" and "99" events. Later cells read the first two as
  # the end of Phase 1 and the end of Phase 2.
  markers = [_first_event_time(events, code) for code in ("30", "31", "99")]
  if None in markers:
    # Without all three markers the phases cannot be located. Flag the participant
    # (keeping any reason recorded earlier) and fall back to 0 so later cells still run.
    participant['excluded'] = True
    participant.setdefault('exclusion_reason', "Missing phase marker event(s) 30/31/99")

  participant_summary = {
      "name": participant['name'],
      "phase_start_times": (0, 0, 0),
      "phases_offset": tuple(0 if time is None else time for time in markers),
      "bins": [],
  }

  # Break the participant up into bins: each bin holds every event up to and
  # including an end-of-bin event. Events after the last end-of-bin event are
  # not part of any bin.
  current_bin = []
  for evt in events:
    if evt[1] <= p1_start:
      continue
    current_bin.append(evt)
    if evt[0] == eob_evt:
      participant_summary['bins'].append(current_bin)
      current_bin = []

  participant['summary'] = participant_summary

# participant = participants[0]['summary']
# bins = participant['bins']
# t = list(map(lambda evtList: list(filter(lambda evt: evt[0] == "01", evtList)), bins))
# print(f"Target: {list(map(lambda l: len(l), t))}")
# for x in t:
#   print(x)

### Latencies
---
This section calculates each participant's Phase 3 latency for target responses: the time between the start of Phase 3 and the first target response made during Phase 3.

In [None]:
for participant in participants:
  # Phase 3 starts at the second stored phase offset
  p3_start = participant['summary']['phases_offset'][1]

  # Earliest target response ("01") strictly after the start of Phase 3, if any
  first_tr_p3_time = min(
      (evt_time for (evt_type, evt_time) in participant['events']
       if evt_type == "01" and evt_time > p3_start),
      default=None,
  )

  # Latency is the gap to that response divided by 1000, or "none" without a response
  if first_tr_p3_time is None:
    p3_latency = "none"
  else:
    p3_latency = (first_tr_p3_time - p3_start) / 1000

  participant['summary']['p3_latency'] = p3_latency

print("Calculated Phase 3 latencies.")

Calculated Phase 3 latencies.


### Exclude Participants
___
This step will exclude certain participants according to given criteria.

In [None]:
TARGET_RESPONSE = "01"
ALT_RESPONSE = "02"
ONE_MINUTE = 60000    # event times are compared against these window lengths
TWO_MINUTES = 120000


def _count_events(events, event_types, start, end):
  """Count the events whose type is in `event_types` and whose time is in [start, end]."""
  return sum(
      1 for (evt_type, evt_time) in events
      if evt_type in event_types and start <= evt_time <= end
  )


for participant in participants:
  events = participant['events']
  phase_1_end, phase_2_end = participant['summary']['phases_offset'][:2]

  # Every criterion that fires is recorded, so a later criterion no longer
  # overwrites the reason given by an earlier one.
  reasons = []

  # Exclusion Criterion #1
  # ----------------------
  # There are zero target and zero alt responses in the last 2 minutes of Phase 1.
  p1_window_start = max(phase_1_end - TWO_MINUTES, 0)
  if _count_events(events, (TARGET_RESPONSE, ALT_RESPONSE), p1_window_start, phase_1_end) == 0:
    reasons.append("Zero TR & AR in last 2 mins of P1")

  # Exclusion Criterion #2
  # ----------------------
  # There are zero target and zero alt responses in the last 2 minutes of Phase 2.
  # The window is clamped to the end of Phase 1 so that a Phase 2 shorter than the
  # window never counts Phase 1 responses.
  p2_window_start = max(phase_2_end - TWO_MINUTES, phase_1_end)
  if _count_events(events, (TARGET_RESPONSE, ALT_RESPONSE), p2_window_start, phase_2_end) == 0:
    reasons.append("Zero TR & AR in last 2 mins of P2")

  # Exclusion Criterion #3
  # ----------------------
  # Target responding has not decreased to 50% of the P1 levels,
  # comparing the last minute of Phase 2 with the last minute of Phase 1.
  last_min_tr_p1 = _count_events(
      events, (TARGET_RESPONSE,), max(phase_1_end - ONE_MINUTE, 0), phase_1_end
  )
  last_min_tr_p2 = _count_events(
      events, (TARGET_RESPONSE,), max(phase_2_end - ONE_MINUTE, phase_1_end), phase_2_end
  )
  if last_min_tr_p2 >= (0.5 * last_min_tr_p1):
    reasons.append(
        f"Target responding has not decreased to 50% of P1 levels. P1 level (last min): {last_min_tr_p1}, P2 level (last min): {last_min_tr_p2}"
    )

  if reasons:
    participant['excluded'] = True
    participant['exclusion_reason'] = "; ".join(reasons)


excluded_participants = [participant for participant in participants if participant['excluded']]
print(f"There were {len(excluded_participants)} participants excluded.")
print("*** NOTE *** This number may be off due to the increased phase duration because of the consummatory response.")

There were 34 participants excluded.
*** NOTE *** This number may be off due to the increased phase duration because of the consummatory response.


### Create Summary
___
This step actually produces the output file based on the bin breakdown of each participant.

In [None]:
out_path = os.path.join(".", "out", "summary.csv")

# exist_ok=True makes this a no-op when the directory already exists. Any other
# OSError is a real problem and is allowed to surface here rather than being swallowed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)


def _label_row(label, width):
  """Return a row with `label` in the first column and blanks in the other `width - 1`."""
  return [label if col == 0 else "" for col in range(width)]


def _count_in_bin(participant, bin_idx, event_key):
  """Count events of type `event_key` in the participant's bin `bin_idx`.

  Returns "" (an empty cell) when the participant has no such bin.
  """
  bins = participant['summary']['bins']
  if bin_idx >= len(bins):
    return ""
  return sum(1 for evt in bins[bin_idx] if evt[0] == event_key)


# The summary has one column per participant. newline="" is what the csv module
# expects, so the writer controls line endings itself (no blank rows on Windows).
with open(out_path, "w", newline="") as out_file:
  writer = csv.writer(out_file)
  width = len(participants)

  # Participant names
  writer.writerow([participant['name'] for participant in participants])

  # Excluded? Participants flagged without a recorded reason get an empty reason cell.
  writer.writerow(["Excluded" if participant['excluded'] else "Good" for participant in participants])
  writer.writerow([
      participant.get('exclusion_reason', "") if participant['excluded'] else ""
      for participant in participants
  ])

  # The first participant defines the number of bins and the list of event types
  num_bins = len(participants[0]['summary']['bins']) if participants else 0
  bins_per_phase = num_bins // config['phase_count']
  event_types = participants[0]['event_types'] if participants else []

  # Write events: one section per event type, one sub-section per phase, one row per bin
  for type_entry in event_types:
    # Tolerate rows that were not split into exactly [code, description]
    key, evt_type = type_entry[0], ": ".join(type_entry[1:])

    # Blank
    writer.writerow([])
    writer.writerow(_label_row(evt_type, width))

    for phase in range(config['phase_count']):
      writer.writerow(_label_row(f"Phase {phase + 1}", width))

      for bin_in_phase in range(bins_per_phase):
        total_bin_idx = phase * bins_per_phase + bin_in_phase
        writer.writerow([_count_in_bin(participant, total_bin_idx, key) for participant in participants])

  # Write Phase Durations
  # NOTE: 'phases_offset' is indexed by phase number, so it must hold at least
  # config['phase_count'] entries.
  writer.writerow(_label_row("Phase Durations", width))
  writer.writerow([])
  for phase in range(config['phase_count']):
    writer.writerow(_label_row(f"Phase {phase + 1}", width))
    current_phase_durations = []
    for participant in participants:
      offsets = participant['summary']['phases_offset']
      # A phase runs from the previous offset (0 for the first phase) to its own offset
      phase_start = offsets[phase - 1] if phase >= 1 else 0
      phase_end = offsets[phase]
      current_phase_durations.append((phase_end - phase_start) / 1000)
    writer.writerow(current_phase_durations)

  # Write latencies
  writer.writerow([])
  writer.writerow(_label_row("Phase 3 Latencies", width))
  writer.writerow([participant['summary']['p3_latency'] for participant in participants])

print(f"Summary file available at {out_path}")

Summary file available at ./out/summary.csv


# Other Analyses
---

This section was created to perform other analyses not included elsewhere.

### Reinforcement Count by Minute
---
_Created: May 17th, 2023 - Matthew Lamperski_ \
_Last Edited: May 17th, 2023 - Matthew Lamperski_

Description: This script produces a breakdown (`data.csv`, written to the `out_path` directory, `../datalab/out/`) of the number of reinforcements (event #17 or #18) during each minute of the given experiment. The input directory is set with the `dir_path` variable at the top of the cell.

Notes: Make sure to have the data files uploaded in the correct place. `.` refers to the folder that you can open and upload to by pressing the folder icon on the left toolbar.

In [None]:
import os, tabulate, csv

dir_path = '../datalab/data6.21-Group7.8/'

MINUTE_MS = 60000                 # width of one per-minute window
REINFORCEMENT_CODES = (17, 18)    # event codes counted as reinforcements
MINUTES_PER_PHASE = (5, 5, 4)     # number of one-minute windows reported for P1, P2, P3


def _load_events(path):
    """Read the event array of one participant file as a list of (code, time) int tuples.

    The array is taken to start two lines after the '99: End of session' line and to
    run until the next blank line (or the end of the file). Returns None when the
    '99: End of session' line is missing.
    """
    with open(path) as participant_file:   # closed as soon as the lines are read
        lines = participant_file.readlines()

    try:
        events_idx = lines.index('99: End of session\n') + 2  # index of first event in array
    except ValueError:
        return None

    try:
        events_end = lines.index('\n', events_idx)  # first blank line after the events
    except ValueError:
        events_end = len(lines)

    events = []
    for evt_str in lines[events_idx:events_end]:
        fields = evt_str.split()
        # The first two characters of the first field are the event code
        events.append((int(fields[0][0:2]), int(fields[1])))
    return events


def _first_time_of(events, code):
    """Return the time of the first event with `code`, or None when there is none."""
    return next((time for (evt_code, time) in events if evt_code == code), None)


def _reinforcements_per_minute(events, phase_start, phase_end, n_minutes):
    """Count reinforcements in each of the first `n_minutes` one-minute windows of a phase.

    Windows are half-open, [start, end), and are cut off at `phase_end`, so an event
    that falls exactly on a window boundary is counted once, not in both neighbours.
    """
    counts = []
    for minute in range(n_minutes):
        window_start = phase_start + MINUTE_MS * minute
        window_end = min(window_start + MINUTE_MS, phase_end)
        counts.append(sum(
            1 for (code, time) in events
            if code in REINFORCEMENT_CODES and window_start <= time < window_end
        ))
    return counts


# Sorted so the rows come out in the same order on every run (os.listdir order is arbitrary)
participant_files = sorted(os.path.join(dir_path, rel_path) for rel_path in os.listdir(dir_path))
data = []

for participant_file_path in participant_files:
    # Check to make sure files analyzed are participant files
    if not (os.path.isfile(participant_file_path) and participant_file_path[-4:] == '.csv'):
        continue

    evts = _load_events(participant_file_path)

    # Phase boundaries: first event, then the first 30, 31 and 99 events.
    # Phase k runs from boundaries[k] to boundaries[k + 1].
    boundaries = None
    if evts:
        boundaries = [evts[0][1]] + [_first_time_of(evts, code) for code in (30, 31, 99)]
    if not evts or None in boundaries:
        print(f"Skipping {participant_file_path}: event array or phase markers (30/31/99) not found")
        continue

    # Get num of reinforcements per minute, phase by phase
    minutes = []
    for phase_idx, n_minutes in enumerate(MINUTES_PER_PHASE):
        minutes.extend(_reinforcements_per_minute(
            evts, boundaries[phase_idx], boundaries[phase_idx + 1], n_minutes
        ))

    data.append([os.path.basename(participant_file_path)[:-4], *minutes])

# Write to data.csv
out_path = "../datalab/out/"
os.makedirs(out_path, exist_ok=True)  # the output directory may not exist yet
cols = ["Participant", "P1.1", "P1.2", "P1.3", "P1.4", "P1.5", "P2.1", "P2.2", "P2.3", "P2.4", "P2.5", "P3.1", "P3.2", "P3.3", "P3.4"]
# newline='' lets the csv writer control line endings itself
with open(os.path.join(out_path, 'data.csv'), 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(cols)
    writer.writerows(data)

table = tabulate.tabulate(data, tablefmt='grid', headers=cols)
print(table)

+------------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
| Participant      |   P1.1 |   P1.2 |   P1.3 |   P1.4 |   P1.5 |   P2.1 |   P2.2 |   P2.3 |   P2.4 |   P2.5 |   P3.1 |   P3.2 |   P3.3 |   P3.4 |
| Participant036-7 |      9 |     13 |     10 |     11 |     12 |     12 |      7 |      9 |      4 |     10 |     13 |     13 |     11 |      0 |
+------------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
| Participant034-7 |     18 |     21 |     24 |     20 |     23 |     19 |     19 |     21 |     20 |     17 |     15 |     21 |     18 |      0 |
+------------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
| Participant034-8 |     14 |     14 |     12 |     18 |     19 |      9 |     12 |      1 |     17 |     19 |     12 

In [None]:
# Calls google.colab's drive.mount with '/content/drive' as the mount point.
# NOTE(review): presumably this only works inside a Google Colab runtime — confirm.
# NOTE(review): none of the paths used in the cells above ('./dir', './out',
# '../datalab/...') are under '/content/drive'; if the data lives on Drive, this
# cell likely needs to run before the analysis cells — confirm.
from google.colab import drive
drive.mount('/content/drive')