In [1]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

file_path = 'merged_participant_data.json'

with open(file_path, 'r') as file:
    data = json.load(file)


# Now you can work with the un-JSONed data
# For example, you can access specific values using keys
# print(data) 

### Create long dataframe|

Meeting met Martijn:
Maak een lang dataframe en probeer 'is_detected' te voorspellen met een multilevel regression
Voeg verschillende variabelen toe:
- type patroon
- volgorde van version
- volgorde van dataset
- leereffect centreren door van -17 tot + 17 te laten gaan

In [2]:
new_data = data['participants'][0]

In [3]:
dataset_map = { 'dataset1': {0:'short', 1:'long', 2:'change', 3:'short', 4:'long', 5:'change', 
                            6:'short', 7:'change', 8:'change', 9:'long', 10:'short', 11:'long'}, 
                'dataset2': {0:'long', 1:'change', 2:'change', 3:'short', 4:'change', 5:'short',
                            6:'long', 7:'long', 8:'long', 9:'short', 10:'change', 11:'short'},
                'dataset3': {0:'change', 1:'change', 2:'long', 3:'long', 4:'long', 5:'short',
                            6:'short', 7:'short', 8:'change', 9:'change', 10:'short', 11:'long'},
                'dataset4': {0:'short', 1:'short', 2:'long', 3:'long', 4:'short', 5:'short',
                            6:'long', 7:'change', 8:'long', 9:'change', 10:'change', 11:'change'}
               }

In [4]:
new_data['metadata']['version_sequence'][0]

'version3'

In [5]:
def preprocessing(log_version, data):
  log_version_data = data[log_version]

  # Flatten the nested lists
  flattened_data = [item for sublist in log_version_data for item in sublist]

  df = pd.json_normalize(flattened_data)
  df.rename(columns={'block.id': 'id'}, inplace=True)
  df.rename(columns={'block.xpos': 'xpos'}, inplace=True)
  df.rename(columns={'block.colorID': 'colorID'}, inplace=True)
  df.rename(columns={'block.eventCat': 'eventCat'}, inplace=True)
  df.rename(columns={'block.inititialTime': 'inititialTime'}, inplace=True)
  df.rename(columns={'block.detectionTime': 'detectionTime'}, inplace=True)

  # Assuming 'df' is your dataframe
  df['inititialTime'] = pd.to_datetime(df['inititialTime'])
  df['detectionTime'] = pd.to_datetime(df['detectionTime'])

  # Now you can compute the reaction time as the difference
  df['reactionTime'] = df['detectionTime'] - df['inititialTime']

  # To convert the reaction time from a timedelta to a more usable format (e.g., seconds)
  df['reactionTime_seconds'] = df['reactionTime'].dt.total_seconds()

  #remove the rows where sequenceID is NaN
  df = df.dropna(subset=['sequenceID'])

  return df

In [6]:
df = preprocessing('log_version3', new_data)

In [7]:
# returns a dataframe with all last clicks of the participant

def last_click_to_df(df):

  marked_an_unmarked_clicks = df[(df['source'] == 'user') & ((df['event'] == 'confirmAnomaly') | (df['event'] == 'unmarkAnomaly') | (df['event'] == 'markAsAnomaly'))]
  marked_an_unmarked_clicks = marked_an_unmarked_clicks.sort_values(by =['sequenceID','id','detectionTime'])	

  # i want to remove the rows of drop_duplicates from the marked_an_unmarked_clicks
  no_duplicated_clicks = marked_an_unmarked_clicks.drop_duplicates(subset=['sequenceID', 'id'], keep='last')

  return no_duplicated_clicks

In [8]:
def anomalies_to_df(df):
  anomalies = df[df['event'] == 'anomaly']
  anomalies = anomalies[['sequenceID', 'id', 'eventCat']]
  sorted_anomalies = anomalies.sort_values(by=['sequenceID', 'id'])
  sorted_anomalies.reset_index(drop=True, inplace=True)

  return sorted_anomalies
  

In [9]:
def long_dataframe(last_click, anomalies, detector):

  # unmarked clicks eruit gehaald
  last_click_archive = last_click.copy()

  last_click = last_click[last_click['event'] != 'unmarkAnomaly']
  last_click = last_click[['sequenceID', 'id']]
  last_click.reset_index(drop=True, inplace=True)

  # anomaly frame
  anomalies['is_detected'] = anomalies.apply(lambda row: (row['sequenceID'], row['id']) in zip(last_click['sequenceID'], last_click['id']), axis=1)
  
  #i want to merge the column 'reactionTime_seconds' of last_click_archive with the anomalies df while keeping all rows of anomalies
  anoamlies_reaction_time_df = anomalies.merge(last_click_archive, on=['sequenceID', 'id'], how='left')
  anoamlies_reaction_time_df = anoamlies_reaction_time_df[['sequenceID', 'id', 'eventCat_x', 'is_detected', 'reactionTime_seconds']]
  anoamlies_reaction_time_df.rename(columns={'eventCat_x': 'eventCat'}, inplace=True)

  anoamlies_reaction_time_df['automatic_detection'] = anoamlies_reaction_time_df.apply(lambda row: (row['sequenceID'], row['id']) in zip(detector['sequenceID'], detector['id']), axis=1)

  return anoamlies_reaction_time_df

In [10]:
def events_detector(df_test):
  df_anomaly_detector = df_test[df_test['source'] == 'detector']
  df_anomaly_detector = df_anomaly_detector[['sequenceID', 'id']]
  df_anomaly_detector.sort_values(by=['sequenceID', 'id'], inplace=True)

  return df_anomaly_detector

In [11]:
long_frame = long_dataframe(last_click_to_df(df), anomalies_to_df(df), events_detector(df))


In [12]:
long_frame['version'] = new_data['metadata']['version_sequence'][0]


In [13]:
def df_per_version(version, participant_data):

  df = preprocessing(version, participant_data)
  long_frame = long_dataframe(last_click_to_df(df), anomalies_to_df(df), events_detector(df))

  #version data
  long_frame['version'] = participant_data['metadata']['version_sequence'][int(version[-1])-1]
  long_frame['run_order'] = int(version[-1])
  long_frame['dataset'] = participant_data['metadata']['dataset_sequence'][int(version[-1])-1]

  #pattern data
  def get_pattern(row):
      dataset = row['dataset']
      sequenceID = row['sequenceID']
      return dataset_map.get(dataset, {}).get(sequenceID, None)
    
  long_frame['pattern'] = long_frame.apply(get_pattern, axis=1)

  #participant data
  long_frame['participant'] = participant_data['metadata']['participant_number']
  long_frame['gender'] = participant_data['metadata']['gender']
  long_frame['age'] = participant_data['metadata']['age']
  long_frame['education'] = participant_data['metadata']['education']
  long_frame['experience'] = participant_data['metadata']['experience']

  #survey data
  long_frame['general_difficulty'] = participant_data['survey']['anomaly_difficulty']
  long_frame['automatic_detection_helpful'] = participant_data['survey'].get('automatic_detection_helpful', pd.NA)
  long_frame['history_helpful'] = participant_data['survey']['history_helpful']
  long_frame['difficult_v1'] = participant_data['survey']['difficult_v1']
  long_frame['difficult_v2'] = participant_data['survey']['difficult_v2']
  long_frame['difficult_v3'] = participant_data['survey']['difficult_v3']
  long_frame['difficult_v4'] = participant_data['survey']['difficult_v4']
  long_frame['preference_v1'] = participant_data['survey']['preference_v1']
  long_frame['preference_v2'] = participant_data['survey']['preference_v2']
  long_frame['preference_v3'] = participant_data['survey']['preference_v3']
  long_frame['preference_v4'] = participant_data['survey']['preference_v4']

  # Reset the index to create the 'anomalyID' column and rename it
  long_frame.reset_index(inplace=True)
  long_frame.rename(columns={'index': 'anomalyID'}, inplace=True)

  # Reorder the columns, making 'participant' the first column after the index
  columns = ['participant'] + [col for col in long_frame.columns if col != 'participant']
  long_frame = long_frame[columns]

  version_mapping = {
    'version1': 'window',
    'version2': 'history',
    'version3': 'window+ad',
    'version4': 'history+ad'
  }
    
  # Replace the values in the 'version' column
  long_frame['version'] = long_frame['version'].replace(version_mapping)
  
  return long_frame


In [14]:
def df_per_participant(participant_data):
  df = pd.DataFrame()

  for i in range(1, 5):
    df = df.append(df_per_version(f'log_version{i}', participant_data))

  return df

In [15]:
checken = df_per_participant(new_data)
checken

  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_version(f'log_version{i}', participant_data))


Unnamed: 0,participant,anomalyID,sequenceID,id,eventCat,is_detected,reactionTime_seconds,automatic_detection,version,run_order,...,automatic_detection_helpful,history_helpful,difficult_v1,difficult_v2,difficult_v3,difficult_v4,preference_v1,preference_v2,preference_v3,preference_v4
0,18,0,0.0,82.0,10,False,,False,window+ad,1,...,6,1,3,4,1,2,4,1,3,2
1,18,1,0.0,143.0,10,True,4.318,False,window+ad,1,...,6,1,3,4,1,2,4,1,3,2
2,18,2,0.0,179.0,10,True,19.697,False,window+ad,1,...,6,1,3,4,1,2,4,1,3,2
3,18,3,0.0,212.0,10,True,22.615,False,window+ad,1,...,6,1,3,4,1,2,4,1,3,2
4,18,4,1.0,63.0,10,True,19.983,False,window+ad,1,...,6,1,3,4,1,2,4,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31,18,31,9.0,202.0,10,True,6.862,True,window,4,...,6,1,3,4,1,2,4,1,3,2
32,18,32,11.0,69.0,10,True,10.697,False,window,4,...,6,1,3,4,1,2,4,1,3,2
33,18,33,11.0,118.0,10,True,18.525,True,window,4,...,6,1,3,4,1,2,4,1,3,2
34,18,34,11.0,173.0,9,True,7.613,True,window,4,...,6,1,3,4,1,2,4,1,3,2


In [16]:
def make_ultra_long_dataframe(data):
  df = pd.DataFrame()

  for participant in data['participants']:
    df = df.append(df_per_participant(participant))

  return df

In [17]:
complete_df = make_ultra_long_dataframe(data)
complete_df

#complete_df.to_csv('complete_df.csv', index=False) dit niet meer gebruiken

  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_participant(participant))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_participant(participant))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_version(f'log_version{i}', participant_data))
  df = df.append(df_per_participant(participant))
  df = df.

Unnamed: 0,participant,anomalyID,sequenceID,id,eventCat,is_detected,reactionTime_seconds,automatic_detection,version,run_order,...,automatic_detection_helpful,history_helpful,difficult_v1,difficult_v2,difficult_v3,difficult_v4,preference_v1,preference_v2,preference_v3,preference_v4
0,18,0,0.0,82.0,10,False,,False,window+ad,1,...,6,1,3,4,1,2,4,1,3,2
1,18,1,0.0,143.0,10,True,4.318,False,window+ad,1,...,6,1,3,4,1,2,4,1,3,2
2,18,2,0.0,179.0,10,True,19.697,False,window+ad,1,...,6,1,3,4,1,2,4,1,3,2
3,18,3,0.0,212.0,10,True,22.615,False,window+ad,1,...,6,1,3,4,1,2,4,1,3,2
4,18,4,1.0,63.0,10,True,19.983,False,window+ad,1,...,6,1,3,4,1,2,4,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31,9,31,9.0,202.0,10,False,,False,history+ad,4,...,5,2,4,3,2,1,1,2,3,4
32,9,32,11.0,69.0,10,True,26.727,False,history+ad,4,...,5,2,4,3,2,1,1,2,3,4
33,9,33,11.0,118.0,10,True,18.463,True,history+ad,4,...,5,2,4,3,2,1,1,2,3,4
34,9,34,11.0,173.0,9,True,11.100,True,history+ad,4,...,5,2,4,3,2,1,1,2,3,4


In [18]:
complete_df['participant'] = complete_df['participant'].astype(int)
complete_df['version'] = complete_df['version'].astype(str)

In [19]:
all_metrics = pd.read_csv('all_metrics.csv')

all_metrics

Unnamed: 0,participant,version,recall,precision,F1,TP,FN,FP
0,18,window,0.86,0.46,0.60,31,5,37
1,43,window,0.72,0.39,0.51,26,10,40
2,51,window,0.58,1.00,0.74,21,15,0
3,42,window,0.72,0.38,0.49,26,10,43
4,12,window,0.72,0.68,0.70,26,10,12
...,...,...,...,...,...,...,...,...
207,64,history+ad,0.58,0.84,0.69,21,15,4
208,66,history+ad,0.69,0.73,0.71,25,11,9
209,14,history+ad,0.31,0.50,0.38,11,25,11
210,73,history+ad,0.72,0.48,0.58,26,10,28


In [20]:
all_metrics['participant'] = all_metrics['participant'].astype(int)
all_metrics['version'] = all_metrics['version'].astype(str)

In [26]:
df_merged = pd.merge(complete_df, all_metrics, on=['participant', 'version'], how='left')
df_merged

#remove the rows with participant number 11 and 42
df_merged = df_merged[df_merged['participant'] != 11]
df_merged = df_merged[df_merged['participant'] != 42]

df_merged

df_merged.to_csv('complete_df_recall.csv', index=False)

In [22]:
def search_survey(data):
  df = pd.DataFrame()

  for participant in data['participants']:  
      # Create a copy of the survey dictionary and add the participant number
      survey_data = participant['survey'].copy()
      survey_data['participant'] = participant['metadata']['participant_number']
      
      # Append the updated survey data to the dataframe
      df = df.append(survey_data, ignore_index=True)

  return df

In [23]:
survey_results = search_survey(data)
survey_results

# survey_results.to_csv('survey_results.csv', index=False)

  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(survey_data, ignore_index=True)
  df = df.append(sur

Unnamed: 0,anomaly_difficulty,automatic_detection_helpful,automatic_detection_helpful_explanation,history_helpful,history_helpful_explanation,difficult_v1,difficult_v2,difficult_v3,difficult_v4,preference_v1,preference_v2,preference_v3,preference_v4,issues_or_interruptions,participant
0,2,6.0,Especially in the runs where the blobs ´crunch...,1,It did not help me deduce the patterns.,3,4,1,2,4,1,3,2,My mouse did not respond as quickly as it does...,18
1,2,5.0,"Although it was incorrect quite often, it did ...",2,,3,4,1,2,2,1,4,3,,43
2,2,3.0,It was not that helpful because you still had ...,4,-,4,3,2,1,1,2,3,4,no,51
3,2,6.0,"`Guides the eye, gives something to focus on a...",3,"Distracting and first, but data accumulated sh...",4,3,1,2,2,1,4,3,"I only had automatic detection once, this was ...",42
4,2,5.0,"It gave me something to focus on, a starting p...",6,I found the smaller colored lines without the ...,1,2,3,4,4,3,2,1,No.,12
5,2,5.0,often the detection would indicate a wrong ano...,5,"the compressed view could be handy, the histor...",3,4,2,2,3,4,1,2,,52
6,2,2.0,The automatic detection borders distracted me ...,6,For the long chains it was easier to see the d...,2,1,4,3,2,4,1,3,No,63
7,3,3.0,Sometimes they were inaccurate so you had to w...,5,The first time it was confusing but the last t...,4,3,2,1,1,3,2,4,No :),44
8,2,1.0,It felt like it was consistently taking starts...,6,"The aggregated view did very little for me, th...",3,1,4,2,2,4,1,3,No more than usual.,32
9,2,5.0,Yes the red borders were somewhat helpful. It'...,7,I particularly liked the compressed view. It h...,1,2,3,4,4,3,2,1,No.,27


Voor morgen:
- patronen toevoegen door een mapping te maken van versie van patronen
- quantitative data toevoegen
- dataset en versie etc. toevoegen