# **Habit App - Download, Save and Parse data**

* This code is structured to work using Google Colab. 
* It is designed to connect to MongoDB and pull the data using credentials located in a JSON file (determined by the variable credentialsFileNameRelativePath)
> This code was written by Rani Gera, last edited in November 2022


In [16]:
#@title Load Drive
# Mount the user's Google Drive inside the Colab runtime; main_path (defined in
# the parameter cell) points below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
#@title Import packages
# *** In Colab one might need to RUN this part and if there is an error, RESTART THE RUNTIME AND RUN AGAIN) ***
import os
import json
import numpy as np
import pandas as pd
import collections
import dateutil.parser
import pytz
from datetime import datetime, timedelta
from scipy import stats
from pymongo import MongoClient

# Helps make csv like string data to dataframe: (taken from https://stackoverflow.com/questions/22604564/create-pandas-dataframe-from-a-string):
import sys
if sys.version_info[0] < 3:
    from StringIO import StringIO
else:
    from io import StringIO


In [45]:
#@title Parameter definition { form-width: "5%" }
# Subject IDs removed unconditionally by Data.remove_manual_exclusions.
manual_exclusions = [] # example:[705, 706]

# Parallel lists of inclusive subject-ID bounds: range i is
# [minSubID[i], maxSubID[i]] (used as "$gte"/"$lte" in getRawDataInRange).
minSubID = [100, 200, 300]
maxSubID = [199, 299, 399]

doNotExcludedDueToMissingDays = [] # participants not to exclude due to missing days
doNotExcludedDueToNotSatisfyingMinDailyEntries = [] # participants not satisfying the minimum amount of daily entries

# paths:
# main_path = os.path.dirname(os.path.abspath(__file__))
main_path = '/content/drive/MyDrive/Experiments/HAS_STUDY/HAS_Analysis'
credentialsFileNameRelativePath = 'mongoDB_stuff/mongo_DB_credentials.json'
DB_credentials_file = os.path.join(main_path, credentialsFileNameRelativePath)
# Raw data file names:
rawDataFileName = os.path.join(main_path, 'data/extracted_data/raw_data.json')
rawDataAsListFile = 'raw_data.txt'
rawDataAsBinaryFile = "raw_data.pkl"
# Parsed data filenames:
filteredDF_File = os.path.join(main_path, 'data/extracted_data/filteredDF.csv')
allDataFor_R_File = os.path.join(main_path, 'data/extracted_data/all_data_for_R.csv')
# NOTE(review): keeps a leading slash, unlike the paths above; presumably this is
# the value passed to create_core_table, which builds the full path with
# `main_path + coreTableRelativePath` -- confirm against the caller.
coreDataFileName = '/data/extracted_data/core_table.csv'


# number of experimental days by group:
n_experimental_days = {'short_training': 4, 'long_training': 11, 'long_training_parallel_manipulations': 11}
# number of experimental manipulations per subject:
n_manipulations = {'short_training': 3, 'long_training': 3, 'long_training_parallel_manipulations': 6}
n_only_val_and_deval_manipulations = {'short_training': 2, 'long_training': 2, 'long_training_parallel_manipulations': 4}
# days on which the parallel-manipulations group gets its week-1 manipulations
# (create_core_table uses these to rename manipulationToday to the *_week1 labels):
parallelManiplationDays = {'still_valued_week1': 2, 'still_valued_post_deval_week1': 4}

# day of each main manipulation, by group:
main_manipulation_days = {
    'short_training': {'still_valued': 2, 'devaluation': 3, 'still_valued_post_deval': 4},
    'long_training': {'still_valued': 9, 'devaluation': 10, 'still_valued_post_deval': 11},
    'long_training_parallel_manipulations': {'still_valued': 9, 'devaluation': 10, 'still_valued_post_deval': 11},  
}

# manipulation labels per group; the order matches the days listed in all_manipulation_days below:
manipulations_renamed = {
    'short_training': ['still_valued', 'devaluation', 'still_valued_post_deval'],
    'long_training': ['still_valued', 'devaluation', 'still_valued_post_deval'],
    'long_training_parallel_manipulations': ['still_valued_week1', 'still_valued_replacing_devaluation', 'still_valued_post_deval_week1', 'still_valued', 'devaluation', 'still_valued_post_deval'],  
}

# day of every manipulation (including the week-1 ones of the parallel group), by group:
all_manipulation_days = {
    'short_training': {'still_valued': 2, 'devaluation': 3, 'still_valued_post_deval': 4},
    'long_training': {'still_valued': 9, 'devaluation': 10, 'still_valued_post_deval': 11},
    'long_training_parallel_manipulations': {'still_valued_week1': 2, 'still_valued_replacing_devaluation': 3, 'still_valued_post_deval_week1': 4, 'still_valued': 9, 'devaluation': 10, 'still_valued_post_deval': 11},  
}

# minimum number of entries (with an outcomeTime) a subject must have on each day
# (checked in Data.remove_data_based_exclusions):
minimumDailyEntriesRequired = 5

# lines to remove in data:
# (Data.filter_trials_data drops a raw record when it holds any of these keys with the listed value)
signals_to_remove_data_lines = {
    'opennedInstallaitonPage': True,
    'showInstructions': True,
    'instructionsStartedFlag': True,
    'isDemo': True
}

localTimeZone = "Israel" # used to adjust the time variables which are recorded in utc time.
timeInStringColumns = ['startTime', 'press1Time', 'press2Time', 'outcomeTime', 'endTime', 'userExitOrUnloadTime',
                       'realGameBeginsAlertTime', 'realGameBeginsConfirmationTime', 'resetContainerAlertTime', 'resetContainerConfirmationTime', 'manipulationAlertTime', 'manipulationConfirmationTime', 'foundCaveAlertTime', 'foundCaveConfirmationTime', 'endExperimentAlertTime'] # these variables will be converted to the local time zone and saved as datetime variables
# default column set of the analysis data frame (also the default df_vars of SimulateData):
selected_df_structure = (['subId', 'group', 'day', 'cost', 'reward', 'hideOutcome', 'manipulationToday', 'activateManipulation', 'isUnderManipulation', 'consumptionTest', 'coins_task_hits_count', 'coins_task_misses_count', 'coin_task_finish_status']
                         + timeInStringColumns
                         + ['dataLoadingTime', 'screenOrientationData', 'touchData', 'isDemo', 'endExperiment', 'uniqueEntryID'])

# viewing stuff
# Use fully qualified option names: the abbreviated pattern 'max_rows' also
# matches 'styler.render.max_rows' in pandas >= 1.4 and then raises OptionError.
pd.set_option('display.max_rows', None)  # never truncate rows when printing
pd.set_option('display.max_colwidth', None)  # never truncate cell contents
pd.set_option('display.max_columns', 40)  # set this number to >= your number of cols
pd.set_option('display.width', 1000)


In [19]:
#@title Function definition { form-width: "5%" }
# -----------------------------------------------------------------------
def read_mongoextjson_file(
        filename):  # adapted from https://stackoverflow.com/a/60850425 to handle the bson files created by MongoDB
    """Parse a MongoDB shell-style ("TenGen") JSON export into Python objects.

    The whole file is read into memory, the shell-only constructors
    ``ObjectId(...)``, ``ISODate(...)`` and ``NumberInt(...)`` are rewritten
    into strict extended JSON, and the result is decoded with
    ``bson.json_util.object_hook``.

    Args:
        filename: path of the text file to read.

    Returns:
        The decoded JSON document.
    """
    # Imported here because the notebook's import cell provides neither name;
    # without these the function fails with NameError.
    import re
    from bson import json_util  # bson ships with pymongo

    with open(filename, "r") as f:
        # read the entire input; in a real application,
        # you would want to read a chunk at a time
        bsondata = f.read()

    # convert the TenGen JSON to Strict JSON
    # here, I just convert the ObjectId, Date and NumberInt structures,
    # but it's easy to extend to cover all structures listed at
    # http://www.mongodb.org/display/DOCS/Mongo+Extended+JSON
    jsondata = re.sub(r'ObjectId\s*\(\s*\"(\S+)\"\s*\)',
                      r'{"$oid": "\1"}',
                      bsondata)
    jsondata = re.sub(r'ISODate\s*\(\s*(\S+)\s*\)',
                      r'{"$date": \1}',
                      jsondata)
    jsondata = re.sub(r'NumberInt\s*\(\s*(\S+)\s*\)',
                      r'{"$numberInt": "\1"}',
                      jsondata)

    # now we can parse this as JSON, and use MongoDB's object_hook
    # function to get rich Python data structures inside a dictionary
    return json.loads(jsondata, object_hook=json_util.object_hook)


def getRawDataAll(DB_credentials_file):
  '''
  Using a (local) json file which holds the credentials for our Mongo database
  to connect and pull all the (raw) data of the configured collection.

  DB_credentials_file: path to a json file with the keys 'host', 'username',
  'password', 'authSource', 'authMechanism', 'tls' and 'collectionName'.

  Returns a list with every document of that collection in the 'nodejs-app'
  database.
  '''
  with open(DB_credentials_file) as f:
    DB_credentials = json.load(f)
  # MongoClient is a context manager: leaving the block closes the connection,
  # which was previously left open after every call.
  with MongoClient(DB_credentials['host'],
                   username=DB_credentials['username'],
                   password=DB_credentials['password'],
                   authSource=DB_credentials['authSource'],
                   authMechanism=DB_credentials['authMechanism'],
                   tls=DB_credentials['tls']) as client:
    # list() drains the cursor while the client is still open
    return list(client['nodejs-app'][DB_credentials['collectionName']].find({}))


def getRawDataInRange(DB_credentials_file, minSubID, maxSubID):
  '''
  Using a (local) json file which holds the credentials for our Mongo database
  to connect and pull the (raw) data of the subjects whose 'subId' falls in
  one of the given inclusive ranges.

  DB_credentials_file: path to a json file with the keys 'host', 'username',
  'password', 'authSource', 'authMechanism', 'tls' and 'collectionName'.
  minSubID, maxSubID: parallel sequences; range i is [minSubID[i], maxSubID[i]].
  Any number of ranges is accepted (previously exactly three were hard-coded).

  Returns one list with the documents of all ranges, concatenated in range order.
  Raises ValueError if the two sequences differ in length.
  '''
  if len(minSubID) != len(maxSubID):
    raise ValueError('minSubID and maxSubID must have the same number of entries')
  with open(DB_credentials_file) as f:
    DB_credentials = json.load(f)
  # MongoClient is a context manager: leaving the block closes the connection,
  # which was previously left open after every call.
  with MongoClient(DB_credentials['host'],
                   username=DB_credentials['username'],
                   password=DB_credentials['password'],
                   authSource=DB_credentials['authSource'],
                   authMechanism=DB_credentials['authMechanism'],
                   tls=DB_credentials['tls']) as client:
    collection = client['nodejs-app'][DB_credentials['collectionName']]
    records = []
    # one query per range keeps the original result order (range by range)
    for lowestID, highestID in zip(minSubID, maxSubID):
      records.extend(collection.find({'subId': {"$gte": lowestID, "$lte": highestID}}))
    return records


def getRawDataOneSubject(DB_credentials_file, subject):
  '''
  Using a (local) json file which holds the credentials for our Mongo database
  to connect and pull the (raw) data of a single subject.

  DB_credentials_file: path to a json file with the keys 'host', 'username',
  'password', 'authSource', 'authMechanism', 'tls' and 'collectionName'.
  subject: value matched against the 'subId' field of the documents.

  Returns a list with the matching documents.
  '''
  with open(DB_credentials_file) as f:
    DB_credentials = json.load(f)
  # MongoClient is a context manager: leaving the block closes the connection,
  # which was previously left open after every call.
  with MongoClient(DB_credentials['host'],
                   username=DB_credentials['username'],
                   password=DB_credentials['password'],
                   authSource=DB_credentials['authSource'],
                   authMechanism=DB_credentials['authMechanism'],
                   tls=DB_credentials['tls']) as client:
    # list() drains the cursor while the client is still open
    return list(client['nodejs-app'][DB_credentials['collectionName']].find({'subId': subject}))


def convertToLocalTime(df, timeInStringColumns, localTimeZone):
  '''
  This function converts, in the data frame df, the columns listed in
  timeInStringColumns from date strings (parsed with dateutil) to datetime
  values expressed in the time zone localTimeZone.
  Missing values are left untouched.
  The function mutates the input data frame and returns None.

  df: the data frame to convert (every listed column must exist in it).
  timeInStringColumns: names of the columns to convert.
  localTimeZone: a time zone name understood by pytz.timezone.
  '''
  for timeVar in timeInStringColumns:
    # Read from the df argument itself; the previous version read from the
    # global `data.sub_df`, ignoring the frame that was passed in.
    df[timeVar] = df[timeVar].map(lambda x: dateutil.parser.parse(x).astimezone(pytz.timezone(localTimeZone)), na_action='ignore')


In [20]:
#@title Class definition { form-width: "5%" }
class Paths:
    """Bundle of file-system locations derived from the module-level path settings.

    Attributes set on construction:
      main_analysis  -- main_path
      app_data       -- main_path / data_path / app_data_path
      app_data_file  -- app_data / app_data_file
    """

    def __init__(self):
        self.main_analysis = main_path
        # build the data directory step by step, then the file inside it
        data_root = os.path.join(main_path, data_path)
        self.app_data = os.path.join(data_root, app_data_path)
        self.app_data_file = os.path.join(self.app_data, app_data_file)


class SimulateData:

    def __init__(self, df_vars=selected_df_structure, n_subjects=198):
        """Simulate entry-level data for two training groups and store it in ``self.sub_df``.

        Half of the subjects get IDs starting at 1701 (``subId % 200 > 100``:
        "short_training", days 1-5, manipulations on days 3 and 4) and half get
        IDs starting at 1801 ("long_training", days 1-12, manipulations on days
        10 and 11). For every manipulation day a random onset entry is drawn;
        entries from that onset to the end of the day are flagged
        ``isUnderManipulation``.

        Args:
            df_vars: column names of the resulting data frame; simulated columns
                that are not listed are appended after them.
            n_subjects: total number of simulated subjects, split evenly between
                the two groups (an odd number is rounded down).
        """
        # `random` is not imported by the notebook's import cell, so import it here.
        import random

        # attributes initiation:
        self.core_table = []
        self.core_data = []
        self.summaryStats = []
        self.diff_index_data = []
        self.diffSummaryStats = []
        self.habitIndex = []
        self.habitIndexSummaryStats = []
        self.total_entries_per_day = []

        # simulate data:
        print('Simulating data - suitable for up to 200 participants (edit code for more)')
        # initialize some stuff:
        manipulationTodayOPTIONS = ['devaluation', 'still_valued']
        n_per_group = int(n_subjects/2)
        sub_list = list(range(1701, 1701 + n_per_group)) + list(range(1801, 1801 + n_per_group))
        subject_frames = []
        for subId in sub_list:
            is_short_training = subId % 200 > 100
            # as there are *2.40 days (when including the 5th and 12th day)
            n_data_point = random.randint(100, 500) if is_short_training else random.randint(240, 1200)
            last_simulated_day = 5 if is_short_training else 12
            # set some of the variables:
            dic_data = {'subId': [subId] * n_data_point,
                        'group': ["short_training" if is_short_training else "long_training"] * n_data_point,
                        'day': sorted(random.randint(1, last_simulated_day) for _ in range(n_data_point)),
                        'cost': [[1]] * n_data_point,
                        'reward': random.choices([0,0,15],k=n_data_point),
                        }
            # set manipulationToday: the subject's parity decides which manipulation comes first
            first_manipulation_day, second_manipulation_day = (3, 4) if is_short_training else (10, 11)
            manipulation_by_day = {
                first_manipulation_day: manipulationTodayOPTIONS[subId % 2],
                second_manipulation_day: manipulationTodayOPTIONS[abs(subId % 2 - 1)],
            }
            manipulationToday = [manipulation_by_day.get(day) for day in dic_data['day']]
            dic_data['manipulationToday'] = manipulationToday

            # set isUnderManipulation
            isUnderManipulation = [False] * n_data_point
            for manipulation in manipulationTodayOPTIONS:
                n_entries_that_day = manipulationToday.count(manipulation)
                if n_entries_that_day == 0:
                    # no entry was simulated on that day (list.index used to raise ValueError here)
                    continue
                first_entry_of_day = manipulationToday.index(manipulation)
                # days are sorted, so the entries of one manipulation day are contiguous
                last_entry_of_day = first_entry_of_day + n_entries_that_day - 1
                onset_offset = round(random.normalvariate(mu=n_entries_that_day/2, sigma=10) - 1)
                # A negative offset used to start the flag before the manipulation day
                # (or, for a negative index, change the list length); clamp it to the day start.
                onset_entry = first_entry_of_day + max(onset_offset, 0)
                if onset_entry <= last_entry_of_day:  # an onset past the end of the day flags nothing
                    isUnderManipulation[onset_entry:last_entry_of_day + 1] = [True] * (last_entry_of_day - onset_entry + 1)
            dic_data['isUnderManipulation'] = isUnderManipulation

            subject_frames.append(pd.DataFrame.from_dict(dic_data))

        # DataFrame.append was removed in pandas 2.0; concatenate once at the end instead.
        if subject_frames:
            simulated = pd.concat(subject_frames, ignore_index=True)
            extra_columns = [column for column in simulated.columns if column not in df_vars]
            self.sub_df = simulated.reindex(columns=list(df_vars) + extra_columns)
        else:
            self.sub_df = pd.DataFrame(columns=df_vars)

    def create_core_table(self, coreTableRelativePath):
        """Count entries per subject / group / manipulation / pre-post and save the table.

        For the 'long_training_parallel_manipulations' group the week-1
        manipulations are first relabelled in ``self.sub_df`` (in place) to
        'still_valued_week1' and 'still_valued_post_deval_week1', using the days
        in ``parallelManiplationDays``. The counts are written as csv to
        ``main_path + coreTableRelativePath``.

        Args:
            coreTableRelativePath: path appended as-is (plain string
                concatenation) to ``main_path``.

        Returns:
            The core table as a flat data frame with the columns subID, group,
            manipulation, time ('pre'/'post') and n_entries; it is also stored
            in ``self.core_table``.
        """
        # Use this instance's own data; the previous version read the masks from
        # the global `data.sub_df`, which is wrong for any other instance.
        isParallelGroup = self.sub_df.group == 'long_training_parallel_manipulations'
        for recordedName, week1Name in (('still_valued', 'still_valued_week1'),
                                        ('still_valued_post_deval', 'still_valued_post_deval_week1')):
            rowsToRename = (isParallelGroup
                            & (self.sub_df.day == parallelManiplationDays[week1Name])
                            & (self.sub_df.manipulationToday == recordedName))
            self.sub_df.loc[rowsToRename, 'manipulationToday'] = week1Name
        groupingVars = ["subId", "group", "manipulationToday", "isUnderManipulation"]
        print('** Grouping by:')
        print(groupingVars)
        # do the grouping (unstack/stack fills missing pre/post combinations with 0):
        groupedData = self.sub_df.groupby(by=groupingVars).size().unstack(fill_value=0).stack()
        # change and set names
        groupedData.index.names = ['subID', 'group', 'manipulation', 'time']
        groupedData = groupedData.rename(index={False: 'pre', True: 'post'})
        groupedData.name = 'n_entries'
        print('** The data:')
        print(groupedData)
        groupedData.to_csv(main_path + coreTableRelativePath)
        print('** csv file saved to ' + coreTableRelativePath)
        # round-trip through csv text to get a flat table with plain columns
        self.core_table = pd.read_csv(StringIO(groupedData.to_csv()))
        return self.core_table

    def core_bar_plots(self):
        """Build the summary tables from ``self.core_table`` and draw the core figures.

        Stores on the instance: core_data, summaryStats, diff_index_data,
        diffSummaryStats, habitIndex, habitIndexSummaryStats and
        total_entries_per_day. Then shows one pre/post bar plot per group, one
        difference-index bar plot (short vs. long training) and two
        "learning curve" figures (without and with individual subject lines).

        ``create_core_table`` must have been called first, because
        ``self.core_table`` has to be a data frame.
        NOTE(review): relies on a module-level ``plt`` (matplotlib.pyplot), which
        the visible import cell does not import -- confirm it is imported elsewhere.
        """
        # initialize vars (tables):
        groups = self.sub_df.group.unique()
        # making the data ready for ploting:
        core_data = self.core_table.set_index(['subID', 'group', 'manipulation', 'time'])
        summaryStats = self.core_table.groupby(['group', 'manipulation', 'time']).n_entries.describe()
        summaryStats['std_err'] = self.core_table.groupby(['group', 'manipulation', 'time']).n_entries.sem()
        print(summaryStats)
        # (the `axis=0` argument was dropped from the groupby calls: it is the default and deprecated)
        diff_index_data = core_data.groupby(level=[0, 1, 2]).diff().dropna().reset_index('time', drop=True) # post - pre
        diffSummaryStats = diff_index_data.groupby(['group', 'manipulation']).n_entries.describe()
        diffSummaryStats['std_err'] = diff_index_data.groupby(['group', 'manipulation']).n_entries.sem()
        habitIndex = diff_index_data.groupby(level=[0, 1]).diff().dropna().reset_index('manipulation', drop=True) # [valued post - pre] minus [devalued post - pre]
        habitIndexSummaryStats = habitIndex.groupby(['group']).n_entries.describe()
        total_entries_per_day = pd.DataFrame(self.sub_df.groupby(by=['subId', 'group', 'day']).size().unstack(fill_value=np.nan).stack()).rename(
            columns={0: 'n_entries'}).reset_index().pivot(index='subId', columns=['day'], values='n_entries')

        # save tables
        self.core_data = core_data
        self.summaryStats = summaryStats
        self.diff_index_data = diff_index_data
        self.diffSummaryStats = diffSummaryStats
        self.habitIndex = habitIndex
        self.habitIndexSummaryStats = habitIndexSummaryStats
        self.total_entries_per_day = total_entries_per_day

        # bar plot per group
        for group in groups:
            plt.figure()
            barWidth = 0.3  # width of the bars
            bars1 = [summaryStats.loc[group, 'still_valued', 'pre']['mean'], summaryStats.loc[group,'devaluation','pre']['mean']]  # Choose the height of the blue bars
            bars2 = [summaryStats.loc[group, 'still_valued', 'post']['mean'], summaryStats.loc[group,'devaluation','post']['mean']]  # Choose the height of the cyan bars
            yer1 =  [summaryStats.loc[group, 'still_valued', 'pre']['std_err'], summaryStats.loc[group,'devaluation','pre']['std_err']]  # Choose the height of the error bars (bars1)
            yer2 =  [summaryStats.loc[group, 'still_valued', 'post']['std_err'], summaryStats.loc[group,'devaluation','post']['std_err']]  # Choose the height of the error bars (bars2)
            # The x position of bars
            r1 = np.arange(len(bars1))
            r2 = [x + barWidth for x in r1]
            # Create blue bars
            plt.bar(r1, bars1, width=barWidth, color='blue', edgecolor='black', yerr=yer1, capsize=7, label='pre', alpha=0.5, zorder=0)
            # Create cyan bars
            plt.bar(r2, bars2, width=barWidth, color='cyan', edgecolor='black', yerr=yer2, capsize=7, label='post', alpha=0.5, zorder=0)
            # general layout
            plt.xticks(r1 + 0.5 * barWidth, ['valued', 'devalued'])
            plt.ylabel('# entries')
            plt.title(group)
            # add individual data points
            for subID in core_data.index.get_level_values('subID').unique():
                if core_data.loc[subID].index.get_level_values('group')[0] == group:
                    subUniqueColor = np.random.rand(3)*0.9
                    plt.plot(np.array([r1[0], r2[0]]), np.array([core_data.loc[subID, group, 'still_valued', 'pre'].n_entries, core_data.loc[subID, group, 'still_valued', 'post'].n_entries]), zorder=0, marker='o', linewidth=1, markersize=2, color=subUniqueColor, alpha=0.3)
                    plt.plot(np.array([r1[1], r2[1]]), np.array([core_data.loc[subID, group, 'devaluation', 'pre'].n_entries, core_data.loc[subID, group, 'devaluation', 'post'].n_entries]), zorder=0, marker='o', linewidth=1, markersize=2, color=subUniqueColor, alpha=0.3)

            plt.legend(loc='upper center')
            plt.show()

        # bar plot diff:
        plt.figure()
        barWidth = 0.3  # width of the bars
        try:
            bars1 = [diffSummaryStats.loc['short_training', 'still_valued']['mean'], diffSummaryStats.loc['long_training','still_valued']['mean']]  # Choose the height of the blue bars
            bars2 = [diffSummaryStats.loc['short_training', 'devaluation']['mean'], diffSummaryStats.loc['long_training','devaluation']['mean']]  # Choose the height of the cyan bars
            yer1 =  [diffSummaryStats.loc['short_training', 'still_valued']['std_err'], diffSummaryStats.loc['long_training','still_valued']['std_err']]  # Choose the height of the error bars (bars1)
            yer2 =  [diffSummaryStats.loc['short_training', 'devaluation']['std_err'], diffSummaryStats.loc['long_training','devaluation']['std_err']]  # Choose the height of the error bars (bars2)
        except KeyError:
            # no 'long_training' rows in the table: plot the short-training group only
            bars1 = [diffSummaryStats.loc['short_training', 'still_valued']['mean']]  # Choose the height of the blue bars
            bars2 = [diffSummaryStats.loc['short_training', 'devaluation']['mean']]  # Choose the height of the cyan bars
            yer1 =  [diffSummaryStats.loc['short_training', 'still_valued']['std_err']]  # Choose the height of the error bars (bars1)
            yer2 =  [diffSummaryStats.loc['short_training', 'devaluation']['std_err']]  # Choose the height of the error bars (bars2)
        # The x position of bars
        r1 = np.arange(len(bars1))
        r2 = [x + barWidth for x in r1]
        # Create blue bars
        plt.bar(r1, bars1, width=barWidth, color='blue', edgecolor='black', yerr=yer1, capsize=7, label='valued', alpha=0.5, zorder=0)
        # Create cyan bars
        plt.bar(r2, bars2, width=barWidth, color='cyan', edgecolor='black', yerr=yer2, capsize=7, label='devalued', alpha=0.5, zorder=0)
        # general layout (one tick label per plotted group, so the short-only fallback gets a single label)
        plt.xticks(r1 + 0.5 * barWidth, ['short_training', 'long_training'][:len(bars1)])
        plt.ylabel('Diff # entries')
        plt.title('Sensitivity to outcome devaluation [difference index]')
        # add individual data points
        for subID in diff_index_data.index.get_level_values('subID').unique():
            #subUniqueColor = np.random.rand(3) * 0.9
            subGroup = diff_index_data.loc[subID].index.get_level_values('group')[0]
            if subGroup == 'short_training':
                plt.plot(np.array([r1[0], r2[0]]), np.array([diff_index_data.loc[subID, 'short_training', 'still_valued'].n_entries, diff_index_data.loc[subID, 'short_training', 'devaluation'].n_entries]),zorder=0, marker='o', linewidth=1, markersize=2, alpha=0.3)
            elif subGroup == 'long_training':
                plt.plot(np.array([r1[1], r2[1]]), np.array([diff_index_data.loc[subID, 'long_training', 'still_valued'].n_entries, diff_index_data.loc[subID, 'long_training', 'devaluation'].n_entries]), zorder=0, marker='o', linewidth=1, markersize=2, alpha=0.3)
        plt.legend(loc='upper center')
        plt.show()

        # create 'learning' curve figure (using inner functions):
        # -------------------------------------------------------
        def fillMissingDays(meanByDay, semByDay):
            # Add a zero for every day between 1 and the last listed day that has no column
            # (in case participants entered following the experiment completion) and
            # return both series sorted by day. sort_index() returns a new object; its
            # result used to be discarded, leaving the added days at the end of the line.
            for day in range(1, int(meanByDay.index.max()) + 1):
                if day not in meanByDay.index:
                    meanByDay[float(day)] = 0
                    semByDay[float(day)] = 0
            return meanByDay.sort_index(), semByDay.sort_index()

        def createLearningCurve(days_data=total_entries_per_day, individualLines=False):  # a local function
            # The groups are separated by subject ID.
            # NOTE(review): IDs 301-399 (the third range of minSubID/maxSubID) also satisfy
            # `% 200 > 100` and are therefore drawn as short training -- confirm this is intended.
            shortTraining = days_data.loc[days_data.index % 200 > 100]
            longTraining = days_data.loc[days_data.index % 200 < 100]
            hasLongTraining = not longTraining.empty

            daysDataMean, daysDataSEM = fillMissingDays(days_data.mean(skipna=True), days_data.sem(skipna=True))
            shortTrainingMean, shortTrainingSEM = fillMissingDays(shortTraining.mean(skipna=True), shortTraining.sem(skipna=True))
            if hasLongTraining:
                longTrainingMean, longTrainingSEM = fillMissingDays(longTraining.mean(skipna=True), longTraining.sem(skipna=True))

            # plot
            plt.figure()
            if individualLines:  # create individual lines
                plt.plot(shortTraining.T, color='blue', alpha=0.1)
                if hasLongTraining:  # explicit check instead of a bare `except: pass`
                    plt.plot(longTraining.T, color='red', alpha=0.1)
            # plt.plot(total_entries_per_day.T, color='black', alpha=0.1) # all together
            # mean lines
            plt.errorbar(daysDataMean.index, daysDataMean, color='green', label='All', yerr=daysDataSEM, capsize=7, alpha=0.5, marker='o', linewidth=2)
            plt.errorbar(shortTrainingMean.index, shortTrainingMean, color='blue', label='Short-Training', yerr=shortTrainingSEM, capsize=7, alpha=0.5, marker='o', linewidth=2)
            if hasLongTraining:
                plt.errorbar(longTrainingMean.index, longTrainingMean, color='red', label='Long-Training', yerr=longTrainingSEM, capsize=7, alpha=0.5, marker='o', linewidth=2)
            plt.legend()
            plt.xlabel('Day')
            plt.ylabel('# entries')
            plt.title('"learning curve"')
            plt.show()

        createLearningCurve()
        createLearningCurve(individualLines=True)


class Data(SimulateData):
    def __init__(self, minSubID = None, maxSubID = None):
        """Pull the raw records from the database and prepare empty containers.

        When both ``minSubID`` and ``maxSubID`` are given (non-empty), only the
        subjects inside those ID ranges are fetched with ``getRawDataInRange``;
        otherwise the whole collection is fetched with ``getRawDataAll``. Both
        use the credentials file ``DB_credentials_file``.
        ``SimulateData.__init__`` is not called, so no data is simulated.

        Earlier ways of loading the data, no longer used:
          1. [deprecated by Rani] json.load on Paths().app_data_file (an array
             of plain json, not in the bson format).
          2. read_mongoextjson_file(Paths().app_data_file), a text file saved
             from "robo 3t" after running
             db.getCollection('sessions').find({}).toArray().
        """
        # current way: get the data directly from the data base
        if minSubID and maxSubID:
          self.raw = getRawDataInRange(DB_credentials_file, minSubID, maxSubID)
        else:
          self.raw = getRawDataAll(DB_credentials_file)

        # containers that the processing methods fill in later
        self.sub_df = []
        self.sub_df_list = []
        self.sub_dict = {}
        self.subject_list = []
        self.trials_data = []

    # Get methods from SimulateData
    # (explicit re-binding of the two plain functions stored in SimulateData's
    # class dictionary; Data subclasses SimulateData, so both names would also
    # resolve through normal inheritance)
    create_core_table = SimulateData.__dict__["create_core_table"]
    core_bar_plots = SimulateData.__dict__["core_bar_plots"]

    def filter_trials_data(self):
        """Append to ``self.trials_data`` every raw record that is not flagged for removal.

        A record is flagged when it holds at least one key of
        ``signals_to_remove_data_lines`` with exactly the value listed there.
        """
        for record in self.raw:
            flagged = any(
                signal in record and record[signal] == flag_value
                for signal, flag_value in signals_to_remove_data_lines.items()
            )
            if not flagged:
                self.trials_data.append(record)

    def create_sub_dict(self, json_array_data=None):
        """Group the records by subject and store them in ``self.sub_dict``.

        ``self.sub_dict`` becomes an OrderedDict sorted by subject ID, mapping
        each 'subId' to the list of its records in their original order.

        json_array_data: the records to group; defaults to ``self.trials_data``.
        """
        records = self.trials_data if json_array_data is None else json_array_data
        records_by_subject = collections.defaultdict(list)
        for record in records:
            records_by_subject[record['subId']].append(record)
        self.sub_dict = collections.OrderedDict(sorted(records_by_subject.items()))

    def remove_manual_exclusions(self, sub_dict=None):
        """Drop (in place) every subject listed in ``manual_exclusions``.

        sub_dict: the subject dictionary to modify; defaults to ``self.sub_dict``.
        IDs that are not in the dictionary are ignored.
        """
        target = self.sub_dict if sub_dict is None else sub_dict
        for excluded_id in manual_exclusions:
            target.pop(excluded_id, None)

    def remove_data_based_exclusions(self, sub_dict=None, remove_subs_without_a_day_that_exceeds_the_last_day=False, requireAllManipulatios=False):
        """Remove (in place) subjects whose data do not satisfy the inclusion criteria.

        A subject is removed when:
          1. no experimental group can be determined for it;
          2. it has no entry on one of the experimental days, unless it is listed
             in ``doNotExcludedDueToMissingDays``; when only the last day is
             missing the user is asked interactively (via ``input``) whether to
             remove it;
          3. the number of days with at least ``minimumDailyEntriesRequired``
             entries that have an outcomeTime differs from the number of
             experimental days, unless it is listed in
             ``doNotExcludedDueToNotSatisfyingMinDailyEntries``;
          4. (optional) its last listed day does not exceed the last experimental day;
          5. it confirmed too few manipulations.

        Args:
            sub_dict: subject dictionary to modify; defaults to ``self.sub_dict``.
            remove_subs_without_a_day_that_exceeds_the_last_day: enables criterion 4.
            requireAllManipulatios: when True criterion 5 counts all manipulations
                (``n_manipulations``); otherwise only the devaluation and
                still-valued ones (``n_only_val_and_deval_manipulations``).
        """
        print('** Removing data based exclusions **')
        print('-- removing subjects without a day that exceeds the last day is set to: ' + str(
            remove_subs_without_a_day_that_exceeds_the_last_day))
        if sub_dict is None:
            sub_dict = self.sub_dict

        # iterate over a copy of the keys, as subjects are popped inside the loop
        subjects = list(sub_dict.keys())
        for subID in subjects:
            particularSubDF = pd.DataFrame(sub_dict[subID])
            try:
                subGroup = particularSubDF[particularSubDF.group.notna()].group.iloc[-1]
                n_expectedDaysOfExperiment = n_experimental_days[subGroup]
            except (KeyError, AttributeError, IndexError):
                # KeyError: unknown group name; AttributeError: no 'group' column at all;
                # IndexError: the column exists but holds no value. Only KeyError used to be caught.
                print('subject ' + str(
                    subID) + ' appears not to have a group variable on their last entry, removing subject.')
                sub_dict.pop(subID, None)
                continue

            actualDaysOfExperiment = particularSubDF[(particularSubDF.day.notna()) & (particularSubDF.day <= n_expectedDaysOfExperiment)].day.unique()
            n_actualDaysOfExperiment = len(actualDaysOfExperiment)
            last_day_listed_for_this_subject = particularSubDF[particularSubDF.day.notna()].day.iloc[-1]

            # removing subject that did not enter at least once at each day
            # (exempt subjects are skipped before the prompt, so they are no longer asked about)
            if n_expectedDaysOfExperiment != n_actualDaysOfExperiment and (subID not in doNotExcludedDueToMissingDays):
                # checks if only the last day is missing (with the post-deval manipulation); a membership
                # test is used because unique() keeps row order, so its last element need not be the latest day
                onlyLastDayMissing = (n_actualDaysOfExperiment == n_expectedDaysOfExperiment-1) and (n_expectedDaysOfExperiment not in actualDaysOfExperiment)
                if onlyLastDayMissing:
                    removeData = input('> subject ' + str(subID) + ' is missing his last day *only*. Do you want to REMOVE its data [y/n]? ') == 'y'
                else:
                    removeData = True
                if removeData:
                    print('REMOVING ' + str(subID), '. Reason: did not commit entry on every day.')
                    sub_dict.pop(subID, None)
                    continue

            # Remove subject who do not satisfy the minimum amount of daily entries:
            subEntriesWithOutcomeTime = particularSubDF[particularSubDF.outcomeTime.notna()]
            subEntriesWithOutcomeTime = subEntriesWithOutcomeTime.drop_duplicates(subset=['subId', 'startTime'], keep='last') # GET RID of double rows with the EXACT same start time (rare but may happen)
            # NOTE(review): every listed day is counted here, including days after the last
            # experimental day -- confirm such days cannot reach the minimum number of entries.
            daysSatisfyingMinimumDailyEntries = np.sum(subEntriesWithOutcomeTime['day'].value_counts() >= minimumDailyEntriesRequired)
            if n_expectedDaysOfExperiment != daysSatisfyingMinimumDailyEntries and (subID not in doNotExcludedDueToNotSatisfyingMinDailyEntries):  # removing subject that did not reach the minimum number of entries on every day
                print('REMOVING ' + str(subID), '. Reason: did not commit the MINIMUM amount of DAILY ENTRIES each day.')
                sub_dict.pop(subID, None)
                continue

            if last_day_listed_for_this_subject == n_expectedDaysOfExperiment: # update if the subject did not enter after the last day (and saw the end game message)
                print('* subject ' + str(subID) + ' last day listed do not exceed the expected last day.')
                if remove_subs_without_a_day_that_exceeds_the_last_day:
                    print('REMOVING ' + str(subID))
                    sub_dict.pop(subID, None)
                    continue  # already removed: do not run (and report) the manipulation checks as well

            if 'manipulationConfirmationTime' not in particularSubDF:
                print('REMOVING ' + str(subID), '- Reason: Did not see confirm any manipulation (do not have "manipulationConfirmationTime" variable in its data frame')
                sub_dict.pop(subID, None)
            elif requireAllManipulatios: # removing subjects that did not see sufficient number of manipulations
                # one confirmed manipulation per day at most (duplicates of the same day are dropped)
                n_all_manipulations = len(particularSubDF[(particularSubDF.activateManipulation == True) & (particularSubDF.manipulationConfirmationTime.notna())].drop_duplicates(subset='day', keep="last"))
                if n_all_manipulations != n_manipulations[subGroup]:
                    print('REMOVING ' + str(subID), '- Reason: Did not see all manipulations (the also consider the post devaluation manipulation. * change method arguments to change this.')
                    sub_dict.pop(subID, None)
            else:
                # only the devaluation / still-valued manipulations are required here
                n_deval_and_still_val_manipulations = len(particularSubDF[(particularSubDF.activateManipulation == True) & (particularSubDF.manipulationConfirmationTime.notna()) &
                                                                          ((particularSubDF.manipulationToday == 'devaluation') | (particularSubDF.manipulationToday == 'still_valued') | (particularSubDF.manipulationToday == 'still_valued_replacing_devaluation'))].drop_duplicates(subset='day', keep="last"))
                if n_deval_and_still_val_manipulations != n_only_val_and_deval_manipulations[subGroup]:
                    print('REMOVING ' + str(subID), '- Reason: Did not see all devaluation and still-valued manipulations.')
                    sub_dict.pop(subID, None)
        print('** Removing data based exclusions: COMPLETED **')

    def sub_dict_2_df(self, sub_dict=None):
        """Combine the per-subject records into one data frame stored in ``self.sub_df``.

        Every value of ``sub_dict`` is converted with ``pd.DataFrame.from_dict`` and
        the resulting frames are stacked in the dictionary's iteration order. When
        more than one subject is present the combined frame gets a fresh 0..n-1 index.

        Args:
            sub_dict: Mapping of subject ID to that subject's data. Defaults to
                ``self.sub_dict`` when ``None``.
        """
        if sub_dict is None:
            sub_dict = self.sub_dict
        frames = [pd.DataFrame.from_dict(sub_data) for sub_data in sub_dict.values()]
        if not frames:
            # No subjects at all: store an empty frame instead of failing on an
            # unbound local variable.
            self.sub_df = pd.DataFrame()
        elif len(frames) == 1:
            # A single subject keeps the index produced by from_dict.
            self.sub_df = frames[0]
        else:
            # A single concat replaces the repeated DataFrame.append calls
            # (DataFrame.append was removed in pandas 2.0).
            self.sub_df = pd.concat(frames, ignore_index=True)

    def customize_df_structure(self, ordered_col_list=selected_df_structure):
        """Restrict ``self.sub_df`` to the given columns (in that order), then convert times.

        Args:
            ordered_col_list: Column labels to keep, in their final order. Defaults to
                the module-level ``selected_df_structure``.
        """
        reordered = self.sub_df[ordered_col_list]
        self.sub_df = reordered
        # convertToLocalTime is defined elsewhere in this file; its return value is
        # ignored here, so it presumably edits the frame in place - TODO confirm.
        convertToLocalTime(self.sub_df, timeInStringColumns, localTimeZone)

    def get_subject_list(self, sub_dict=None):
        """Store the sorted subject IDs (the dictionary keys) in ``self.subject_list``.

        Args:
            sub_dict: Mapping keyed by subject ID. Defaults to ``self.sub_dict``
                when ``None``.
        """
        source = self.sub_dict if sub_dict is None else sub_dict
        self.subject_list = sorted(source.keys())

    def remove_manual_exclusions_from_sub_list(self):
        """Remove every ID in the module-level ``manual_exclusions`` from ``self.subject_list``.

        The list is edited in place; nothing happens when it is empty.
        """
        if not self.subject_list:  # nothing to filter
            return
        for excluded_id in manual_exclusions:
            if excluded_id in self.subject_list:
                self.subject_list.remove(excluded_id)

    def runDataBuilderPipeline(self, requireAllManipulatios=False):
        """Run the data-building stages in order, announcing each one on stdout.

        Args:
            requireAllManipulatios: Forwarded to ``remove_data_based_exclusions``.
        """
        # Each stage is a (message, action) pair; the lambdas defer attribute lookup
        # until the stage actually runs.
        stages = (
            # creates data.trials_data
            ('>>> create data.trials_data',
             lambda: self.filter_trials_data()),
            # creates data.sub_dict (defaulted to create it from data.trials_data)
            ('>>> create data.sub_dict',
             lambda: self.create_sub_dict()),
            ('>>> remove manual exclusions from data.sub_dict',
             lambda: self.remove_manual_exclusions()),
            ('>>> remove data based exclusions',
             lambda: self.remove_data_based_exclusions(
                 remove_subs_without_a_day_that_exceeds_the_last_day=False,
                 requireAllManipulatios=requireAllManipulatios)),
            # create pd data frame
            ('>>> create a data frame',
             lambda: self.sub_dict_2_df()),
            # arrange it according to definition in the parameters (also convert to local time)
            ('>>> customize data frame structure',
             lambda: self.customize_df_structure(ordered_col_list=selected_df_structure)),
        )

        print('    Extracting and Bulding the data    ')
        print('---------------------------------------')
        for announcement, run_stage in stages:
            print(announcement)
            run_stage()
        # The subject list (get_subject_list / remove_manual_exclusions_from_sub_list)
        # is intentionally not built here.
        print('>>> COMPLETED')



# Read and Parse Data

In [None]:
#@title Load and parse data { form-width: "5%" }
# Build the Data object for the configured subject-ID ranges and run the whole parsing pipeline,
# requiring all manipulations (requireAllManipulatios=True).
data = Data(minSubID, maxSubID) # instantiates and creates the data.raw
data.runDataBuilderPipeline(requireAllManipulatios=True)
#data.get_subject_list()  # creates data.subject_list
#data.remove_manual_exclusions_from_sub_list()  # remove manual exclusions from data.subject_list

# Keep only rows that are not demo trials and that have a recorded startTime.
data.sub_df = data.sub_df[(data.sub_df.isDemo == False) & (data.sub_df.startTime.notna())] # get rid of non real trial stuff

In [22]:
#@title Smart remove double lines { form-width: "5%" }
# For every subject, rows sharing the same startTime are collapsed: the columns below are
# scanned in order and the first one that is non-NA in at least one of the duplicated rows
# decides which row survives.

# Columns ordered from the latest to the earliest moment (within a trial) they are expected to be recorded.
vars_records_in_reversed_temporal_order = ['endTime', 'coin_task_finish_status', 'coins_task_hits_count', 'foundCaveConfirmationTime', 'foundCaveAlertTime', \
                                  'manipulationConfirmationTime', 'manipulationAlertTime', 'outcomeTime', 'press2Time', 'press1Time', \
                                  'resetContainerConfirmationTime', 'resetContainerAlertTime', 'realGameBeginsConfirmationTime', 'realGameBeginsAlertTime',  'startTime']

# iterate subjects
for subj in data.sub_df.subId.unique():
  sub_data = data.sub_df[data.sub_df.subId==subj] # get subject's data
  sub_data_duplicated = sub_data[sub_data.duplicated(['startTime'], keep=False)] # get subject's duplicates
  for similarStartTimes in sub_data_duplicated.startTime.unique(): # iterate each duplicated startTime value
    similarStartTimeArray = sub_data_duplicated[sub_data_duplicated.startTime == similarStartTimes]  # create a DF of the duplicated startTime value repetitions
    for recorded_var in vars_records_in_reversed_temporal_order: # iterate the relevant variables from the later to the earlier (time-wise) time they are expected to be formed
      rows_with_recorde_var = similarStartTimeArray[similarStartTimeArray[recorded_var].notna()] # get the rows that contain a not-NA value of it
      if rows_with_recorde_var.shape[0]: # check if the variable exists in at least one row
        if recorded_var == 'coins_task_hits_count': # if the variable is coins_task_hits_count keep the one with the maximum value
          # NOTE(review): only rows that HAVE a coins_task_hits_count value are candidates for removal here,
          # so duplicated rows where it is NA are kept; the else-branch removes every other duplicated row.
          # Confirm this asymmetry is intended.
          indicesToRemove = rows_with_recorde_var.index[rows_with_recorde_var.index != rows_with_recorde_var.index[np.argmax(rows_with_recorde_var.coins_task_hits_count)]] # get the index of the row with the highest value of coins_task_hits_count and then take the other rows to get rid of later
        else:
          # keep the first row holding the variable; all other rows of this startTime group are removed
          indicesToRemove = similarStartTimeArray.index[similarStartTimeArray.index != rows_with_recorde_var.index[0]] # get duplicated rows to remove from the data
        data.sub_df = data.sub_df.drop(indicesToRemove) # remove all other duplicated rows from the data
        break # the first (latest-recorded) variable found decides; stop scanning this group


### Check if and who cheated


#### Some data eyeballing QA stuff


In [None]:
# Show every row of the value counts below. The option is addressed by its full name:
# the bare 'max_rows' pattern also matches 'styler.render.max_rows' in pandas >= 1.4,
# which makes set_option raise an OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', None)
# Rows without a first press, counted per subject.
XX = data.sub_df[(data.sub_df.press1Time.isna())]
XX.subId.value_counts()

CHECK WHO CHEATED:

In [None]:
#@title with subsequent entries  { form-width: "5%" }
# Scan the entries in windows of 10 consecutive rows and report windows whose gaps between
# successive startTime values are nearly constant (std below 50 ms). Each hit prints the subId
# at the start of the window, the subId at its end and the index label of the first row;
# the cell's value is the number of hits.

dat = data.sub_df.copy()
dat['DIFF'] = dat.startTime.diff()
last_row = dat.shape[0] - 1
c=0
for i in range(1,dat.shape[0],10):
  if dat.DIFF.iloc[i:i+10].std() < timedelta(seconds=0.05):
    window_end = min(i+10, last_row) # the final window may be shorter; never index past the last row
    print(f'{dat.subId.iloc[i]} * {dat.subId.iloc[window_end]} * {dat.subId.index[i]}')
    c+=1
c

In [None]:
#@title with a gap of 2 { form-width: "5%" }
# Same scan as the consecutive-entries check, but over every `jump`-th row: for each of the
# `jump` possible starting offsets, take 10 rows spaced `jump` apart and report the window when
# the std of their DIFF values is below 100 ms. Each hit prints the subId at the start of the
# window, the subId at its end and the index label of the first row; the cell's value is the
# number of hits.

jump = 2
dat = data.sub_df.copy()
dat['DIFF'] = dat.startTime.diff()
last_row = dat.shape[0] - 1
c=0
for i in range(1,dat.shape[0],10):
  for offset in range(jump): # one interleaved sequence per starting offset
    first = i + offset
    if dat.DIFF.iloc[first:first+10*jump:jump].std() < timedelta(seconds=0.1):
      window_end = min(first+10*jump, last_row) # never index past the last row
      print(f'{dat.subId.iloc[first]} - {dat.subId.iloc[window_end]} - {dat.subId.index[first]}')
      c+=1
c

In [None]:
#@title with a gap of 3 { form-width: "5%" }
# Same scan as the consecutive-entries check, but over every `jump`-th row: for each of the
# `jump` possible starting offsets, take 10 rows spaced `jump` apart and report the window when
# the std of their DIFF values is below 100 ms. Each hit prints the subId at the start of the
# window, the subId at its end (first + 10*jump, consistently for every offset) and the index
# label of the first row; the cell's value is the number of hits.

jump = 3
dat = data.sub_df.copy()
dat['DIFF'] = dat.startTime.diff()
last_row = dat.shape[0] - 1
c=0
for i in range(1,dat.shape[0],10):
  for offset in range(jump): # one interleaved sequence per starting offset
    first = i + offset
    if dat.DIFF.iloc[first:first+10*jump:jump].std() < timedelta(seconds=0.1):
      window_end = min(first+10*jump, last_row) # never index past the last row
      print(f'{dat.subId.iloc[first]} - {dat.subId.iloc[window_end]} - {dat.subId.index[first]}')
      c+=1
c

In [None]:
#@title with a gap of 4 { form-width: "5%" }
# Same scan as the consecutive-entries check, but over every `jump`-th row: for each of the
# `jump` possible starting offsets, take 10 rows spaced `jump` apart and report the window when
# the std of their DIFF values is below 100 ms. Each hit prints the subId at the start of the
# window, the subId at its end (first + 10*jump, consistently for every offset) and the index
# label of the first row; the cell's value is the number of hits.

jump = 4
dat = data.sub_df.copy()
dat['DIFF'] = dat.startTime.diff()
last_row = dat.shape[0] - 1
c=0
for i in range(1,dat.shape[0],10):
  for offset in range(jump): # one interleaved sequence per starting offset
    first = i + offset
    if dat.DIFF.iloc[first:first+10*jump:jump].std() < timedelta(seconds=0.1):
      window_end = min(first+10*jump, last_row) # never index past the last row
      print(f'{dat.subId.iloc[first]} - {dat.subId.iloc[window_end]} - {dat.subId.index[first]}')
      c+=1
c

In [None]:
#@title with a gap of 5  { form-width: "5%" }
# Same scan as the consecutive-entries check, but over every `jump`-th row: for each of the
# `jump` possible starting offsets, take 10 rows spaced `jump` apart and report the window when
# the std of their DIFF values is below 100 ms. Each hit prints the subId at the start of the
# window, the subId at its end (first + 10*jump, consistently for every offset) and the index
# label of the first row; the cell's value is the number of hits.

jump = 5
dat = data.sub_df.copy()
dat['DIFF'] = dat.startTime.diff()
last_row = dat.shape[0] - 1
c=0
for i in range(1,dat.shape[0],10):
  for offset in range(jump): # one interleaved sequence per starting offset
    first = i + offset
    if dat.DIFF.iloc[first:first+10*jump:jump].std() < timedelta(seconds=0.1):
      window_end = min(first+10*jump, last_row) # never index past the last row
      print(f'{dat.subId.iloc[first]} - {dat.subId.iloc[window_end]} - {dat.subId.index[first]}')
      c+=1
c

## **REMOVING SUBJECTS 354 and 208**

In [None]:
# Manual post-hoc exclusions applied to the filtered data frame; the last line shows the resulting shape.
data.sub_df = data.sub_df[data.sub_df.subId != 354] # for cheating
data.sub_df = data.sub_df[data.sub_df.subId != 208] # Received one of the manipulations twice (probably an unclear bug)
data.sub_df.shape

In [None]:
# Build the core table with Data.create_core_table (defined elsewhere in this file), passing
# coreDataFileName as its relative path. Later cells read data.core_table, which this call
# presumably populates - TODO confirm.
data.create_core_table(coreTableRelativePath=coreDataFileName)

In [31]:
#@title Save the raw data and filtered DF { form-width: "5%" }
from bson import ObjectId
import json
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes ObjectId and datetime values as their ``str()`` form."""

    def default(self, o):
        """Return ``str(o)`` for ObjectId/datetime; defer anything else to the base encoder."""
        if isinstance(o, (ObjectId, datetime)):
            return str(o)
        return super().default(o)
# Dump the raw data (data.raw) to rawDataFileName as JSON, using the custom encoder defined
# above for the ObjectId and datetime values.
with open(rawDataFileName, 'w') as fout:
    json.dump(data.raw , fout, cls=JSONEncoder)

# # load this data:
# with open(main_path + '/data/extracted_data/XXX.json', "r") as read_file:
#     rawData = json.load(read_file)

# # Write directly from the DB data cursor:
# # create a cursor for everything in range
# cursor = client['nodejs-app'][DB_credentials['collectionName']].find({"$or": [{'subId':{"$gte":minSubID[0],"$lte":maxSubID[0]}}, {'subId':{"$gte":minSubID[1],"$lte":maxSubID[1]}}, {'subId':{"$gte":minSubID[2],"$lte":maxSubID[2]}}]})
# with open(main_path + '/data/extracted_data/XXX.json', 'w') as file:
#     file.write('[')
#     for document in cursor:
#         file.write(json_util.dumps(document))
#         file.write(',')
#     file.write(']')

# ------------------------------------------------------------------------

# Save the filtered data frame (data.sub_df) as CSV to filteredDF_File:
data.sub_df.to_csv(filteredDF_File)


In [None]:
#@title create core_data + Summary Stats
# ------- replacing the core_bar_plots method ---------

# initialize vars (tables):
groups = data.sub_df.group.unique() # the distinct group labels present in the data
# making the data ready for plotting:
# keep only the rows whose time is 'post', drop the now-constant time column and
# index the table by (subID, group, manipulation)
core_data = data.core_table[data.core_table.time == 'post']
core_data = core_data.drop(columns='time')
core_data = core_data.set_index(['subID', 'group', 'manipulation'])


In [None]:
#@title Create a DF with columns of averaged still-valued manipulations (+Summary Stats) {form-width: "1%"}
# (one for the pre and post and one that includes all the relevant ones in the parallel group - i.e. 5 days)
# extract and organize: move the 'manipulation' index level into the columns and drop the
# outer column level so that the columns are the manipulation names
core_data_with_averaged_still_valued = core_data.unstack('manipulation')
core_data_with_averaged_still_valued.columns = core_data_with_averaged_still_valued.columns.droplevel()
# add new vars (row-wise means over the listed still-valued columns)
core_data_with_averaged_still_valued['mean_still_valued'] = core_data_with_averaged_still_valued.loc[:, ['still_valued', 'still_valued_post_deval']].mean(axis=1)
core_data_with_averaged_still_valued['mean_still_valued_all'] = core_data_with_averaged_still_valued.loc[:, ['still_valued', 'still_valued_post_deval', 'still_valued_post_deval_week1', 'still_valued_replacing_devaluation', 'still_valued_week1']].mean(axis=1, skipna=True)
# construct and arrange to be in the same structure as core_data:
# drop the individual still-valued columns, then stack the remaining manipulation columns
# back into the index and name the single value column 'n_entries'
core_data_with_averaged_still_valued = core_data_with_averaged_still_valued.drop(columns=['still_valued', 'still_valued_post_deval', 'still_valued_post_deval_week1', 'still_valued_replacing_devaluation', 'still_valued_week1'])
core_data_with_averaged_still_valued = core_data_with_averaged_still_valued.reset_index().set_index(['subID', 'group']).stack('manipulation').to_frame()
core_data_with_averaged_still_valued.columns = ['n_entries']


In [None]:
#@title Create a DF with columns of DIFFERENCE measures {form-width: "1%"}
# Builds core_data_with_diffs: one row per (subID, group), one column per manipulation,
# extended below with means, square roots, differences, relative differences and percent changes.

core_data_with_diffs = core_data.unstack('manipulation')
core_data_with_diffs.columns = core_data_with_diffs.columns.droplevel()

# add vars:
# add mean valued data:
core_data_with_diffs['mean_still_valued'] = core_data_with_diffs.loc[:,['still_valued', 'still_valued_post_deval']].mean(axis=1)
core_data_with_diffs['mean_still_valued_all'] = core_data_with_diffs.loc[:,['still_valued',	'still_valued_post_deval',	'still_valued_post_deval_week1',	'still_valued_replacing_devaluation',	'still_valued_week1']].mean(axis=1, skipna=True)
# add square root data (to give less weight for high values)
core_data_with_diffs['devaluation_SQRT'] = np.sqrt(core_data_with_diffs['devaluation'])
core_data_with_diffs['still_valued_SQRT'] = np.sqrt(core_data_with_diffs['still_valued'])
core_data_with_diffs['still_valued_post_deval_SQRT'] = np.sqrt(core_data_with_diffs['still_valued_post_deval'])
core_data_with_diffs['mean_still_valued_SQRT'] = np.sqrt(core_data_with_diffs['mean_still_valued'])
core_data_with_diffs['mean_still_valued_all_SQRT'] = np.sqrt(core_data_with_diffs['mean_still_valued_all'])

# add diff vars:
# (diff(axis=1) over a two-column selection leaves "second column minus first column" in the second column)
core_data_with_diffs['postVal_minus_deval'] = core_data_with_diffs.loc[:,['devaluation', 'still_valued_post_deval']].diff(axis=1).loc[:,'still_valued_post_deval']
core_data_with_diffs['stillVal_minus_deval'] = core_data_with_diffs.loc[:,['devaluation', 'still_valued']].diff(axis=1).loc[:,'still_valued']
core_data_with_diffs['stillVal_minus_postVal'] = core_data_with_diffs.loc[:,['still_valued_post_deval', 'still_valued']].diff(axis=1).loc[:,'still_valued']
# relative differences: (a - b) / (a + b), computed row by row
core_data_with_diffs['preVal_relativeDiff_deval'] = core_data_with_diffs.apply(lambda x: (x['still_valued'] - x['devaluation'])/(x['devaluation'] + x['still_valued']), axis=1)
core_data_with_diffs['postVal_relativeDiff_deval'] = core_data_with_diffs.apply(lambda x: (x['still_valued_post_deval'] - x['devaluation'])/(x['devaluation'] + x['still_valued_post_deval']), axis=1)
core_data_with_diffs['preVal_relativeDiff_deval_SQRT'] = core_data_with_diffs.apply(lambda x: (x['still_valued_SQRT'] - x['devaluation_SQRT'])/(x['devaluation_SQRT'] + x['still_valued_SQRT']), axis=1)
core_data_with_diffs['postVal_relativeDiff_deval_SQRT'] = core_data_with_diffs.apply(lambda x: (x['still_valued_post_deval_SQRT'] - x['devaluation_SQRT'])/(x['devaluation_SQRT'] + x['still_valued_post_deval_SQRT']), axis=1)

# the same difference / relative-difference measures, using the averaged still-valued columns
core_data_with_diffs['meanVal_minus_deval'] = core_data_with_diffs.loc[:,['devaluation', 'mean_still_valued']].diff(axis=1).loc[:,'mean_still_valued']
core_data_with_diffs['meanAllVal_minus_deval'] = core_data_with_diffs.loc[:,['devaluation', 'mean_still_valued_all']].diff(axis=1).loc[:,'mean_still_valued_all']
core_data_with_diffs['meanVal_minus_deval_SQRT'] = core_data_with_diffs.loc[:,['devaluation_SQRT', 'mean_still_valued_SQRT']].diff(axis=1).loc[:,'mean_still_valued_SQRT']
core_data_with_diffs['meanAllVal_minus_deval_SQRT'] = core_data_with_diffs.loc[:,['devaluation_SQRT', 'mean_still_valued_all_SQRT']].diff(axis=1).loc[:,'mean_still_valued_all_SQRT']
core_data_with_diffs['meanVal_relativeDiff_deval'] = core_data_with_diffs.apply(lambda x: (x['mean_still_valued'] - x['devaluation'])/(x['devaluation'] + x['mean_still_valued']), axis=1)
core_data_with_diffs['meanAllVal_relativeDiff_deval'] = core_data_with_diffs.apply(lambda x: (x['mean_still_valued_all'] - x['devaluation'])/(x['devaluation'] + x['mean_still_valued_all']), axis=1)
core_data_with_diffs['meanVal_relativeDiff_deval_SQRT'] = core_data_with_diffs.apply(lambda x: (x['mean_still_valued_SQRT'] - x['devaluation_SQRT'])/(x['devaluation_SQRT'] + x['mean_still_valued_SQRT']), axis=1)
core_data_with_diffs['meanAllVal_relativeDiff_deval_SQRT'] = core_data_with_diffs.apply(lambda x: (x['mean_still_valued_all_SQRT'] - x['devaluation_SQRT'])/(x['devaluation_SQRT'] + x['mean_still_valued_all_SQRT']), axis=1)

# percent change relative to the still_valued column
core_data_with_diffs['prcChange_preToDeval'] = core_data_with_diffs.apply(lambda x: ((x['devaluation'] - x['still_valued'])/x['still_valued'])*100, axis=1)
core_data_with_diffs['prcChange_preToPost'] = core_data_with_diffs.apply(lambda x: ((x['still_valued_post_deval'] - x['still_valued'])/x['still_valued'])*100, axis=1)

# relative difference between still_valued and still_valued_post_deval
core_data_with_diffs['preVal_relativeDiff_postVal'] = core_data_with_diffs.apply(lambda x: (x['still_valued'] - x['still_valued_post_deval'])/(x['still_valued_post_deval'] + x['still_valued']), axis=1)
core_data_with_diffs['preVal_relativeDiff_postVal_SQRT'] = core_data_with_diffs.apply(lambda x: (x['still_valued_SQRT'] - x['still_valued_post_deval_SQRT'])/(x['still_valued_post_deval_SQRT'] + x['still_valued_SQRT']), axis=1)

# Interactive choice: only the exact answer 'y' replaces the NaN entries of every column whose
# name contains 'relativeDiff' with 0. The affected entries are printed before being overwritten.
if input("Should RelativeDiff values for participants with 0 entries after all manipulatuions turned to 0 (rather than nan)? [y/N] ") == 'y':
  print('Turn these values to zeros')
  columnsWithRelativeDiff = core_data_with_diffs.columns[['relativeDiff' in x for x in core_data_with_diffs.columns]]
  for col in columnsWithRelativeDiff:
    print(core_data_with_diffs.loc[core_data_with_diffs[col].isna(),col])
    core_data_with_diffs.loc[core_data_with_diffs[col].isna(),col] = 0

#Assemble entries variability measures
# get number of entries in each day (one row per subId, one column per day)
total_entries_per_day = pd.DataFrame(data.sub_df.groupby(by=['subId', 'group', 'day']).size().unstack(fill_value=np.nan).stack()).rename(
    columns={0: 'n_entries'}).reset_index().pivot(index='subId', columns=['day'], values='n_entries')

# remove data for days that shouldn't be in the calculation
# (keep the day columns up to n_experimental_days['long_training']; blank out the devaluation day of each
#  subject-ID range and, for the first ID range, every day after n_experimental_days['short_training'])
total_entries_per_day = total_entries_per_day.loc[:,:n_experimental_days['long_training']]
total_entries_per_day.loc[minSubID[0]:maxSubID[0],main_manipulation_days['short_training']['devaluation']] = None
total_entries_per_day.loc[minSubID[0]:maxSubID[0],(n_experimental_days['short_training']+1):] = None
total_entries_per_day.loc[minSubID[1]:maxSubID[2],main_manipulation_days['long_training']['devaluation']] = None

# add the relevant data
# NOTE(review): `.values` attaches these per-subject statistics by row POSITION, not by subject ID -
# this assumes core_data_with_diffs and total_entries_per_day hold the same subjects in the same order; verify.
core_data_with_diffs = core_data_with_diffs.assign(STD_entriesAcrossNoDevalDays=total_entries_per_day.std(axis=1,skipna=True).values)
core_data_with_diffs = core_data_with_diffs.assign(SEM_entriesAcrossNoDevalDays=total_entries_per_day.sem(axis=1,skipna=True).values)

# std and variance divided by the per-subject mean
core_data_with_diffs = core_data_with_diffs.assign(NormedSTD_entriesAcrossNoDevalDays=total_entries_per_day.std(axis=1,skipna=True).values / total_entries_per_day.mean(axis=1,skipna=True).values)
core_data_with_diffs = core_data_with_diffs.assign(NormedVar_entriesAcrossNoDevalDays=total_entries_per_day.var(axis=1,skipna=True).values / total_entries_per_day.mean(axis=1,skipna=True).values)

# Add first day entries and entries throughout the experiment
core_data_with_diffs = core_data_with_diffs.assign(firstDayEntries=total_entries_per_day[1].values)
core_data_with_diffs = core_data_with_diffs.assign(allEntries=total_entries_per_day.sum(axis=1,skipna=True).values)

# -------------------------- Adding similar stuff but after excluding all manipulation days: -----------------------------

# get number of entries in each day (same construction as total_entries_per_day)
total_entries_per_day2 = pd.DataFrame(data.sub_df.groupby(by=['subId', 'group', 'day']).size().unstack(fill_value=np.nan).stack()).rename(
    columns={0: 'n_entries'}).reset_index().pivot(index='subId', columns=['day'], values='n_entries')

# remove data for days that shouldn't be in the calculation
# NOTE(review): the subject-ID ranges and day numbers below are hard-coded (the previous section uses
# minSubID/maxSubID, n_experimental_days and main_manipulation_days instead) - confirm they match the design.
total_entries_per_day2 = total_entries_per_day2.loc[:,:11]
total_entries_per_day2.loc[100:199,2:] = None
total_entries_per_day2.loc[200:400,9:] = None
total_entries_per_day2.loc[300:400,2:4] = None

# add the relevant data
# NOTE(review): as in the previous section, `.values` attaches the statistics by row position, not by subject ID.
core_data_with_diffs = core_data_with_diffs.assign(STD_entriesAcrossNoManipulationDays=total_entries_per_day2.std(axis=1,skipna=True).values)
core_data_with_diffs = core_data_with_diffs.assign(SEM_entriesAcrossNoManipulationDays=total_entries_per_day2.sem(axis=1,skipna=True).values)

# std and variance divided by the per-subject mean
core_data_with_diffs = core_data_with_diffs.assign(NormedSTD_entriesAcrossNoManipulationDays=total_entries_per_day2.std(axis=1,skipna=True).values / total_entries_per_day2.mean(axis=1,skipna=True).values)
core_data_with_diffs = core_data_with_diffs.assign(NormedVar_entriesAcrossNoManipulationDays=total_entries_per_day2.var(axis=1,skipna=True).values / total_entries_per_day2.mean(axis=1,skipna=True).values)

# Add the total number of entries across the retained (non-manipulation) days
core_data_with_diffs = core_data_with_diffs.assign(allEntriesAcrossNoManipulationDay=total_entries_per_day2.sum(axis=1,skipna=True).values)


## Assemble data for ITI (self-formed sessions) and RTs

In [35]:
#@title Assemble the ITI and RTs data { form-width: "5%" }
# Builds sub_df_extended: a copy of data.sub_df sorted by subject and startTime, extended with the gap
# between successive entries, a session-start flag and several within-trial durations in seconds.

# parameters:
timeOfEntranceAnim = 800 # time in ms it should take for the sequence-pressing screen to appear

seperatingMicroSessionsTime = 300 # namely if there is 300 seconds apart between subsequent startTime's it will be considered a new session.
# helpers:
getSecsFunc = np.vectorize(lambda x: x.total_seconds()) # element-wise .total_seconds()
# ------------------------------------------------------------------------------
# Create a new sub_df and add new measures:
sub_df_extended = data.sub_df.copy().sort_values(['subId','startTime']) # ******** TO MAKE SURE STARTTIMES ARE IN THE CORRECT ORDER
sub_df_extended = sub_df_extended[sub_df_extended.endExperiment!=True] # remove irrelevant days
# np.diff yields one value fewer than there are rows; a leading None is inserted so the lengths match
sub_df_extended['deltaStartTime']=np.insert(np.diff(sub_df_extended.startTime),0,None) # add difference between startTime
sub_df_extended['deltaSecStartTime']=np.insert(getSecsFunc(np.diff(sub_df_extended.startTime)),0,None) # add difference between startTime (in seconds)
sub_df_extended['sessionsBeginning'] = False

# idxmax() on the boolean mask gives the index label of each subject's first row; the gap stored there
# was measured against the previous subject's last entry, so it is cleared and the row starts a session
for sub in sub_df_extended.subId.unique(): # set stuff for the beginning of each subject
  sub_df_extended.loc[(sub_df_extended.subId==sub).idxmax(), 'deltaStartTime'] = None
  sub_df_extended.loc[(sub_df_extended.subId==sub).idxmax(), 'deltaSecStartTime'] = None
  sub_df_extended.loc[(sub_df_extended.subId==sub).idxmax(), 'sessionsBeginning'] = True

sub_df_extended.loc[sub_df_extended.deltaSecStartTime > seperatingMicroSessionsTime , 'sessionsBeginning'] = True # Add session beginning (except for the ones on day beginning)

# within-trial durations; the "entry" reference point is startTime + dataLoadingTime (ms) + timeOfEntranceAnim (ms)
sub_df_extended['secsToPress1'] = sub_df_extended.apply(lambda x: (x.press1Time - (x.startTime + timedelta(milliseconds = x.dataLoadingTime) + timedelta(milliseconds = timeOfEntranceAnim))).total_seconds(), axis=1)
sub_df_extended['secsToPress2'] = sub_df_extended.apply(lambda x: (x.press2Time - x.press1Time).total_seconds(), axis=1)
sub_df_extended['secsToExitAfterOutcomeTime'] = sub_df_extended.apply(lambda x: (x.userExitOrUnloadTime - x.outcomeTime).total_seconds(), axis=1)

# the same end points, measured from the entry reference point
sub_df_extended['secsToPress2fromEntry'] = sub_df_extended.apply(lambda x: (x.press2Time - (x.startTime + timedelta(milliseconds = x.dataLoadingTime) + timedelta(milliseconds = timeOfEntranceAnim))).total_seconds(), axis=1)
sub_df_extended['secsToExitAfterOutcomeTimefromEntry'] = sub_df_extended.apply(lambda x: (x.userExitOrUnloadTime - (x.startTime + timedelta(milliseconds = x.dataLoadingTime) + timedelta(milliseconds = timeOfEntranceAnim))).total_seconds(), axis=1)


In [None]:
#@title Assemble a summary table of ITI and RTs data + per day tables { form-width: "5%" }

for i, sub in enumerate(sub_df_extended.subId.unique()):
  # Assemble data
  # --------------
  # get relevant days and calculate time differences between entries:
  sub_data = sub_df_extended[sub_df_extended.subId==sub]
  # Remove first entry from each to ignore the large time difference over night
  x = sub_data.copy()
  for day in x.day.unique():
    x = x.drop(index = x[x.day==day].index[0])

  # -------------- construct DFs of daily data of session --------------------
  sub_n_daily_sessions=(x[x.deltaSecStartTime>300].groupby(['subId','group','day']).count().deltaSecStartTime + 1).reset_index('day').astype({"day": int}).pivot(columns='day')
  sub_n_daily_sessions.columns = sub_n_daily_sessions.columns.droplevel(0)

  sub_n_daily_entries=(sub_data.groupby(['subId','group','day']).count().startTime).reset_index('day').astype({"day": int}).pivot(columns='day')
  sub_n_daily_entries.columns = sub_n_daily_entries.columns.droplevel(0)

  # -------------- Calculate and Add new important measures --------------------
  x_devalDropped = x[x.manipulationToday != 'devaluation']
  x_allManipulationsDropped = x[x.manipulationToday.isna()]

  # Define the number of session as a measure of entries that came after more than 5 minutes from the last entry:
  n_sessions = len(x_devalDropped.deltaSecStartTime[x_devalDropped.deltaSecStartTime>300]) + (n_experimental_days[x_devalDropped.iloc[0].group]-1) # for the beginning of each day (except devaluation day);
  if x.group.iloc[0] == 'long_training_parallel_manipulations':
    n_sessions_no_manipulations = len(x_allManipulationsDropped.deltaSecStartTime[x_allManipulationsDropped.deltaSecStartTime>300]) + (n_experimental_days[x_allManipulationsDropped.iloc[0].group]-6) # for the beginning of each day (except manipulation days);
  else:
    n_sessions_no_manipulations = len(x_allManipulationsDropped.deltaSecStartTime[x_allManipulationsDropped.deltaSecStartTime>300]) + (n_experimental_days[x_allManipulationsDropped.iloc[0].group]-3) # for the beginning of each day (except manipulation days);

  # Define the average sessions per day
  averageSessionsPerDay = n_sessions/(n_experimental_days[x_devalDropped.iloc[0].group]-1)
  if x.group.iloc[0] == 'long_training_parallel_manipulations':
    averageSessionsPerDay_no_manipulations = n_sessions_no_manipulations/(n_experimental_days[x_allManipulationsDropped.iloc[0].group]-6)
  else:
    averageSessionsPerDay_no_manipulations = n_sessions_no_manipulations/(n_experimental_days[x_allManipulationsDropped.iloc[0].group]-3)

  # Define the average entries per session
  averageEntriesInSession = (len(x_devalDropped.deltaSecStartTime) + (n_experimental_days[x_devalDropped.iloc[0].group]-1)) / n_sessions
  if x.group.iloc[0] == 'long_training_parallel_manipulations':
    averageEntriesInSession_no_manipulations = (len(x_allManipulationsDropped.deltaSecStartTime) + (n_experimental_days[x_allManipulationsDropped.iloc[0].group]-6)) / n_sessions_no_manipulations
  else:
    averageEntriesInSession_no_manipulations = (len(x_allManipulationsDropped.deltaSecStartTime) + (n_experimental_days[x_allManipulationsDropped.iloc[0].group]-3)) / n_sessions_no_manipulations

  # Create a table with the different measures
  summaryTable = x_devalDropped.deltaSecStartTime.describe()
  summaryTable['MAD'] = stats.median_abs_deviation(x_devalDropped.deltaSecStartTime, nan_policy='omit')
  summaryTable['Skew'] = x_devalDropped.deltaSecStartTime.skew()
  summaryTable['NP_Skew'] = (summaryTable['mean'] - summaryTable['50%']) / summaryTable['std']
  summaryTable['Norm_spread'] = summaryTable['std'] / summaryTable['mean']
  summaryTable['n_sessions'] = n_sessions
  summaryTable['n_sessionsPerDay'] = averageSessionsPerDay
  summaryTable['avgSessionEntries'] = averageEntriesInSession
  summaryTable['n_sessions_no_manipulations'] = n_sessions_no_manipulations
  summaryTable['n_sessionsPerDay_no_manipulations'] = averageSessionsPerDay_no_manipulations
  summaryTable['avgSessionEntries_no_manipulations'] = averageEntriesInSession_no_manipulations

  # Add measures of number of session and number of entries per session for after each manipulation
  for manipulation in sub_data.manipulationToday.unique()[sub_data.manipulationToday.unique()!=None]:
    # sessions after manipulations
    postManpulationTrialsDeltaSec = x.deltaSecStartTime[(x.manipulationToday == manipulation) & (x.isUnderManipulation)]
    summaryTable['n_sessions_' + manipulation] = len(postManpulationTrialsDeltaSec[postManpulationTrialsDeltaSec>300]) + ((postManpulationTrialsDeltaSec.iloc[0]<300) if len(postManpulationTrialsDeltaSec) else 0) # The last term is to consider as a session entries right after the manipulation
    summaryTable['avgSessionEntries_' + manipulation] = len(postManpulationTrialsDeltaSec) / summaryTable['n_sessions_' + manipulation]

  # assemble within-trial time variables:
  # --------------------------------------------
  summaryTable['MEANsecsToPress1'] = sub_data.secsToPress1.mean()
  summaryTable['MEANsecsToPress2'] = sub_data.secsToPress2.mean()
  summaryTable['MEANsecsToExitAfterOutcomeTime'] = sub_data[(sub_data.activateManipulation != True) & (sub_data.secsToExitAfterOutcomeTime<=60)].secsToExitAfterOutcomeTime.mean() # the latter is to exclude unreasonably higher values
  summaryTable['MEANsecsToPress2fromEntry'] = sub_data.secsToPress2fromEntry.mean()
  summaryTable['MEANsecsToExitAfterOutcomeTimefromEntry'] = sub_data[(sub_data.activateManipulation != True) & (sub_data.secsToExitAfterOutcomeTime<=60)].secsToExitAfterOutcomeTimefromEntry.mean() # the latter is to exclude unreasonably higher values
  
  for manipulation in sub_data.manipulationToday.unique()[sub_data.manipulationToday.unique()!=None]:
    # get the same RT data but only for post manipulations
    relevantData = sub_data[(sub_data.manipulationToday == manipulation) & (sub_data.isUnderManipulation)]
    summaryTable['Manipulation_MEANsecsToPress1_' + manipulation] = relevantData.secsToPress1.mean()
    summaryTable['Manipulation_MEANsecsToPress2_' + manipulation] = relevantData.secsToPress2.mean()
    summaryTable['Manipulation_MEANsecsToExitAfterOutcomeTime_' + manipulation] = relevantData[relevantData.secsToExitAfterOutcomeTime<=60].secsToExitAfterOutcomeTime.mean() # the latter is to exclude unreasonably higher values
    summaryTable['Manipulation_MEANsecsToPress2fromEntry_' + manipulation] = relevantData.secsToPress2fromEntry.mean()
    summaryTable['Manipulation_MEANsecsToExitAfterOutcomeTimefromEntry_' + manipulation] = relevantData[relevantData.secsToExitAfterOutcomeTime<=60].secsToExitAfterOutcomeTimefromEntry.mean() # the latter is to exclude unreasonably higher values
    # get the same RT of the differences between before and after manipulations
    relevantData2 = sub_data[(sub_data.manipulationToday == manipulation) & (sub_data.isUnderManipulation==False)]
    summaryTable['After_minus_Before_Manipulation_MEANsecsToPress1_' + manipulation] = relevantData.secsToPress1.mean() - relevantData2.secsToPress1.mean()
    summaryTable['After_minus_Before_Manipulation_MEANsecsToPress2_' + manipulation] = relevantData.secsToPress2.mean() - relevantData2.secsToPress2.mean()
    summaryTable['After_minus_Before_Manipulation_MEANsecsToExitAfterOutcomeTime_' + manipulation] = relevantData[relevantData.secsToExitAfterOutcomeTime<=60].secsToExitAfterOutcomeTime.mean() - relevantData2[(relevantData2.activateManipulation != True) & (relevantData2.secsToExitAfterOutcomeTime<=60)].secsToExitAfterOutcomeTime.mean() # the latter is to exclude unreasonably higher values
    summaryTable['After_minus_Before_Manipulation_MEANsecsToPress2fromEntry_' + manipulation] = relevantData.secsToPress2fromEntry.mean() - relevantData2.secsToPress2fromEntry.mean()
    summaryTable['After_minus_Before_Manipulation_MEANsecsToExitAfterOutcomeTimefromEntry_' + manipulation] = relevantData[relevantData.secsToExitAfterOutcomeTime<=60].secsToExitAfterOutcomeTimefromEntry.mean() - relevantData2[(relevantData2.activateManipulation != True) & (relevantData2.secsToExitAfterOutcomeTime<=60)].secsToExitAfterOutcomeTimefromEntry.mean() # the latter is to exclude unreasonably higher values

  # get RT data per day
  for day in sub_data.day.unique():
    day = int(day)
    summaryTable['MEANsecsToPress1_day' + str(day)] = sub_data[sub_data.day==day].secsToPress1.mean()
    summaryTable['MEANsecsToPress2_day' + str(day)] = sub_data[sub_data.day==day].secsToPress2.mean()
    summaryTable['MEANsecsToExitAfterOutcomeTime_day' + str(day)] = sub_data[(sub_data.day==day) & (sub_data.activateManipulation != True) & (sub_data.secsToExitAfterOutcomeTime<=60)].secsToExitAfterOutcomeTime.mean() # the latter is to exclude unreasonably higher values
    summaryTable['MEANsecsToPress2fromEntry_day' + str(day)] = sub_data[sub_data.day==day].secsToPress2fromEntry.mean()
    summaryTable['MEANsecsToExitAfterOutcomeTimefromEntry_day' + str(day)] = sub_data[(sub_data.day==day) & (sub_data.activateManipulation != True) & (sub_data.secsToExitAfterOutcomeTime<=60)].secsToExitAfterOutcomeTimefromEntry.mean()


  # Gather delta's data together:
  summaryTable['subId'] = sub
  if i == 0:
    timeDeltaMeasures = summaryTable
    nSessionByDay = sub_n_daily_sessions
    nDailyEntries = sub_n_daily_entries
  else:
    timeDeltaMeasures = pd.concat([timeDeltaMeasures, summaryTable], axis=1)
    nSessionByDay = nSessionByDay.append(sub_n_daily_sessions)
    nDailyEntries = nDailyEntries.append(sub_n_daily_entries)

# Turn the per-subject summaries into one row per subject, indexed by subId
timeDeltaMeasures = timeDeltaMeasures.T.set_index('subId')
timeDeltaMeasures.index = timeDeltaMeasures.index.astype('int') # the index arrives as float; make it int

# Move 'group' from the index into a regular column of both daily tables
nSessionByDay = nSessionByDay.reset_index('group')
nDailyEntries = nDailyEntries.reset_index('group')

# Subjects absent from nSessionByDay (per the original note: those who had only one session
# on all days) get an all-NaN row that carries only their group.
missing_subjects = sorted(set(sub_df_extended.subId.unique()).difference(nSessionByDay.index))
for missed_sub in missing_subjects:
  rowToAdd = pd.DataFrame(np.nan, index=[missed_sub], columns=nSessionByDay.columns)
  rowToAdd['group'] = sub_df_extended.loc[sub_df_extended.subId == missed_sub, 'group'].iloc[0]
  rowToAdd.index.name = 'subId'
  nSessionByDay = pd.concat([nSessionByDay, rowToAdd]).sort_index()

# Put 1 in every NaN cell that falls within a group's experimental days
# (a NaN there means that there was one session)
for group, _n_days in n_experimental_days.items():
  for c in range(1, _n_days + 1):
    _single_session = (nSessionByDay['group'] == group) & nSessionByDay[c].isna()
    nSessionByDay.loc[_single_session, c] = 1

# Daily average entries per session = daily entries / daily sessions, for day columns 1..last
avgEntriesPerSessionByDay = nDailyEntries.copy()
_entry_day_cols = list(range(1, avgEntriesPerSessionByDay.columns[-1] + 1))
_session_day_cols = list(range(1, nSessionByDay.columns[-1] + 1))
avgEntriesPerSessionByDay.loc[:, _entry_day_cols] = (
  avgEntriesPerSessionByDay.loc[:, _entry_day_cols] / nSessionByDay.loc[:, _session_day_cols]
)


# Optionally replace NaN with 0 in every timeDeltaMeasures column whose name contains 'avgSessionEntries_'
_zero_fill_answer = input("Should avg. entries per session after manipulations turned to 0 instead of NA when there where no sessions? [y/N] ")
if _zero_fill_answer == 'y':
  print('Turn these values to zeros')
  _is_avg_entries_col = ['avgSessionEntries_' in name for name in timeDeltaMeasures.columns]
  columnsWithManipulationsAvgSessionEntries = timeDeltaMeasures.columns[_is_avg_entries_col]
  for col in columnsWithManipulationsAvgSessionEntries:
    _missing = timeDeltaMeasures[col].isna()
    timeDeltaMeasures.loc[_missing, col] = 0

print('\n********************************\n created: timeDeltaMeasures + nSessionByDay and avgEntriesPerSessionByDay tables\n********************************')
print('Note: for the n_sessions, averageSessionsPerDay, and averageEntriesInSession (one score per participants) the entire day of devaluation is excluded.')
print('Note: for the n_sessions_no_manipulations, averageSessionsPerDay_no_manipulations, and averageEntriesInSession_no_manipulations (one score per participants) the entire manipulations days are excluded.')

## Make a summary table of all data together

In [37]:
# Work on a copy so timeDeltaMeasures itself keeps its original column names
timeDeltaMeasures_temp = timeDeltaMeasures.copy()
# Prefix every measure with 'timeDelta_' and label both axes
timeDeltaMeasures_temp.columns = pd.Index(
  ['timeDelta_' + name for name in timeDeltaMeasures_temp.columns],
  name='manipulation',
)
timeDeltaMeasures_temp.index = timeDeltaMeasures_temp.index.rename('subID')

# Put the core data (with 'group' moved to a column) side by side with the time-delta measures,
# then index the combined table by (subID, group)
_core_by_subject = core_data_with_diffs.reset_index('group')
all_data_for_R = pd.concat([_core_by_subject, timeDeltaMeasures_temp], axis=1)
all_data_for_R = all_data_for_R.reset_index()
all_data_for_R = all_data_for_R.set_index(['subID','group'])

In [None]:
#@title add n_session and avgSessionEntries diff measures { form-width: "5%" }

def add_session_diff_measures(table, entryMeasure, zeroFillRelativeDiff=None):
  """Add devaluation-contrast columns for one entry measure to `table`.

  `table` must already contain the three columns
  'timeDelta_<entryMeasure>_still_valued', 'timeDelta_<entryMeasure>_devaluation'
  and 'timeDelta_<entryMeasure>_still_valued_post_deval'. From them this adds:
    * the mean of the two still-valued columns,
    * square roots of all four (to give less weight to high values),
    * plain differences (valued minus devalued), and
    * relative differences (valued - devalued) / (valued + devalued), which are
      NaN wherever the denominator is 0.

  Args:
    table: pandas DataFrame; it is modified in place.
    entryMeasure: name of the measure, e.g. 'n_sessions' or 'avgSessionEntries'.
    zeroFillRelativeDiff: True/False decides without prompting whether NaNs in
      every column of `table` whose name contains 'relativeDiff' are replaced
      by 0. None (the default) asks the user through input(), as before.

  Returns:
    The same `table` object, with the new columns added.
  """
  still_val_Measure = 'timeDelta_' + entryMeasure + '_still_valued'
  deval_Measure = 'timeDelta_' + entryMeasure + '_devaluation'
  still_val_post_deval_Measure = 'timeDelta_' + entryMeasure + '_still_valued_post_deval'
  mean_still_val_Measure = 'mean_' + still_val_Measure
  # NOTE: this one name has no underscore before 'SQRT' (unlike the other *_SQRT columns);
  # it is kept as is because the column name is part of the produced table.
  mean_still_val_SQRT = mean_still_val_Measure + 'SQRT'

  def relative_diff(valued_col, devalued_col):
    """(valued - devalued) / (valued + devalued); NaN where the sum is 0."""
    total = table[devalued_col] + table[valued_col]
    return ((table[valued_col] - table[devalued_col]) / total).where(total != 0)

  # add vars:
  # add mean valued data:
  table[mean_still_val_Measure] = table[[still_val_Measure, still_val_post_deval_Measure]].mean(axis=1)
  # add square root data (to give less weight for high values)
  table[deval_Measure + '_SQRT'] = np.sqrt(table[deval_Measure])
  table[still_val_Measure + '_SQRT'] = np.sqrt(table[still_val_Measure])
  table[still_val_post_deval_Measure + '_SQRT'] = np.sqrt(table[still_val_post_deval_Measure])
  table[mean_still_val_SQRT] = np.sqrt(table[mean_still_val_Measure])

  # add diff vars (columns are created in the same order as before, so the table layout is unchanged):
  table[entryMeasure + '_postVal_minus_deval'] = table[still_val_post_deval_Measure] - table[deval_Measure]
  table[entryMeasure + '_stillVal_minus_deval'] = table[still_val_Measure] - table[deval_Measure]
  table[entryMeasure + '_preVal_relativeDiff_deval'] = relative_diff(still_val_Measure, deval_Measure)
  table[entryMeasure + '_postVal_relativeDiff_deval'] = relative_diff(still_val_post_deval_Measure, deval_Measure)
  table[entryMeasure + '_preVal_relativeDiff_deval_SQRT'] = relative_diff(still_val_Measure + '_SQRT', deval_Measure + '_SQRT')
  table[entryMeasure + '_postVal_relativeDiff_deval_SQRT'] = relative_diff(still_val_post_deval_Measure + '_SQRT', deval_Measure + '_SQRT')

  table[entryMeasure + '_meanVal_minus_deval'] = table[mean_still_val_Measure] - table[deval_Measure]
  table[entryMeasure + '_meanVal_minus_deval_SQRT'] = table[mean_still_val_SQRT] - table[deval_Measure + '_SQRT']
  table[entryMeasure + '_meanVal_relativeDiff_deval'] = relative_diff(mean_still_val_Measure, deval_Measure)
  table[entryMeasure + '_meanVal_relativeDiff_deval_SQRT'] = relative_diff(mean_still_val_SQRT, deval_Measure + '_SQRT')

  # Only prompt when the caller did not decide explicitly
  if zeroFillRelativeDiff is None:
    zeroFillRelativeDiff = input("Should RelativeDiff values for participants with 0 entries after all manipulatuions turned to 0 (rather than nan)? [y/N] ") == 'y'
  if zeroFillRelativeDiff:
    print('Turn these values to zeros')
    # applies to every 'relativeDiff' column in the table, not only the ones added by this call
    columnsWithRelativeDiff = table.columns[['relativeDiff' in x for x in table.columns]]
    for col in columnsWithRelativeDiff:
      table.loc[table[col].isna(),col] = 0

  return table

# Derive the diff measures for both entry measures, in the same order as before
for _entry_measure in ('n_sessions', 'avgSessionEntries'):
  all_data_for_R = add_session_diff_measures(table=all_data_for_R, entryMeasure=_entry_measure)

In [39]:
#@title { form-width: "5%" }
#@markdown Assemble and add entries variability measures for n_sessions and avgSessionEntries

def _across_day_variability(by_day, prefix):
  """Row-wise STD, SEM, STD/mean and variance/mean of `by_day`, keyed by the output column names."""
  row_std = by_day.std(axis=1,skipna=True).values
  row_sem = by_day.sem(axis=1,skipna=True).values
  row_var = by_day.var(axis=1,skipna=True).values
  row_mean = by_day.mean(axis=1,skipna=True).values
  return {
    prefix + '_STD_entriesAcrossNoDevalDays': row_std,
    prefix + '_SEM_entriesAcrossNoDevalDays': row_sem,
    prefix + '_NormedSTD_entriesAcrossNoDevalDays': row_std / row_mean,
    prefix + '_NormedVar_entriesAcrossNoDevalDays': row_var / row_mean,
  }

# Devaluation day of each training schedule; that day is blanked out before computing variability
_short_deval_day = main_manipulation_days['short_training']['devaluation']
_long_deval_day = main_manipulation_days['long_training']['devaluation']

# Daily session counts without the 'group' column
n_sessions_by_day = nSessionByDay.drop(columns=['group'])
# remove data for days that shouldn't be in the calculation
# (index labels 100-199 use the short-training devaluation day, 200-400 the long-training one)
n_sessions_by_day.loc[100:199, _short_deval_day] = None
n_sessions_by_day.loc[200:400, _long_deval_day] = None

# add the variability columns, then the first-day value and the total across the experiment;
# values are attached positionally (.values), i.e. in the row order of the by-day table
all_data_for_R = all_data_for_R.assign(
  **_across_day_variability(n_sessions_by_day, 'n_sessions'),
  firstDaySessions=n_sessions_by_day[1].values,
  allSessions=n_sessions_by_day.sum(axis=1,skipna=True).values,
)

# ------------------------------------------------------------------------------

# Daily average entries per session without the 'group' column
avgSessionEntries_by_day = avgEntriesPerSessionByDay.drop(columns=['group'])
# remove data for days that shouldn't be in the calculation
avgSessionEntries_by_day.loc[100:199, _short_deval_day] = None
avgSessionEntries_by_day.loc[200:400, _long_deval_day] = None

# add the variability columns and the first-day value
all_data_for_R = all_data_for_R.assign(
  **_across_day_variability(avgSessionEntries_by_day, 'avgSessionEntries'),
  firstDayAvgSessionEntries=avgSessionEntries_by_day[1].values,
)
# the across-experiment value is the already existing timeDelta_avgSessionEntries column
all_data_for_R['allAvgSessionEntries'] = all_data_for_R['timeDelta_avgSessionEntries']




## Put data together

In [40]:
# add what's added to the core_data_with_diffs and then save
# Append only those core_data_with_diffs columns that all_data_for_R does not have yet
_is_new_core_column = ~core_data_with_diffs.columns.isin(all_data_for_R.columns)
_new_core_columns = core_data_with_diffs.loc[:, _is_new_core_column]
all_data_for_R = pd.concat([all_data_for_R, _new_core_columns], axis=1)


# Consumption Test (note a manual fix in the code)

In [None]:
#@title parse Consumption Test Data (note one manual correction based on the raw data) { form-width: "5%" }
import ast


def _parse_finish_status(status):
  """Return a coin_task_finish_status value as a dict.

  The value is already a dict when it comes from the server and a string when the
  data was loaded from a file. Strings are first parsed the original way (swap
  quotes, then JSON); if that fails they are read as a Python literal, because the
  quote swap does not turn None/True/False into valid JSON.
  """
  if isinstance(status, dict):
    return status
  try:
    return json.loads(status.replace("'", '"'))
  except ValueError:  # json.JSONDecodeError is a ValueError
    return ast.literal_eval(status)


# Keep the consumption-test rows that have a cave confirmation time, without the bulky touch/orientation columns
consumptionTestData = data.sub_df[(data.sub_df.consumptionTest==True) & (~data.sub_df['foundCaveConfirmationTime'].isnull())]
consumptionTestData = consumptionTestData.drop(['touchData', 'screenOrientationData'], axis=1)

# A specific MANUAL fix for when there was no coin_task_finish_status (but there is data) for one subject
problems = consumptionTestData[consumptionTestData.coin_task_finish_status.isna()]
# Compare plain lists: the truth value of `array == 233` is ambiguous (raises) when there are
# several unique values and is deprecated/an error when there are none.
if problems.subId.unique().tolist() == [233] and problems.day.unique().tolist() == [9]:
  consumptionTestData.loc[consumptionTestData.coin_task_finish_status.isna(),'coin_task_finish_status'] = [{'finish_time': None, 'total_gold_collected': 13, 'total_presses': 13}]

consumptionTestData.loc[consumptionTestData.coins_task_misses_count.isnull(), 'coins_task_misses_count'] = 0  # turn back NaNs in of task misses to 0;
# the next line is a fix needed for when loading the data from a file (and not from the server)
consumptionTestData.coin_task_finish_status = consumptionTestData.coin_task_finish_status.map(_parse_finish_status)
consumptionTestData['gold'] = consumptionTestData.coin_task_finish_status.map(lambda x: x['total_gold_collected'])
# rocks = presses that were neither gold nor misses
consumptionTestData['rocks'] = consumptionTestData.coin_task_finish_status.map(lambda x: x['total_presses'] - x['total_gold_collected']) - consumptionTestData['coins_task_misses_count']
consumptionTestData['misses'] = consumptionTestData['coins_task_misses_count']
consumptionTestData['completionTime'] = consumptionTestData.coin_task_finish_status.map(lambda x: x['finish_time'])

consumptionTestData = consumptionTestData[['subId','group','day','manipulationToday','gold','rocks','misses', 'completionTime']]
consumptionTestData = consumptionTestData.rename(columns={'manipulationToday': 'manipulation'})
consumptionTestData = consumptionTestData.reset_index()
consumptionTestData = consumptionTestData.drop(columns='index')

# Make sure in the other stuff they were enterd
print('n subjects:', len(consumptionTestData.subId.unique()))
print('n subjects with consumption test on devaluation:', len(consumptionTestData[consumptionTestData.manipulation=='devaluation'].subId.unique()))
print('n subjects with consumption test on still_valued:', len(consumptionTestData[consumptionTestData.manipulation=='still_valued'].subId.unique()))
print('n subjects with consumption test on still_valued_post_deval:', len(consumptionTestData[consumptionTestData.manipulation=='still_valued_post_deval'].subId.unique()))

# create a list of subjects that did not commit at all the cave on the devaluation days:
subsNotEntered = []
# Rows for missing (subject, manipulation) pairs are collected and added in one go:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop copies the frame each time.
rowsToAdd = []
for sub in consumptionTestData.subId.unique():
  subGroup = consumptionTestData[consumptionTestData.subId == sub].group.iloc[0]
  for manip in manipulations_renamed[subGroup]:
    if consumptionTestData[(consumptionTestData.subId == sub) & (consumptionTestData.manipulation == manip)].empty:
      print(f'Adding 0 for: {sub} - {subGroup} - {manip}')
      rowsToAdd.append({'subId': sub,	'group': subGroup,	'day': all_manipulation_days[subGroup][manip], 'manipulation':manip, 'gold':0,'rocks':0,'misses':	0,'completionTime': None})
      if manip == 'devaluation':
        subsNotEntered.append(sub)

if rowsToAdd:
  consumptionTestData = pd.concat([consumptionTestData, pd.DataFrame(rowsToAdd, columns=consumptionTestData.columns)], ignore_index=True)

consumptionTestData = consumptionTestData.sort_values(by=['subId', 'day'])
consumptionTestData = consumptionTestData.reset_index(drop=True)

print('\n\nsubjects that did not committed the task on devaluation:', subsNotEntered)
print('pointed at the data as an entry with 0 gold.')


In [42]:
#@title Check eligibility of consumption tests
# Subjects that appear in the consumption-test table at all
_tested_subjects = consumptionTestData.subId.unique()

# Report every subject of the full data set that has no consumption-test row
for subID in data.sub_df.subId.unique():
  if subID in _tested_subjects:
    continue
  print(f'>>> subject {subID!s} has 0 completed consumption tests.')

# Report every subject with exactly one consumption-test row, naming its manipulation
for subID in _tested_subjects:
  _subject_rows = consumptionTestData[consumptionTestData.subId == subID]
  if len(_subject_rows) != 1:
    continue
  _only_manipulation = _subject_rows.manipulation.iloc[0].upper()
  print(f'>>> subject {subID!s} has only 1 completed consumption tests in {_only_manipulation}.')


### Add the consumption test data to the main data (both the variable and the saved file)

In [43]:
# Flatten the (subID, group) index so subjects can be matched on a plain column
all_data_for_R2 = all_data_for_R.reset_index()

# Write each consumption-test row's gold into the 'cave_gold_<manipulation>' column of its subject
_cave_rows = zip(consumptionTestData['subId'], consumptionTestData['manipulation'], consumptionTestData['gold'])
for _cave_sub, _cave_manip, _cave_gold in _cave_rows:
  _is_subject = all_data_for_R2.subID == _cave_sub
  all_data_for_R2.loc[_is_subject, f"cave_gold_{_cave_manip}"] = _cave_gold

# Restore the (subID, group) index and write the full table to file
all_data_for_R = all_data_for_R2.set_index(['subID','group'])

all_data_for_R.to_csv(allDataFor_R_File)


# Save raw data - use only when needed, because it initializes the data variable
* If run on Colab, the files created here are written to the online runtime and should then be downloaded.

In [44]:
#@markdown Save Raw Data { form-width: "5%" }

import pickle

# Save raw data as a list: one raw item per line, written as its str() form
data_temp = Data(minSubID, maxSubID) # instantiates and creates data_temp.raw (per the original note)
with open(rawDataAsListFile, 'w') as f:
    # f-string instead of "%s\n" % item, which misformats or raises when an item is a tuple
    f.writelines(f"{item}\n" for item in data_temp.raw)

data_temp.filter_trials_data()  # creates data_temp.trials_data (per the original note)
data_temp.create_sub_dict()  # creates data_temp.sub_dict (defaulted to create it from trials_data, per the original note)
# save raw data binary; the with-block closes the file even if pickling fails
with open(rawDataAsBinaryFile, "wb") as a_file:
    pickle.dump(data_temp.sub_dict, a_file)
