## Full Confidence Database Analysis

**Purpose:** Analyze meta-memory and meta-perception across all studies in the database to explore similarities and differences between the metacognition of memory and the metacognition of perception.
- This script extracts meta-d' (`metad`) and d' (`dprime`) from *all* memory and perception datasets in the database.

Author: Saurish Srivastava @ [Subjectivity Lab](https://subjectivity.sites.northeastern.edu/)

In [None]:
# install packages
!pip3 install numpy
!pip3 install pandas
!pip3 install sklearn
!pip3 install git+https://github.com/embodied-computation-group/metadPy.git

In [None]:
# imports
import numpy as np
import pandas as pd
from metadPy.mle import fit_metad, metad
from metadPy.utils import trials2counts, discreteRatings
from sklearn.preprocessing import LabelEncoder

In [None]:
# load the database index, keeping only the columns used by this analysis
infoColumns = ['Category', 'Name_in_database', 'Confidence_scale']
databaseInfo = pd.read_csv('../../Confidence Database/Database_Information.csv', usecols=infoColumns)
# restrict the index to the studies categorised as perception or memory
isTargetCategory = databaseInfo['Category'].isin(['Perception', 'Memory'])
databaseInfo = databaseInfo.loc[isTargetCategory]

In [None]:
requiredCols = ['Subj_idx', 'Stimulus', 'Response', 'Confidence']
# datasets that include subjects with clinical disorders are left out of the analysis
excludedDatasets = {'Chandravadia_2020', 'Locke_2020', 'Wang_2017_NatComm'}
finalDatasets = []

# a dataset qualifies when its CSV can be opened and contains every column in 'requiredCols'
for dataName in databaseInfo['Name_in_database']:
    try:
        # Only the header is needed for this check (nrows=0): read_csv raises
        # ValueError when a name listed in 'usecols' is missing from the file,
        # and OSError (e.g. FileNotFoundError) when the file cannot be opened.
        pd.read_csv('../../Confidence Database/data_' + dataName + '.csv',
                    usecols=requiredCols, nrows=0)
    except (OSError, ValueError):
        continue
    if dataName not in excludedDatasets:
        finalDatasets.append(dataName)

# only keep database entries whose dataset passed the checks above
databaseInfo = databaseInfo[databaseInfo['Name_in_database'].isin(finalDatasets)].reset_index(drop=True)

# add mixed datasets that include memory/perception
mixed_data = {
    'Category': ['Mixed', 'Mixed', 'Mixed', 'Mixed', 'Mixed'],
    'Name_in_database': ['Mazancieux_2018', 'Arbuzova_unpub_3', 'Samaha_2016', 'Samaha_2017_exp3', 'Ye_2018'],
    'Confidence_scale': ['11-point', '4-point', '4-point', '4-point', '4-point'],
}

df_mixed_data = pd.DataFrame(mixed_data)
# DataFrame.append was removed in pandas 2.0; pd.concat performs the same row-wise append
databaseInfo = pd.concat([databaseInfo, df_mixed_data], ignore_index=True)
databaseInfo

In [None]:
def _fitSubjects(data, nRatings):
    """
    Yield the fit_metad result of every subject in 'data' whose fit succeeds.

    Subjects are visited in order of first appearance in the 'Subject' column.
    """
    for subject in data['Subject'].unique():
        subject_data = data.loc[data['Subject'] == subject]
        nR_S1, nR_S2 = trials2counts(data=subject_data.copy(), stimuli="Stimuli", responses="Responses",
                                     confidence="Confidence", nRatings=nRatings, padding=True)
        # A subject whose fit raises is skipped. 'Exception' (rather than a bare
        # 'except') lets KeyboardInterrupt/SystemExit stop a long run.
        try:
            fit = fit_metad(nR_S1, nR_S2, nRatings=nRatings, nCriteria=int(2 * nRatings - 1))
        except Exception:
            continue
        yield fit


def createDataset(data, dataName, nRatings):
    """
    Create a dataset with the subjects from dataName and their respective dprime and metad values

    Parameters
    ----------
    data : pandas.DataFrame
        Trials with the columns 'Subject', 'Stimuli', 'Responses' and 'Confidence',
        plus an optional 'Domain' column that splits the trials into domains.
    dataName : str
        Name of the dataset. Stored in the 'dataset' column and, when 'data' has no
        'Domain' column, used to look up the dataset's 'Category' in the
        module-level 'databaseInfo' table.
    nRatings : int
        Number of confidence levels, passed to trials2counts and fit_metad.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fitted subject (per domain, when 'Domain' exists)
        with the columns 'dprime', 'metad', 'domain' and 'dataset'.
    """
    domain_list = []
    metadprime_list = []
    dprime_list = []

    if 'Domain' not in data.columns:
        # single domain: every subject gets the dataset's category from the database index
        category = None
        for fit in _fitSubjects(data, nRatings):
            if category is None:
                # looked up only once a fit succeeded, so a dataset without any
                # fitted subject never touches 'databaseInfo'
                category = databaseInfo.loc[databaseInfo['Name_in_database'] == dataName, 'Category'].values[0]
            metadprime_list.append(fit['meta_d'])
            dprime_list.append(fit['dprime'])
            domain_list.append(category)
    else:
        # multiple domains: fit every subject separately within each domain
        for domain in data['Domain'].unique():
            domain_data = data.loc[data['Domain'] == domain].reset_index(drop=True)
            # string labels are capitalized; other label types are kept as they are
            label = domain.capitalize() if isinstance(domain, str) else domain
            for fit in _fitSubjects(domain_data, nRatings):
                metadprime_list.append(fit['meta_d'])
                dprime_list.append(fit['dprime'])
                domain_list.append(label)

    # create dataframe
    return pd.DataFrame({'dprime': dprime_list, 'metad': metadprime_list,
                         'domain': domain_list, 'dataset': [dataName] * len(metadprime_list)})

In [None]:
import re

metricsList = ['Subj_idx', 'Stimulus', 'Response', 'Confidence', 'Task', 'Group', 'group', 'Type']
# optional columns that split a dataset into several domains
domainCols = ['Task', 'Group', 'group', 'Type']

datasetNames = list(databaseInfo['Name_in_database'])
products = []  # one dataframe of fitted subjects per processed dataset
for i, dataName in enumerate(datasetNames):
    # read in data; a dataset whose file is missing or cannot be parsed is skipped
    try:
        data = pd.read_csv('../../Confidence Database/data_' + dataName + '.csv', usecols=lambda x: x in metricsList)
    except (OSError, ValueError) as err:
        print("Skip: " + dataName + " (" + str(err) + ")")
        continue

    print("Start: " + dataName)
    data = data.rename(columns={"Subj_idx": "Subject", "Stimulus": "Stimuli", "Response": "Responses"})

    # drop all NaNs *before* encoding: LabelEncoder would otherwise treat a
    # missing stimulus/response as an extra class instead of a missing trial
    data = data.dropna().reset_index(drop=True)

    # convert {string} stimuli to int (0 or 1) [keeps values the same if already in 0 or 1 format]
    data['Stimuli'] = LabelEncoder().fit_transform(data['Stimuli'])
    data['Responses'] = LabelEncoder().fit_transform(data['Responses'])

    # create accuracy column
    data['Accuracy'] = np.where((data['Stimuli'] == data['Responses']), 1, 0)

    # get data in order by subject
    data = data.sort_values(by=['Subject']).reset_index(drop=True)

    # get confidence ratings
    confidence_val = databaseInfo.loc[databaseInfo['Name_in_database'] == dataName, 'Confidence_scale'].values[0]
    if 'continuous' in confidence_val:
        # continuous scales are binned into 4 discrete ratings
        data['Confidence'] = discreteRatings(ratings=data['Confidence'].values)[0]
        confidenceRatings = 4
    else:
        # parse the whole leading number, so that '11-point' gives 11 rather than 1
        scaleMatch = re.match(r'\s*(\d+)', confidence_val)
        if scaleMatch is None:
            raise ValueError("Cannot read the number of ratings from Confidence_scale '"
                             + confidence_val + "' (" + dataName + ")")
        confidenceRatings = int(scaleMatch.group(1))

    # check if confidence scale starts from 0 or 1 (if starts from 0, add 1 to each confidence value)
    if data['Confidence'].min() == 0:
        data['Confidence'] = data['Confidence'] + 1

    # centralize domains (i.e. 'task' or 'group' -> 'domain' for further processing);
    # the column is selected by name so the result does not depend on the CSV's column order
    presentDomainCols = [col for col in data.columns if col in domainCols]
    if presentDomainCols:
        data = data.rename(columns={presentDomainCols[0]: "Domain"})

    # create datasets
    products.append(createDataset(data=data, dataName=dataName, nRatings=confidenceRatings))

    print('Done: ' + dataName + ' (' + str(i + 1) + '/' + str(len(datasetNames)) + ')')

# concat data once at the end (pd.concat raises on an empty list, hence the fallback)
masterData = pd.concat(products) if products else pd.DataFrame()

In [None]:
# write the combined per-subject results to a csv file, then display them
exportPath = '../../exports/masterData.csv'
masterData.to_csv(exportPath)
masterData