In [1]:
import os
import pandas as pd

# Data Documentation

The data pre-processed by this notebook were collected in the first round of the Measurement grant (Survey 2020 version). The data were manually labeled as P (Principle), L (Limitations), S (Statistics), or O (Other) by the researchers with a variety of methods. Details of the meaning of each of these codes can be found in the coding scheme in the Measurement grant files. This notebook standardizes the format of the data labeling for P, L, S, and O. 

**Number of examples:**

Sources: **2899**

More: **389**
- 522 pre-filtering for NAs in the explanation column

Better 
**390**
- 522 pre-filtering for NAs in the explanation column

Generic classical
**145**

Generic quantum
**144**
R_3Mg3lCeyGGJa7zz provided a response to the classical question but not the quantum question

### Load data files and convert to a standard multi-label format

In [2]:
cwd = os.getcwd()
data_folder = cwd + '/Raw_labeled_data/'

In [3]:

data_sources = pd.read_excel(data_folder + 'sources_amended.xlsx')
data_more_better = pd.read_excel(data_folder + 'more_better.xlsx')
data_generic = pd.read_excel(data_folder + 'generic.xlsx')

In [10]:
data_sources["P"] = (data_sources["updated_code"] == "P").astype(int)
data_sources["L"] = (data_sources["updated_code"] == "L").astype(int)
data_sources["S"] = (data_sources["updated_code"] == "S").astype(int)
data_sources["O"] = (data_sources["updated_code"] == "O").astype(int)

data_sources["Input"] = data_sources["response"]

data_more = data_more_better[["ResponseId","Experiment","P_more","L_more","S_more","More_response","More_explanation"]].copy()
data_better = data_more_better[["ResponseId","Experiment","P_better","L_better","S_better","Better_response","Better_explanation"]].copy()

data_more.loc[:,"P"] = data_more["P_more"]
data_more.loc[:,"L"] = data_more["L_more"]
data_more.loc[:,"S"] = data_more["S_more"]
data_more.loc[:,"O"] = ((data_more["P"] == 0) & (data_more["L"] == 0) & (data_more["S"] == 0)).astype(int)

data_better.loc[:,"P"] = data_better["P_better"]
data_better.loc[:,"L"] = data_better["L_better"]
data_better.loc[:,"S"] = data_better["S_better"]
data_better.loc[:,"O"] = ((data_better["P"] == 0) & (data_more["L"] == 0) & (data_more["S"] == 0)).astype(int)

data_more["Input"] = data_more["More_explanation"]
data_better["Input"] = data_better["Better_explanation"]

data_classical = data_generic[["DataSource","ResponseId","Response_Generic_Classical","CQ","code","n"]].copy()
data_quantum = data_generic[["DataSource","ResponseId","Response_Generic_Quantum","CQ","code","n"]].copy()

In [11]:
data_classical = data_classical[data_classical["CQ"] == "C"]
data_quantum = data_quantum[data_quantum["CQ"] == "Q"]

data_classical["P"] = (data_classical["code"] == "P").astype(int)
data_classical["L"] = (data_classical["code"] == "L").astype(int)
data_classical["S"] = (data_classical["code"] == "S").astype(int)
data_classical["O"] = (data_classical["code"] == "O").astype(int)

data_quantum["P"] = (data_quantum["code"] == "P").astype(int)
data_quantum["L"] = (data_quantum["code"] == "L").astype(int)
data_quantum["S"] = (data_quantum["code"] == "S").astype(int)
data_quantum["O"] = (data_quantum["code"] == "O").astype(int)


#for responseid in data_classical[data_classical["n"] > 1]["ResponseId"].unique().tolist(): # get all the ResponseIds with multiple segments
    #print(data_classical[data_classical["ResponseId"] == responseid])

data_classical = data_classical.groupby("ResponseId", as_index=False).agg({
    'DataSource': 'first',
    'Response_Generic_Classical': 'first',
    'CQ': 'first',
    'n': 'max',
    'P': 'max',
    'L': 'max',
    'S': 'max',
    'O': 'max'
})

data_quantum = data_quantum.groupby("ResponseId", as_index=False).agg({
    'DataSource': 'first',
    'Response_Generic_Quantum': 'first',
    'CQ': 'first',
    'n': 'max',
    'P': 'max',
    'L': 'max',
    'S': 'max',
    'O': 'max'
})

data_classical["Input"] = data_classical["Response_Generic_Classical"]
data_quantum["Input"] = data_quantum["Response_Generic_Quantum"]

### Print all data sizes prior to filtering

In [12]:
# data length prior to filtering
print(len(data_sources))
print(len(data_more))
print(len(data_better))
print(len(data_classical))
print(len(data_quantum))

2899
522
522
145
144


### Why are classical and quantum different by one?

In [13]:
merged = pd.merge(data_classical, data_quantum, on='ResponseId', how='outer', indicator=True)
merged[merged["_merge"] == "left_only"]

Unnamed: 0,ResponseId,DataSource_x,Response_Generic_Classical,CQ_x,n_x,P_x,L_x,S_x,O_x,Input_x,DataSource_y,Response_Generic_Quantum,CQ_y,n_y,P_y,L_y,S_y,O_y,Input_y,_merge
84,R_3Mg3lCeyGGJa7zz,Cornell pilot,"What comes to mind immediately is ""human error...",C,3.0,0,1,0,0,"What comes to mind immediately is ""human error...",,,,,,,,,,left_only


In [14]:
data_generic[data_generic["ResponseId"] == "R_3Mg3lCeyGGJa7zz"]
#indeed, this response has an answer to generic classical but not generic quantum

Unnamed: 0.1,Unnamed: 0,DataSource,ResponseId,Response_Generic_Classical,Response_Generic_Quantum,CQ,Segmented_Response,code,n
1,2,Cornell pilot,R_3Mg3lCeyGGJa7zz,"What comes to mind immediately is ""human error...",,C,"What comes to mind immediately is ""human error...",L,1.0
146,2,Cornell pilot,R_3Mg3lCeyGGJa7zz,"What comes to mind immediately is ""human error...",,C,Then uncertainty in the measurement tool you a...,L,2.0
210,2,Cornell pilot,R_3Mg3lCeyGGJa7zz,"What comes to mind immediately is ""human error...",,C,uncertainty caused inherently by your experime...,L,3.0


### Filter out NAs

In [15]:
data_sources[data_sources["Response"].isna()] #none

KeyError: 'Response'

In [16]:
print(len(data_more[data_more["More_explanation"].isna()])) #42, all coded as "Other"
print(len(data_more[data_more["More_explanation"] == 0])) #14, all coded as "Other"
print(len(data_more[data_more["More_explanation"] == "0"])) #77, all coded as "Other"
data_more = data_more[~data_more["More_explanation"].isna()]
data_more = data_more[data_more["More_explanation"] != 0]
data_more = data_more[data_more["More_explanation"] != "0"]
len(data_more)

42
14
77


389

In [17]:
print(len(data_better[data_better["Better_explanation"].isna()])) #43, all coded as "Other"
print(len(data_better[data_better["Better_explanation"] == 0])) #14, all coded as "Other"
print(len(data_better[data_better["Better_explanation"] == "0"])) #75, all coded as "Other"
data_better = data_better[~data_better["Better_explanation"].isna()]
data_better = data_better[data_better["Better_explanation"] != 0]
data_better = data_better[data_better["Better_explanation"] != "0"]
len(data_better)

43
14
75


390

In [18]:
data_better[(data_better["O"] == 1) & (data_better["L"] == 0) & (data_better["P"] == 0) & (data_better["S"] == 0)]

Unnamed: 0,ResponseId,Experiment,P_better,L_better,S_better,Better_response,Better_explanation,P,L,S,O,Input
107,R_2TWPdKHd5Uh7azX,PM,0,0,0,A single value is measured.,should be a precise measurement,0,0,0,1,should be a precise measurement
185,R_pQ5XP6HY2UVVKa5,SG,0,0,0,Distribution becomes wider.,I think?_x000D_\n,0,0,0,1,I think?_x000D_\n
257,R_2TWPdKHd5Uh7azX,SG,0,0,0,,i dont know,0,0,0,1,i dont know
313,R_z6vUTP29zTV3hyp,PM,0,0,0,Distribution becomes narrower.,The results would zero in on the same average ...,0,0,0,1,The results would zero in on the same average ...
332,R_297l2QTTJnNac5F,PM,0,0,0,Distribution becomes narrower.,This experiment could easily be replicated by ...,0,0,0,1,This experiment could easily be replicated by ...
349,R_2B5nJghGj4OZo9R,PM,0,0,0,Distribution becomes narrower.,"More trails will reduce random errors, but the...",0,0,0,1,"More trails will reduce random errors, but the..."
394,R_1pS6MYFBKHH82dD,PM,0,0,0,Distribution becomes narrower.,Bias exist,0,0,0,1,Bias exist
458,R_2diLThKqwbWHO9Q,PM,0,0,0,A single value is measured.,we do not know the result,0,0,0,1,we do not know the result
467,R_z6vUTP29zTV3hyp,SG,0,0,0,Distribution stays roughly the same.,There is no way to make the measurements close...,0,0,0,1,There is no way to make the measurements close...
486,R_QhJ6floXQMEMzXH,SG,0,0,0,Distribution stays roughly the same.,Same reason,0,0,0,1,Same reason


### Fix text issues

In [19]:
def fix_encoding_issues(text):
    return text.replace("â€™", "'").replace("\n"," ")

data_sources["Input"] = data_sources["Input"].apply(fix_encoding_issues)
data_more["Input"] = data_more["Input"].apply(fix_encoding_issues)
data_better["Input"] = data_better["Input"].apply(fix_encoding_issues)
data_classical["Input"] = data_classical["Input"].apply(fix_encoding_issues)
data_quantum["Input"] = data_quantum["Input"].apply(fix_encoding_issues)

In [20]:
#ensure all data are in string format
data_sources["Input"] = data_sources["Input"].astype(str)
data_more["Input"] = data_more["Input"].astype(str)
data_better["Input"] = data_better["Input"].astype(str)
data_classical["Input"] = data_classical["Input"].astype(str)
data_quantum["Input"] = data_quantum["Input"].astype(str)

### Find number of examples for each category in each dataframe 

In [21]:
num_examples = pd.DataFrame({"code": ["P","L","S","O"],
              "sources": [data_sources["P"].value_counts()[1], data_sources["L"].value_counts()[1], data_sources["S"].value_counts()[1], data_sources["O"].value_counts()[1]],
              "more": [data_more["P"].value_counts()[1], data_more["L"].value_counts()[1], data_more["S"].value_counts()[1], data_more["O"].value_counts()[1]],
              "better": [data_better["P"].value_counts()[1], data_better["L"].value_counts()[1], data_better["S"].value_counts()[1], data_better["O"].value_counts()[1]],
              "classical": [data_classical["P"].value_counts()[1], data_classical["L"].value_counts()[1], data_classical["S"].value_counts()[1], data_classical["O"].value_counts()[1]],
              "quantum": [data_quantum["P"].value_counts()[1], data_quantum["L"].value_counts()[1], data_quantum["S"].value_counts()[1], data_quantum["O"].value_counts()[1]]
             })

In [22]:
num_examples['sum'] = num_examples[['sources', 'more', 'better','classical','quantum']].sum(axis=1)

In [23]:
num_examples

Unnamed: 0,code,sources,more,better,classical,quantum,sum
0,P,160,23,44,4,108,339
1,L,2212,91,279,103,28,2713
2,S,117,255,36,4,2,414
3,O,410,43,52,58,53,616


Save data in prepared folder

In [24]:
data_folder = cwd + '/Prepared_data/'
data_sources.to_excel(data_folder + "data_sources.xlsx")
data_more.to_excel(data_folder + "data_more.xlsx")
data_better.to_excel(data_folder + "data_better.xlsx")
data_classical.to_excel(data_folder + "data_classical.xlsx")
data_quantum.to_excel(data_folder + "data_quantum.xlsx")