In [209]:
from datascience import *
import numpy as np
import re

In [210]:
results = Table.read_table('centaur-data/1345_customer_results.csv')
results.show(3)

Case ID,Origin,Origin Created At,Content ID,URL,Labeling State,Series,Series Index,Patch,Qualified Reads,Correct Label,Majority Label,Difficulty,Agreement,First Choice Answer,First Choice Votes,First Choice Weight,Second Choice Answer,Second Choice Votes,Second Choice Weight,Internal Notes,Comments,Explanation
5888087,https://centaur-customer-uploads.s3.us-east-1.amazonaws. ...,Jul 26 2021 21:41:47 PM,3264386,https://go.centaurlabs.com/problem/5888087,Gold Standard,,,,2,'no','no',0,1,'no',2,1.54,'yes',0,0,,[],
5888088,https://centaur-customer-uploads.s3.us-east-1.amazonaws. ...,Jul 26 2021 21:41:47 PM,3264387,https://go.centaurlabs.com/problem/5888088,Gold Standard,,,,3,'no','no',0,1,'no',3,2.34,'yes',0,0,,[],
5888089,https://centaur-customer-uploads.s3.us-east-1.amazonaws. ...,Jul 26 2021 21:41:47 PM,3264388,https://go.centaurlabs.com/problem/5888089,Gold Standard,,,,2,'no','no',0,1,'no',2,1.7,'yes',0,0,,[],


<h1>Data Cleaning</h1>

We only want to use rows in the results table where the URL in the column Origin contains a vote number N in the form 'voteN'. So, we'll start by finding those:

In [211]:
sample_url = results.column(1).item(1)
sample_url

'https://centaur-customer-uploads.s3.us-east-1.amazonaws.com/mgh-eeg/210721/jpg/B10_time178433_medoid_vote0.jpg'

In [212]:
sample_split = sample_url.split('.')
sample_split

['https://centaur-customer-uploads',
 's3',
 'us-east-1',
 'amazonaws',
 'com/mgh-eeg/210721/jpg/B10_time178433_medoid_vote0',
 'jpg']

In [213]:
sample_split[-2].split('_')[-1]

'vote0'

In [214]:
re.search('vote[0-8]', sample_url)

<re.Match object; span=(101, 106), match='vote0'>

In [215]:
re.search('vote[0-8]', sample_url).group()

'vote0'

In [216]:
re.search('vote[0-8]', sample_url).string

'https://centaur-customer-uploads.s3.us-east-1.amazonaws.com/mgh-eeg/210721/jpg/B10_time178433_medoid_vote0.jpg'

In [217]:
origin_array = results.column('Origin')
voteN_found_array = make_array()

for url in origin_array:
    '''
    Find the URLs in the Origin column of the results table that contain "voteN", where N is a digit from 0-8.
    Create a true/false array that maps to where "voteN" was found in the column.
    '''
    if re.search('vote[0-8]', url) == None:
        voteN_found_array = np.append(voteN_found_array, 0)
    else: 
        voteN_found_array = np.append(voteN_found_array, 1)
        
voteN_found_array

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

Numbers below show how many "voteN"-containing URLs were found: 27,000 out of 30,293

In [218]:
np.count_nonzero(voteN_found_array)

27000

In [219]:
len(voteN_found_array)

30293

Pruning the results table to a version containing only the "voteN" rows:

In [220]:
results_voteN = results.with_column('voteN exists', voteN_found_array)
results_voteN.num_rows

30293

In [221]:
results_voteN = results_voteN.where('voteN exists', 1)
results_voteN.num_rows

27000

In [222]:
results_voteN.show(5)

Case ID,Origin,Origin Created At,Content ID,URL,Labeling State,Series,Series Index,Patch,Qualified Reads,Correct Label,Majority Label,Difficulty,Agreement,First Choice Answer,First Choice Votes,First Choice Weight,Second Choice Answer,Second Choice Votes,Second Choice Weight,Internal Notes,Comments,Explanation,voteN exists
5888087,https://centaur-customer-uploads.s3.us-east-1.amazonaws. ...,Jul 26 2021 21:41:47 PM,3264386,https://go.centaurlabs.com/problem/5888087,Gold Standard,,,,2,'no','no',0.0,1.0,'no',2,1.54,'yes',0,0.0,,[],,1
5888088,https://centaur-customer-uploads.s3.us-east-1.amazonaws. ...,Jul 26 2021 21:41:47 PM,3264387,https://go.centaurlabs.com/problem/5888088,Gold Standard,,,,3,'no','no',0.0,1.0,'no',3,2.34,'yes',0,0.0,,[],,1
5888089,https://centaur-customer-uploads.s3.us-east-1.amazonaws. ...,Jul 26 2021 21:41:47 PM,3264388,https://go.centaurlabs.com/problem/5888089,Gold Standard,,,,2,'no','no',0.0,1.0,'no',2,1.7,'yes',0,0.0,,[],,1
5888090,https://centaur-customer-uploads.s3.us-east-1.amazonaws. ...,Jul 26 2021 21:41:47 PM,3264389,https://go.centaurlabs.com/problem/5888090,Gold Standard,,,,1,'no','no',0.0,1.0,'no',1,0.82,'yes',0,0.0,,[],,1
5888091,https://centaur-customer-uploads.s3.us-east-1.amazonaws. ...,Jul 26 2021 21:41:47 PM,3264390,https://go.centaurlabs.com/problem/5888091,In Progress,,,,7,,'yes',,0.571,'yes',4,3.28,'no',3,2.32,,[],,1


Now we'll trim the table down much more.

The columns to be kept will be only the Case ID and the quantitative columns that relate to labelling votes and accuracy. 

In [223]:
results_trimmed = results_voteN.drop(2,3,4,5,6,7,8, "Internal Notes", "Comments", "Explanation", "voteN exists")
results_trimmed.show(3)

Case ID,Origin,Qualified Reads,Correct Label,Majority Label,Difficulty,Agreement,First Choice Answer,First Choice Votes,First Choice Weight,Second Choice Answer,Second Choice Votes,Second Choice Weight
5888087,https://centaur-customer-uploads.s3.us-east-1.amazonaws. ...,2,'no','no',0,1,'no',2,1.54,'yes',0,0
5888088,https://centaur-customer-uploads.s3.us-east-1.amazonaws. ...,3,'no','no',0,1,'no',3,2.34,'yes',0,0
5888089,https://centaur-customer-uploads.s3.us-east-1.amazonaws. ...,2,'no','no',0,1,'no',2,1.7,'yes',0,0


And, we'll extract the the number of expert yes votes from the Origin URLs, and replace the column with just those numbers.

In [224]:
expert_yes_votes = make_array()
for url in results_trimmed.column('Origin'):
    N = re.search('vote[0-8]', url).group()[-1]
    expert_yes_votes = np.append(expert_yes_votes, N)

expert_yes_votes

array(['2', '0', '0', ..., '4', '5', '5'],
      dtype='<U32')

In [225]:
results_trimmed = results_trimmed.with_column('Origin', expert_yes_votes).relabeled('Origin', 'Expert Yes Votes')
results_trimmed

Case ID,Expert Yes Votes,Qualified Reads,Correct Label,Majority Label,Difficulty,Agreement,First Choice Answer,First Choice Votes,First Choice Weight,Second Choice Answer,Second Choice Votes,Second Choice Weight
5888087,2,2,'no','no',0.0,1.0,'no',2,1.54,'yes',0,0.0
5888088,0,3,'no','no',0.0,1.0,'no',3,2.34,'yes',0,0.0
5888089,0,2,'no','no',0.0,1.0,'no',2,1.7,'yes',0,0.0
5888090,0,1,'no','no',0.0,1.0,'no',1,0.82,'yes',0,0.0
5888091,4,7,,'yes',,0.571,'yes',4,3.28,'no',3,2.32
5888092,0,4,'no','no',0.0,1.0,'no',4,3.3,'yes',0,0.0
5888093,0,6,'no','no',0.0,1.0,'no',6,4.94,'yes',0,0.0
5888094,0,2,'no','no',0.0,1.0,'no',2,1.56,'yes',0,0.0
5888095,0,4,'no','no',0.0,1.0,'no',4,3.34,'yes',0,0.0
5888096,0,3,'no','no',0.0,1.0,'no',3,2.56,'yes',0,0.0


It is a bit annoying for searching through this table that all the strings for yes and no actually contain single quotes around them. Let's clean that up:

In [227]:
def strip_quotes(str):
    return str.strip("'")

In [228]:
str1 = "'blargh'"

In [229]:
strip_quotes(str1)

'blargh'

In [230]:
str1

"'blargh'"

In [231]:
results_trimmed.apply(strip_quotes, "Correct Label")

array(['no', 'no', 'no', ..., 'nan', 'yes', 'yes'],
      dtype='<U3')

In [232]:
def fix_strings(table, list_of_columns):
    new_table = table
    for column_name in list_of_columns:
        fixed_strings = table.apply(strip_quotes, column_name)
        new_table = new_table.with_column(column_name, fixed_strings)
    return new_table

In [233]:
results_trimmed = fix_strings(results_trimmed, ['Correct Label', 'Majority Label', 'First Choice Answer', 'Second Choice Answer'])
results_trimmed

Case ID,Expert Yes Votes,Qualified Reads,Correct Label,Majority Label,Difficulty,Agreement,First Choice Answer,First Choice Votes,First Choice Weight,Second Choice Answer,Second Choice Votes,Second Choice Weight
5888087,2,2,no,no,0.0,1.0,no,2,1.54,yes,0,0.0
5888088,0,3,no,no,0.0,1.0,no,3,2.34,yes,0,0.0
5888089,0,2,no,no,0.0,1.0,no,2,1.7,yes,0,0.0
5888090,0,1,no,no,0.0,1.0,no,1,0.82,yes,0,0.0
5888091,4,7,,yes,,0.571,yes,4,3.28,no,3,2.32
5888092,0,4,no,no,0.0,1.0,no,4,3.3,yes,0,0.0
5888093,0,6,no,no,0.0,1.0,no,6,4.94,yes,0,0.0
5888094,0,2,no,no,0.0,1.0,no,2,1.56,yes,0,0.0
5888095,0,4,no,no,0.0,1.0,no,4,3.34,yes,0,0.0
5888096,0,3,no,no,0.0,1.0,no,3,2.56,yes,0,0.0


Now it will be easier to explore the data.

But let's also just do a quick check that the rest of the columns have data types that make sense, and fix anything else that might need fixing:

In [234]:
def check_data_types(table):
    data_types = list()
    for i in np.arange(table.num_columns):
        data_types.append( type(table.column(i).item(0) ) )
    return data_types

In [235]:
check_data_types(results_trimmed)

[int, str, int, str, str, float, float, str, int, float, str, int, float]

Most of this is fine, but the 'Expert Yes Votes' are actually strings. They should be converted to ints:

In [241]:
expert_yes_ints = make_array()
for i in expert_yes_votes:
    expert_yes_ints = np.append(expert_yes_ints, int(i))

expert_yes_ints

array([ 2.,  0.,  0., ...,  4.,  5.,  5.])

In [242]:
results_trimmed.num_rows

27000

In [243]:
len(expert_yes_ints)

27000

In [244]:
results_trimmed = results_trimmed.with_column('Expert Yes Votes', expert_yes_ints)
results_trimmed

Case ID,Expert Yes Votes,Qualified Reads,Correct Label,Majority Label,Difficulty,Agreement,First Choice Answer,First Choice Votes,First Choice Weight,Second Choice Answer,Second Choice Votes,Second Choice Weight
5888087,2,2,no,no,0.0,1.0,no,2,1.54,yes,0,0.0
5888088,0,3,no,no,0.0,1.0,no,3,2.34,yes,0,0.0
5888089,0,2,no,no,0.0,1.0,no,2,1.7,yes,0,0.0
5888090,0,1,no,no,0.0,1.0,no,1,0.82,yes,0,0.0
5888091,4,7,,yes,,0.571,yes,4,3.28,no,3,2.32
5888092,0,4,no,no,0.0,1.0,no,4,3.3,yes,0,0.0
5888093,0,6,no,no,0.0,1.0,no,6,4.94,yes,0,0.0
5888094,0,2,no,no,0.0,1.0,no,2,1.56,yes,0,0.0
5888095,0,4,no,no,0.0,1.0,no,4,3.34,yes,0,0.0
5888096,0,3,no,no,0.0,1.0,no,3,2.56,yes,0,0.0


Now the strings are easy to handle, numbers are appropriate number types, and we can explore the data.

Below, we can see that we have a remarkably neat set of data to work with. It contains exactly 12,000 cases where the expert vote favored 'yes', another 12,000 where the expert vote favored 'no', and 3,000 where the expert vote was split. (And we can also see confirmation that the number of expert yes votes fits as expected with the given "correct label".)

In [253]:
results_trimmed.where('Correct Label', 'yes').num_rows

12000

In [246]:
results_trimmed.where('Expert Yes Votes', are.above(4)).num_rows

12000

In [182]:
results_trimmed.where('Correct Label', 'no').num_rows

12000

In [247]:
results_trimmed.where('Expert Yes Votes', are.below(4)).num_rows

12000

In [183]:
results_trimmed.where('Correct Label', 'nan').num_rows

3000

In [186]:
results_trimmed.where('Expert Yes Votes', '4').num_rows

3000