In [None]:
import csv
import matplotlib.pyplot as plt

import xarray as xr

import pandas as pd
import json

import numpy as np
import seaborn as sns
import datetime as datetime
from netCDF4 import num2date, date2num

# Reading the classification CSV file from Zooniverse

In [None]:
# Input files, given relative to the notebook location:
# the classification export and the subject export read by the cells below.
classfile_in = '../zooniverse_raw/sugar-flower-fish-or-gravel-classifications_18_12_16.csv'
subject_in = '../zooniverse_raw/sugar-flower-fish-or-gravel-subjects_18_11_05.csv'

In [None]:
### Functions from https://github.com/zooniverse/Data-digging/blob/master/example_scripts/astronomy_rewind/workflow1to2.py

def JSONParser(data):
    """Decode a JSON document given as text and return the resulting Python object."""
    decoded = json.loads(data)
    return decoded


def load_classifications(filename, json_columns=None):
    """
    Read a csv export into a pandas dataframe.

    The columns named in `json_columns` contain embedded json and are decoded
    with JSONParser while the file is read. When `json_columns` is None or
    empty, 'metadata', 'annotations' and 'subject_data' are used.
    """
    if not json_columns:
        json_columns = ['metadata', 'annotations', 'subject_data']

    # Map every json column onto the decoder that pandas applies per cell
    converters = {}
    for column in json_columns:
        converters[column] = JSONParser

    return pd.read_csv(filename, converters=converters)

def unpack(series):
    """
    Collect the leading element of every entry in `series` into a list.

    Every annotations value is a list because a few tasks are multiple tasks;
    according to the original note the second multiple task always has the
    value 'None of the above' (for this dataset), so only the first is kept.
    """
    firsts = []
    for entry in series:
        firsts.append(entry[0])
    return firsts


def parse_classifications(filename,**kwarg):
    """
    Read the classification export and reduce every 'annotations' entry
    to its first item.

    Keyword arguments are forwarded unchanged to load_classifications.
    """
    classifications = load_classifications(filename,**kwarg)

    # Each annotations cell is a list of json objects; keep only the first one
    first_annotations = unpack(classifications['annotations'])
    classifications['annotations'] = first_annotations
    return classifications

In [None]:
def get_time_spent(obj, max_seconds=3600*10):
    """
    Return the number of seconds between 'started_at' and 'finished_at'
    of a single classification.

    Parameters
    ----------
    obj : row of the classification table, e.g. an element produced by
        `DataFrame.itertuples()`. The metadata dict is taken from the
        `metadata` attribute when the row has one, otherwise from
        position 11 of the row.
    max_seconds : int or float, optional
        Durations longer than this are treated as invalid.
        Defaults to ten hours.

    Returns
    -------
    float
        Elapsed seconds, or 0.0 when the duration exceeds `max_seconds`.

    Raises
    ------
    ValueError
        If a timestamp does not match '%Y-%m-%dT%H:%M:%S.%fZ'.
    """
    # Imported locally so that the class is used no matter what the
    # module-level name `datetime` is bound to.
    from datetime import datetime
    time_fmt='%Y-%m-%dT%H:%M:%S.%fZ'

    # Prefer the named field; fall back to the positional lookup for plain tuples
    metadata = getattr(obj, 'metadata', None)
    if metadata is None:
        metadata = obj[11]

    start=datetime.strptime(metadata['started_at'],time_fmt)
    stop=datetime.strptime(metadata['finished_at'],time_fmt)
    elapsed = (stop-start).total_seconds()
    if elapsed > max_seconds:
        # Implausibly long classification: do not count its time at all
        return 0.0
    return elapsed

In [None]:
classification_data = parse_classifications(
    classfile_in,
    json_columns=['metadata', 'annotations', 'subject_data'],
)
classification_data.tail()

In [None]:
# Convert the "YYYY-mm-dd HH:MM:SS UTC" strings to datetimes in one vectorised call
# instead of a Python-level strptime loop over the index.
classification_data["created_at"] = pd.to_datetime(classification_data["created_at"], format="%Y-%m-%d %H:%M:%S UTC")

In [None]:
# Keep everything from the first classification created after 2018-10-02 onwards
# (presumably the export is ordered by creation time — TODO confirm).
is_after_start = (classification_data.created_at > datetime.datetime(2018,10,2)).to_numpy()
# argmax returns the position of the first True; when no row qualifies the
# selection is empty instead of raising on an empty np.min
ind = int(is_after_start.argmax()) if is_after_start.any() else len(classification_data)
# .copy() so that columns added later are written to an independent table
classification_data = classification_data.iloc[ind:].copy()

In [None]:
# Show the annotations entry of the first remaining classification
classification_data.annotations[classification_data.index[0]]

In [None]:
# Show the metadata entry of the first remaining classification
classification_data.metadata[classification_data.index[0]]

In [None]:
# Time spent on each classification, one value per row of the table
classification_data['time_spent'] = list(map(get_time_spent, classification_data.itertuples()))

# User statistics

In [None]:
def update_user_dict(user_dic,user_id, dic):
    """
    Accumulate the values of `dic` in the entry of `user_id` within `user_dic`.

    Parameters
    ----------
    user_dic : dict mapping a user id to a dict of accumulated values;
        modified in place.
    user_id : key of the entry to create or update.
    dic : dict of values to add. Keys already present in the entry are
        combined with `+`, new keys are inserted.

    Returns
    -------
    dict
        The same `user_dic` object that was passed in.
    """
    user_entry = user_dic.get(user_id)
    if user_entry is None:
        # First record of this user: store a copy so that later updates
        # never modify the dict object owned by the caller
        user_dic[user_id] = dict(dic)
        return user_dic

    # The user already exists: add to known keys, insert unknown ones
    for key, value in dic.items():
        old_value = user_entry.get(key)
        user_entry[key] = value if old_value is None else old_value + value
    return user_dic

In [None]:
user_stat = {}
for u,user_classifications in classification_data.groupby('user_name'):
    # Flatten the tool index of every label of this user into one integer array
    # (built in a single pass instead of growing an array with np.append)
    labels = np.asarray(
        [value['tool']
         for classification in user_classifications.annotations
         for value in classification['value']],
        dtype=int)
    # Number of labels per tool index 0..3, unpacked as sugar, flower, fish, gravel
    [nb_sugar, nb_flower, nb_fish, nb_gravel] = np.bincount(labels,minlength=4)
    time_spent = user_classifications.time_spent.sum()
    user_stat = update_user_dict(user_stat, u, {'images_seen': len(user_classifications), 'fish': nb_fish,'gravel': nb_gravel, 'flower': nb_flower, 'sugar':nb_sugar, 'time_spent': time_spent})

In [None]:
DF_user = pd.DataFrame.from_dict(user_stat,orient='index')
# Total labels per user; the pattern columns are selected by name so the sum
# does not depend on the column order of the frame
DF_user['labels_done'] = DF_user[['fish','gravel','flower','sugar']].sum(axis=1)
DF_user = DF_user.sort_values('labels_done')
# Drop users with fewer than 200 labels; copy so later cells can add columns safely
DF_user = DF_user[~(DF_user.labels_done < 200)].copy()

In [None]:
sns.set_context('talk')
sns.set_style('ticks')
# Stacked bars of the four pattern counts per user. DataFrame.plot opens its own
# figure here, so no separate plt.figure() is created (it only added an empty figure).
p1=DF_user[['fish','gravel','flower','sugar']].plot(kind='bar', figsize=(15,6), stacked=True)
fig=p1.get_figure()
p1.set_ylabel('labels')
sns.despine()
plt.tight_layout()
fig.set_dpi(300)
# NOTE(review): absolute, user-specific output path — adjust before running elsewhere
plt.savefig('/Users/haukeschulz/Desktop/CloudClassificationDayStats_preliminary.png',transparent=True);

In [None]:
sns.set_context('talk')
sns.set_style('ticks')
# One title per pattern, in the same order as the pattern list iterated below
titles=['Do you like fish?', 'Flower power', 'Sweetness-factor', 'Just Gravel']
for p,pattern in enumerate(['fish','flower','sugar','gravel']):
    # Open a new figure for the bar chart of this pattern
    plt.figure()
    pattern_percentage = pattern+'_percentage'
    # Share of this pattern among all labels of a user, in percent (stored as a new column)
    DF_user[pattern_percentage] = DF_user[pattern]/DF_user.labels_done*100
    # Users ordered by increasing share of the pattern
    p1=DF_user.sort_values(pattern_percentage)[pattern_percentage].plot(kind='bar', figsize=(10,6), color='darkblue')
    t=p1.set_ylabel('{} labels relative to your total labels [%]'.format(pattern))
    #p1.set_title('Do you like {}?'.format(pattern))
    p1.set_title(titles[p])
    plt.gcf().set_dpi(300)
    sns.despine()

# Time spent

In [None]:
sns.set_context('talk')
sns.set_style('ticks')
# Speed per user: 60 divided by the average time_spent per image seen
DF_user['images_per_minute'] = 60/(DF_user.time_spent/DF_user.images_seen)
# Bar chart with the users ordered by increasing speed
p1=DF_user.sort_values('images_per_minute').images_per_minute.plot(kind='bar', figsize=(10,5), stacked=True, color='darkblue')
#DF_user.iloc[:,[0]].plot(kind='bar',ax=p1.axes, alpha=0.4)
t=p1.set_ylabel('images per minute')
p1.set_title('Speed')
plt.gcf().set_dpi(300)
sns.despine()

In [None]:
sns.set_context('talk')
sns.set_style('ticks')
# Total time per user, divided by 3600 for the axis in hours, ordered by increasing time
p1=(DF_user.sort_values('time_spent').time_spent/3600).plot(kind='bar', figsize=(10,5), stacked=True, color='darkblue')
#DF_user.iloc[:,[0]].plot(kind='bar',ax=p1.axes, alpha=0.4)
t=p1.set_ylabel('time spent (hours)')
p1.set_title('Time spent')
plt.gcf().set_dpi(300)
sns.despine()

# Subject set statistics

In [None]:
# Subject export, indexed by subject_id
subject_data = load_classifications(subject_in).set_index('subject_id')
subject_data.head()

In [None]:
# Names of the subject sets of interest, keyed by subject_set_id
subjects_name = {60811:'BCO_DJF_Aqua',60812:'BCO_DJF_Terra',60813: 'BCO_MAM_Aqua',
                 60814: 'BCO_MAM_Terra',60815:'R2_DJF_Aqua',60816:'R2_DJF_Terra',
                 60817:'R3_DJF_Aqua', 60818:'R3_DJF_Terra',60819:'R3_SON_Aqua',
                 60835: 'R3_SON_Terra'}
for s, subject_set in subject_data.groupby('subject_set_id'):
    if s not in subjects_name:
        print('Subset {} not of interest'.format(s))
        continue
    counts = subject_set.classifications_count
    # Series.nonzero() is no longer available in current pandas;
    # count the images with at least one classification directly
    nb_classified = int(np.count_nonzero(counts))
    print(subjects_name[s])
    # images in the set, total classifications, classified images, classified share in %
    print(len(subject_set),counts.sum(),
          nb_classified,
          np.round(nb_classified/len(subject_set)*100,1))
    print('maximum number of classifications per image: {}'.format(counts.max()))

# Distribution of patterns within each region

In [None]:
def update_dict(region_dic,region_id, dic):
    """
    Accumulate the values of `dic` in the entry of `region_id` within `region_dic`.

    Parameters
    ----------
    region_dic : dict mapping an id to a dict of accumulated values;
        modified in place.
    region_id : key of the entry to create or update.
    dic : dict of values to add. Keys already present in the entry are
        combined with `+`, new keys are inserted.

    Returns
    -------
    dict
        The same `region_dic` object that was passed in.
    """
    region_entry = region_dic.get(region_id)
    if region_entry is None:
        # First record for this id: store a copy so that later updates
        # never modify the dict object owned by the caller
        region_dic[region_id] = dict(dic)
        return region_dic

    # The entry already exists: add to known keys, insert unknown ones
    for key, value in dic.items():
        old_value = region_entry.get(key)
        region_entry[key] = value if old_value is None else old_value + value
    return region_dic

In [None]:
region_stat = {}
for _, row in classification_data.iterrows():
    # Only classifications of workflow 8073 are counted here. The plain comparison
    # replaces np.float(8073): the alias no longer exists in current NumPy, and the
    # bare except around it used to hide that error and skip every row.
    if row.workflow_id != 8073:
        continue
    # Positions of this subject in the subject table; skip subjects that are not listed
    matches = np.where(row.subject_ids == subject_data.index.values)[0]
    if len(matches) == 0:
        continue
    subset_id = subject_data.subject_set_id.values[matches[0]]
    # Subject sets without a name are not of interest
    region_name = subjects_name.get(subset_id)
    if region_name is None:
        continue
    # Number of labels per tool index 0..3, unpacked as sugar, flower, fish, gravel
    labels = np.asarray([value['tool'] for value in row.annotations['value']], dtype=int)
    [nb_sugar, nb_flower, nb_fish, nb_gravel] = np.bincount(labels,minlength=4)
    region_stat = update_dict(region_stat, region_name, {'fish': nb_fish,'gravel': nb_gravel, 'flower': nb_flower, 'sugar':nb_sugar})

In [None]:
# Display the label counts per named subject set
region_stat

In [None]:
# Total number of labels per pattern, summed over all subject sets. The rows are put
# into the explicit order fish, flower, gravel, sugar so the values line up with
# `total_column_names` in the bar plot regardless of the key order of the dicts.
total = pd.DataFrame.from_dict(region_stat).reindex(['fish', 'flower','gravel','sugar']).sum(axis=1).to_numpy()

In [None]:
# One row per subject set, sorted by name so the Aqua and Terra rows that
# belong together are adjacent
x=pd.DataFrame.from_dict(region_stat,orient='index').sort_index()
# Group id derived from the name without its last '_'-separated part
# (e.g. 'BCO_DJF_Aqua' and 'BCO_DJF_Terra' share one id), instead of a
# hard-coded list that requires exactly ten rows in a fixed order
x['region'] = pd.factorize(x.index.str.rsplit('_', n=1).str[0])[0]

In [None]:
# Display the per-subject-set counts together with their 'region' group id
x

In [None]:
region_result = {}
for region, region_grp in x.groupby('region'):
    # Name the combined entry after the first row of the group without its last
    # '_'-separated part; this works for '_Aqua' as well as '_Terra' names
    region_name = region_grp.index.values[0].rsplit('_', 1)[0]
    # Column-wise sum over the rows of the group (this includes the 'region' column)
    region_result[region_name] = region_grp.sum()

In [None]:
# Second name for the grouped result (same object, no copy is made)
region_stat2 = region_result

In [None]:
# Sum over all entries of `total`
sum(total)

In [None]:
# Bar labels that are passed to plt.bar together with `total`
total_column_names = ['fish', 'flower','gravel','sugar']

In [None]:
# Bar chart of the entries of `total`, labelled with `total_column_names`
plt.bar(total_column_names,total)
sns.despine(offset=10)
plt.gcf().set_dpi(300)

In [None]:
# One column per key of region_stat2
DF=pd.DataFrame.from_dict(region_stat2,orient='columns')
# Remove the row labelled 'region' before plotting
DF.drop('region',inplace=True)

def make_autopct(values):
    """
    Build a label callback for pie wedges.

    The returned function takes the percentage of a wedge and formats it
    together with the matching absolute count, computed from the sum of `values`.
    """
    def my_autopct(pct):
        # Convert the percentage back into a rounded absolute count
        count = int(round(pct*sum(values)/100.0))
        return '{p:.2f}%  ({v:d})'.format(v=count, p=pct)
    return my_autopct

# One pie chart per column of DF, arranged on a 3x2 grid; wedges are annotated with
# their percentage using the literal format string passed as autopct
DF.plot(kind='pie',figsize=(15,22),subplots=True,layout=(3, 2),labels=None,legend=None,autopct='%1.1f%%',yticks=None)
ax=plt.gca()
sns.despine(offset=20)

In [None]:
# or just another layout of the same data

In [None]:
# One row per key of region_stat; note that this rebinds the name DF
DF=pd.DataFrame.from_dict(region_stat,orient='index')

# Stacked bars of the label counts per row
p=DF.plot(kind='bar',figsize=(20,10),stacked=True)
p.set_ylabel('labels')
sns.despine(offset=20)

The above figure includes both the practice workflow and the full-dataset workflow.

It looks like there is a difference between the Aqua and Terra overpasses: flowers are always labelled less often during Aqua overpasses than during Terra overpasses!

In [None]:
# Row-wise sum over the first four columns, stored as 'total'
DF['total'] = DF.iloc[:,[0,1,2,3]].sum(axis=1)
DF

### Calculating the relative differences between the Aqua and Terra data

In [None]:
# For every region/season: share of each pattern among all labels (in percent)
# for the Aqua row minus the same share for the Terra row. Rows and columns are
# addressed by label, so the result does not depend on the row or column order of DF.
for region in ['BCO_DJF', 'BCO_MAM', 'R2_DJF', 'R3_DJF', 'R3_SON']:
    aqua = DF.loc[region+'_Aqua']
    terra = DF.loc[region+'_Terra']
    pattern_columns = ['fish','gravel','flower','sugar']
    DF.loc[region+'_diff'] = aqua[pattern_columns]/aqua['total']*100-terra[pattern_columns]/terra['total']*100

In [None]:
# Grouped bars of the Aqua-minus-Terra differences, one group per '_diff' row
p=DF.loc[['BCO_DJF_diff','BCO_MAM_diff','R2_DJF_diff','R3_DJF_diff','R3_SON_diff'],['fish','gravel','flower','sugar']].plot(kind='bar',figsize=(12,7),stacked=False)
p.set_ylabel('Aqua-Terra (%)')
sns.despine(offset=20)
plt.gcf().set_dpi(300)

So the difference between the overpasses is quite obvious. The *flower* classifications are always fewer during Aqua overpasses, except for the region BCO in DJF, where there is no significant change at all.
(TODO: check whether that changes when the practice dataset is excluded.)

# Practice dataset



In [None]:
image_stat = {}
for _, row in classification_data.iterrows():
    # Only classifications of the practice workflow (8072) are counted. The plain
    # comparison replaces np.float(...): the alias no longer exists in current NumPy,
    # and the bare except further down used to hide such errors and skip the row.
    if row.workflow_id != 8072:
        continue
    subject_id = row.subject_ids
    # Positions of this subject in the subject table. The second occurrence is used
    # (original note: 0: BCO_DJF_Aqua, 1: practice 50 images); skip if there is none.
    matches = np.where(subject_id == subject_data.index.values)[0]
    if len(matches) < 2:
        continue
    subset_id = subject_data.subject_set_id.values[matches[1]]
    # Check again for safety that the occurrence belongs to subject set 60902
    if subset_id != 60902:
        continue
    # Number of labels per tool index 0..3, unpacked as sugar, flower, fish, gravel
    labels = np.asarray([value['tool'] for value in row.annotations['value']], dtype=int)
    [nb_sugar, nb_flower, nb_fish, nb_gravel] = np.bincount(labels,minlength=4)
    # Accumulate the counts per image, keyed by the subject id
    image_stat = update_dict(image_stat, subject_id, {'fish': nb_fish,'gravel': nb_gravel, 'flower': nb_flower, 'sugar':nb_sugar})

In [None]:
# One column per key of image_stat
practical_image_DF = pd.DataFrame.from_dict(image_stat)
practical_image_DF

In [None]:
# One pie chart per column of practical_image_DF, arranged on a 12x5 grid
p=practical_image_DF.plot(kind='pie',layout=(12,5), legend=None, subplots=True,figsize=(30,50));