In [1]:
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta, date
%matplotlib inline
# Raise the pandas display limits so that frames of up to 1000 rows and
# 60 columns are rendered without truncation.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 60

### Helper functions used in the rest of the notebook

In [2]:
def fix_time(time_start,current_time):
    """Express a timestamp as the number of seconds elapsed since the first action.

    Args:
        time_start (datetime.time): The time at which the first action was done;
            it corresponds to 0 seconds.
        current_time (datetime.time): The time of the action being converted.

    Returns:
        float: Seconds between time_start and current_time. If current_time is
        earlier than time_start, one hour is added to current_time first.
    """
    # datetime.time values cannot be subtracted from each other, so both are
    # anchored on the same dummy date to obtain a timedelta.
    start_dt = datetime.combine(date.min, time_start)
    current_dt = datetime.combine(date.min, current_time)
    if current_dt < start_dt:
        # ex: time_start is 59:00 min and current_time is 02:00 min, so we add
        # an hour to find the duration in between
        current_dt = current_dt + timedelta(hours=1)
    elapsed = current_dt - start_dt
    return elapsed.total_seconds()

def get_duration(row, last_action_duration=10):
    """Get the duration of an action as the time elapsed until the next action.

    Args:
        row (Pandas row or mapping): The row of the action for which we want the
            duration. It must provide 'Time' (datetime.time of this action) and
            'Timeshifted' (datetime.time of the next action, or a null value
            for the last action).
        last_action_duration (number, optional): Placeholder returned for the
            last action, which has no following timestamp to measure against.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        The difference in seconds between 'Timeshifted' and 'Time' (a float),
        or last_action_duration when 'Timeshifted' is null.
    """
    next_time = row['Timeshifted']
    if pd.isnull(next_time):
        # Last action: there is no next timestamp, so return the placeholder.
        return last_action_duration

    # datetime.time values cannot be subtracted directly; anchor both on the
    # same dummy date to obtain a timedelta.
    start_dt = datetime.combine(date.min, row['Time'])
    end_dt = datetime.combine(date.min, next_time)
    if end_dt < start_dt:
        # ex: Time is 59:00 min and Timeshifted is 02:00 min, so the next
        # action appears earlier than the current one; add an hour to find the
        # duration in between.
        end_dt += timedelta(hours=1)
    return (end_dt - start_dt).total_seconds()

def get_action_usage(df,column,action):
    '''Given an action or method, we detect its use using a particular column
    and then extract a list of time coordinates for when
    it was used. These coordinates are in the format (start_time, duration)

    Args:
        df (Pandas dataframe): The dataframe to search in. It must contain the
            'Time_seconds' and 'Duration' columns in addition to `column`.
        column (str): The column where the method or action might be logged.
        action (str): The name of the action or method to search for in the column.
            It is passed to `Series.str.contains`, so it matches as a substring
            and is interpreted as a regular expression (pandas default).

    Returns:
        A list of tuples with start times of the action and its duration
        [(start1,duration1),(start2,duration2),...]. The list is empty when the
        action never occurs.
    '''
    # Build the row mask once; rows where the column is missing count as "no match".
    mask = df[column].str.contains(action,na=False)
    matches = df[mask]
    # list(...) keeps the documented list return type on Python 3, where zip is
    # a one-shot iterator that callers could not index.
    return list(zip(matches['Time_seconds'],matches['Duration']))

# The data

In [3]:
# Load the sheet named 'iLab data.txt' from the Excel workbook; cells that
# read 'NA' are parsed as missing values.
df_all = pd.read_excel('all data v3.xlsx', 'iLab data.txt', index_col=None, na_values=['NA'])

### Preparing a test sample
Let's first use a particular session as a test case. We extract only the data relevant to that case.

In [4]:
#Using the example used for sketch.
#Disabled alternative selection (by condition and student pair), kept for reference:
#(df_all.condition1 == 'Savanna') & (df_all.student1 == 'penguin') & (df_all.student2 == 'gorilla')]
#Keep only the rows logged under this one session id.
df_test = df_all[df_all['Session Id'] == 'L-2567b17a:120eda25685:-8000']

Next we keep only the actions with a "CORRECT" outcome, filtering out "INCORRECT" (and any other) outcomes.

In [5]:
# Row count before filtering, so we can report how many rows were dropped.
before = df_test.shape[0]
# .copy() gives an independent frame: later cells add columns to df_test, and
# assigning into a filtered slice of another frame triggers pandas'
# SettingWithCopyWarning.
df_test = df_test[df_test['Outcome'] == 'CORRECT'].copy()
# A parenthesised single-argument print behaves the same on Python 2 and 3.
print("We are left with {0} rows out of {1}".format(df_test.shape[0],before))

We are left with 212 rows out of 285


### We need to extract the timestamp of each action and its duration

Next we fix the time logs and convert them to seconds. We also recalculate the time between actions now that we have gotten rid of incorrect actions.

In [6]:
# Reference point for the shifted clock: the 'time first action' value of the first row.
time_start = df_test['time first action'].iloc[0]
# Operate on the 'Time' Series directly instead of a one-column DataFrame with
# applymap (DataFrame.applymap is deprecated in recent pandas releases).
df_test['Time_seconds'] = df_test['Time'].apply(lambda current_time: fix_time(time_start,current_time))
# Time of the following action; the last row has no successor and gets a null value.
df_test['Timeshifted'] = df_test['Time'].shift(-1)
# Duration of each action = time until the next action (see get_duration).
df_test['Duration'] = df_test[['Time','Timeshifted']].apply(get_duration, axis=1)

In [7]:
# Sanity check: the first action must map to zero seconds on the shifted clock.
assert fix_time(time_start, time_start) == 0

### A first plot

In [None]:
# Bar colour for every action/method drawn in the timeline plot.
color = {"Now try working on this new example":"#252525",
         "intuition": "#bdbdbd",
         "submit": "#969696",
         "evaluation": "#737373",
         "checkIntuition": "#525252",
         'Median': "#33a02c",
         'Average': "#6a3d9a",
         'Sum': "#1f78b4",
         'all': "#ff7f00",
        }
# Dataframe column in which each action/method is searched for.
# (The "Now try working on this new example" key used to be listed twice with
# the same value; the redundant entry has been removed.)
column_to_use = {"Now try working on this new example":"Feedback Text",
                 "intuition": "Selection",
                 "submit": "Selection",
                 "evaluation": "Selection",
                 "checkIntuition": "Selection",
                 'Median': 'Method_Recognized_1_Copied',
                 'Average': 'Method_Recognized_1_Copied',
                 'Sum': 'Method_Recognized_1_Copied',
                 'all': 'Method_Recognized_1_Copied',
                }
# Actions/methods that each get their own row in the plot.
to_plot = ["intuition","submit","evaluation","checkIntuition",'Median','Average','Sum','all']

In [None]:
fig, ax = plt.subplots(figsize=(18,8))
spacing = 10   # height of one action's row on the y axis
max_time = 0   # right-most point (start + duration) drawn so far
actions = list(reversed(to_plot))
black = '#252525'
for i,action in enumerate(actions):
    # list(...) so the result can be indexed even if get_action_usage hands
    # back an iterator (zip on Python 3).
    action_use = list(get_action_usage(df_test,column_to_use[action],action))
    if not action_use:
        # This action never occurs in the session: leave its row empty instead
        # of failing on action_use[-1].
        continue
    max_time = max(max_time,sum(action_use[-1]))
    ax.broken_barh(action_use,(i*spacing,spacing),facecolors=color[action],alpha=1,linewidth=0)

#Add horizontal bar
# With spacing = 10 the first four rows (all, Sum, Average, Median) occupy
# y = 0..40, so this thin white line sits between them and the remaining rows.
ax.broken_barh([(0,ax.get_xlim()[1])],(40,0.5),facecolors='white',alpha=1,linewidth=0)

#Add new case bar
new_case = "Now try working on this new example"
action_use = list(get_action_usage(df_test,column_to_use[new_case],new_case))
if action_use:
    max_time = max(max_time,sum(action_use[-1]))
    # Spans every row so it reads as a vertical marker across the whole plot.
    ax.broken_barh(action_use,(0,len(actions)*spacing),facecolors=black,alpha=1,linewidth=0)

ax.set_xlabel('minutes in activity')
# One tick every five minutes; positions are in seconds, labels in whole minutes.
tick_seconds = list(range(0,int(max_time),60*5))
ax.set_xticks(tick_seconds)
# // keeps integer minutes (and integer tick positions below) on Python 2 and 3.
ax.set_xticklabels([str(x//60) for x in tick_seconds])
# Put each label at the vertical centre of its row.
ax.set_yticks(range(spacing//2,len(actions)*spacing,spacing))
ax.set_yticklabels(actions)
ax.grid(True)
plt.show()

The black vertical bar in the graph shows when a "new case" is presented

### TODO
* figure out why some bars overlap, for example between submit and evaluation
* add non ready made functions

In [None]:
# Display the prepared frame for inspection; the display limits were raised in
# the first cell (max_rows = 1000, max_columns = 60).
df_test