In [1]:
import numpy as np
import pandas as pd
import sys
import getpass
import re
from functions import *
from collections import Counter
np.set_printoptions(precision=2)
pd.set_option('precision', 2)

In [2]:
df = pd.read_csv('C:\Users\\'+getpass.getuser()+'\\Google Drive\Sarah Ido folder\data\CCK\MATCHING_phet_cck_user_actions+sophistication_WITHPAUSE_anonymized.txt')
df_scores = pd.read_csv('C:\Users\\'+getpass.getuser()+'\\Google Drive\Sarah Ido folder\data\CCK\MATCHING_phet_cck_user_data_anonymized.txt')
df["student"] = df["student"].astype('category')
df_scores["student"] = df_scores["student"].astype('category')
df["Family"]=df["Family"].str.capitalize()

Let's get the sequences by students in blocks of actions.

In [3]:
def get_blocks(df, students):
    '''gets blocks of sequences a list of students'''
    def convert(action):
        if action == 'Reset':
            return 'X'
#         if action == "Organize" or action == "Build":
#             return 'M'
        else: 
            return action[0]
    
    blocks = {student:'S' for student in students}
    for student in students:
        sequence =  list(df[df['student']==student]['Family'])
        blocks[student] += re.sub(r'(.)\1+', r'\1',''.join([convert(action) for action in sequence]))
    return blocks

def get_frequencies(blocks, shortest=3, longest=11):
    frequencies = {student:Counter() for student in blocks.keys()}
    for student,sequence in blocks.iteritems():
        for seq_length in range(shortest, longest+1):  # loops through different possible sequence lengths
            frequencies[student] += Counter(sequence[i:i+seq_length] for i in range(len(sequence)-seq_length-1))  # counts string matches for every string of the current length
    return frequencies

def remove_rare_frequencies(frequencies, N=2):
    for k in list(frequencies):
            if frequencies[k] < N:
                del frequencies[k]
    return None

def remove_omni_frequencies(frequencies, N=96):
    for k in list(frequencies):
            if frequencies[k] == N:
                del frequencies[k]
    return None

def clean_student_frequencies(all_frequencies, frequencies):
    new_frequencies = {}
    for student, freqs in frequencies.iteritems():
        new_frequencies[student] = freqs & all_frequencies
    return new_frequencies

def keep_frequencies_with(frequencies,keep='P'):
    for k in list(frequencies):
        if keep not in k:
            del frequencies[k]
    return None

def select_frequencies(frequencies, attribute, level):
    new_frequencies = {}
    '''gets frequencies of students given an attribute of the student'''
    relevant_students =  set(df_scores[df_scores[attribute]==level]['student'])
    for student, f in frequencies.iteritems():
        if student in relevant_students:
            new_frequencies[student] = f
    return new_frequencies

## Here is how we get sequences 

In [4]:
df2 = df[df.Activity == 'a1']
blocks = get_blocks(df2,get_students())
frequencies = get_frequencies(blocks, shortest=3, longest=10)
#Count number of time sequences occurs accross all students
sum_frequencies = sum(frequencies.values(), Counter())
#Count number of students that conducted that frequency
counts_frequencies = Counter({f:sum([ 1 if f in freq else 0 for freq in frequencies.values()]) for f in list(sum_frequencies)})

## Here is how to select certain sequences

In [5]:
## Clean up sequences with count less than N
N = 5
remove_rare_frequencies(counts_frequencies,N)
# keep_frequencies_with(counts_frequencies,keep='P')
frequencies = clean_student_frequencies(counts_frequencies, frequencies)
# counts_frequencies.most_common(10)

## Let's compare sequences from students in the different conditions in Activity 1
Get all frequencies of length 3 to 10, cleaning them up by removing those common to all students and the rare ones.

In [6]:
df2 = df[df.Activity == 'a1']
blocks = get_blocks(df2,get_students())
frequencies = get_frequencies(blocks, shortest=3, longest=10)

sum_frequencies = sum(frequencies.values(), Counter())
counts_frequencies = Counter({f:sum([ 1 if f in freq else 0 for freq in frequencies.values()]) for f in list(sum_frequencies)})

# Remove rare frequencies and those done by all students
remove_rare_frequencies(counts_frequencies,3)
remove_omni_frequencies(counts_frequencies,len(get_students()))

# update student frequencies by removing those removed in the collection of all frequencies
frequencies = clean_student_frequencies(counts_frequencies, frequencies)

Next we split the frequencies by student type and compare them

In [7]:
fs = select_frequencies(frequencies,"scaffolding","scaff")
fn = select_frequencies(frequencies,"scaffolding","not")

sum_frequencies = sum(fs.values(), Counter())
count_s = Counter({f:sum([ 1 if f in freq else 0 for freq in fs.values()]) for f in list(sum_frequencies)})
sum_frequencies = sum(fn.values(), Counter())
count_n = Counter({f:sum([ 1 if f in freq else 0 for freq in fn.values()]) for f in list(sum_frequencies)})

In [8]:
(count_s-count_n).most_common(10)

[('PTPT', 20),
 ('PTP', 19),
 ('TPTPT', 19),
 ('PTPTP', 17),
 ('TPTP', 16),
 ('TPTPTP', 15),
 ('TPX', 14),
 ('RTPTPT', 13),
 ('TPTPTPT', 13),
 ('ITPT', 13)]

In [9]:
(count_n-count_s).most_common(10)

[('ORPO', 21),
 ('RPR', 18),
 ('RBORP', 18),
 ('BROBOR', 17),
 ('BORPO', 17),
 ('OPO', 16),
 ('TRB', 15),
 ('ORPOR', 15),
 ('RBOBOBO', 14),
 ('OBTO', 13)]

## Does this difference persist in activity 2?

In [10]:
df2 = df[df.Activity == 'a2']
blocks = get_blocks(df2,get_students())
frequencies = get_frequencies(blocks, shortest=3, longest=10)

sum_frequencies = sum(frequencies.values(), Counter())
counts_frequencies = Counter({f:sum([ 1 if f in freq else 0 for freq in frequencies.values()]) for f in list(sum_frequencies)})

# Remove rare frequencies and those done by all students
remove_rare_frequencies(counts_frequencies,3)
remove_omni_frequencies(counts_frequencies,len(get_students()))

# update student frequencies by removing those removed in the collection of all frequencies
frequencies = clean_student_frequencies(counts_frequencies, frequencies)

In [11]:
fs = select_frequencies(frequencies,"scaffolding","scaff")
fn = select_frequencies(frequencies,"scaffolding","not")

sum_frequencies = sum(fs.values(), Counter())
count_s = Counter({f:sum([ 1 if f in freq else 0 for freq in fs.values()]) for f in list(sum_frequencies)})
sum_frequencies = sum(fn.values(), Counter())
count_n = Counter({f:sum([ 1 if f in freq else 0 for freq in fn.values()]) for f in list(sum_frequencies)})

In [12]:
(count_s-count_n).most_common(10)

[('SBOB', 13),
 ('SBOBO', 12),
 ('IPT', 12),
 ('XBO', 11),
 ('TRTP', 11),
 ('TRTPT', 10),
 ('PTPTR', 10),
 ('TPTPTR', 10),
 ('TPTPT', 10),
 ('XBOBO', 9)]

In [13]:
(count_n-count_s).most_common(10)

[('IBOB', 15),
 ('RPR', 14),
 ('IBOBO', 13),
 ('RPRO', 12),
 ('ORBOROR', 12),
 ('BOROBR', 11),
 ('RBOROR', 11),
 ('BOBOROBO', 11),
 ('IBO', 11),
 ('BOBOROB', 10)]

## Let's compare sequences from students from different clusters

In [14]:
df2 = df[df.Activity == 'a1']
blocks = get_blocks(df2,get_students())
frequencies = get_frequencies(blocks, shortest=3, longest=10)

sum_frequencies = sum(frequencies.values(), Counter())
counts_frequencies = Counter({f:sum([ 1 if f in freq else 0 for freq in frequencies.values()]) for f in list(sum_frequencies)})

# Remove rare frequencies and those done by all students
remove_rare_frequencies(counts_frequencies,3)
remove_omni_frequencies(counts_frequencies,len(get_students()))

# update student frequencies by removing those removed in the collection of all frequencies
frequencies = clean_student_frequencies(counts_frequencies, frequencies)

In [15]:
fs = select_frequencies(frequencies,"Clustergroups","knowledge")
fn = select_frequencies(frequencies,"Clustergroups","attitude")


sum_frequencies = sum(fs.values(), Counter())
count_s = Counter({f:sum([ 1 if f in freq else 0 for freq in fs.values()]) for f in list(sum_frequencies)})
sum_frequencies = sum(fn.values(), Counter())
count_n = Counter({f:sum([ 1 if f in freq else 0 for freq in fn.values()]) for f in list(sum_frequencies)})

remove_rare_frequencies(count_s,3)
remove_omni_frequencies(count_s,len(fs.keys()))
remove_rare_frequencies(count_n,3)
remove_omni_frequencies(count_n,len(fn.keys()))

In [16]:
(count_s-count_n).most_common(10)

[('ROBO', 28),
 ('OROB', 28),
 ('ROB', 28),
 ('OBR', 27),
 ('TOBOBT', 7),
 ('OTPTO', 7),
 ('ROITIT', 6),
 ('TPTPTPTPT', 6),
 ('RBROBOBOB', 6),
 ('ORBORORORO', 5)]

In [17]:
(count_n-count_s).most_common(10)

[('BORO', 65),
 ('ROR', 65),
 ('OBOBOR', 65),
 ('OBOBOB', 64),
 ('OBOBOBO', 62),
 ('BOBOBOB', 61),
 ('BOBOBOBO', 57),
 ('ORP', 37),
 ('ROBOB', 37),
 ('ORBO', 36)]

## Let's compare sequences from students with high and low learning gain

In [18]:
df2 = df[df.Activity == 'a1']
blocks = get_blocks(df2,get_students())
frequencies = get_frequencies(blocks, shortest=3, longest=10)

sum_frequencies = sum(frequencies.values(), Counter())
counts_frequencies = Counter({f:sum([ 1 if f in freq else 0 for freq in frequencies.values()]) for f in list(sum_frequencies)})

# Remove rare frequencies and those done by all students
remove_rare_frequencies(counts_frequencies,3)
remove_omni_frequencies(counts_frequencies,len(get_students()))

# update student frequencies by removing those removed in the collection of all frequencies
frequencies = clean_student_frequencies(counts_frequencies, frequencies)

First we need to split students by learning gains in each activity

In [19]:
def label_learning (row):
    if row['learning'] >= median_learning: return 'high'
    else: return 'low'

df_scores['learning'] = df_scores["post t1"] - df_scores["pre"]
median_learning = np.median(df_scores['learning'])
df_scores['learning1'] = df_scores.apply (lambda row: label_learning (row),axis=1)

df_scores['learning'] = df_scores["post t2"] - df_scores["pre"]
median_learning = np.median(df_scores['learning'])
df_scores['learning2'] = df_scores.apply (lambda row: label_learning (row),axis=1)
# len(df_scores[(df_scores['learning1']=='high') & (df_scores['learning2']=='high')])

In [20]:
fs = select_frequencies(frequencies,"learning1","high")
fn = select_frequencies(frequencies,"learning1","low")


sum_frequencies = sum(fs.values(), Counter())
count_s = Counter({f:sum([ 1 if f in freq else 0 for freq in fs.values()]) for f in list(sum_frequencies)})
sum_frequencies = sum(fn.values(), Counter())
count_n = Counter({f:sum([ 1 if f in freq else 0 for freq in fn.values()]) for f in list(sum_frequencies)})

remove_rare_frequencies(count_s,3)
remove_omni_frequencies(count_s,len(fs.keys()))
remove_rare_frequencies(count_n,3)
remove_omni_frequencies(count_n,len(fn.keys()))

In [21]:
(count_s-count_n).most_common(10)

[('OBOBOR', 47),
 ('OROB', 46),
 ('ROBO', 46),
 ('ROB', 46),
 ('OBR', 45),
 ('TRTB', 14),
 ('BOBOBORO', 14),
 ('OBOBOBORO', 13),
 ('OBOBORO', 11),
 ('OROBOBOBOB', 11)]

In [22]:
(count_n-count_s).most_common(10)

[('BORO', 47),
 ('ROR', 47),
 ('TPRO', 16),
 ('TPR', 15),
 ('BRBOBOBO', 15),
 ('TROBO', 14),
 ('XBOBOBO', 13),
 ('XBOBOBOBO', 13),
 ('XBOBOBOB', 13),
 ('OBOROROBOB', 13)]

### Steps

Mine sequences
1. Find all sequences of length 3 to N per student per activity
* Calculate count per sequences **done**
* Prune collection of sequences for those significant likely to be non-random

Analyze sequences
1. Intepret top 5 sequences 
* Measure uniformity of sequences per condition
* Measure uniformity of sequences per cluster
* Measure correlation between use of sequence and pre to post gains


