In [1]:
import numpy as np
import pandas as pd
import sys
import getpass
import re
from functions import *
from collections import Counter
np.set_printoptions(precision=2)
pd.set_option('precision', 2)

In [2]:
df = pd.read_csv('C:\Users\\'+getpass.getuser()+'\\Google Drive\Sarah Ido folder\data\CCK\MATCHING_phet_cck_user_actions+sophistication_WITHPAUSE_anonymized.txt')
df_scores = pd.read_csv('C:\Users\\'+getpass.getuser()+'\\Google Drive\Sarah Ido folder\data\CCK\MATCHING_phet_cck_user_data_anonymized.txt')
df["student"] = df["student"].astype('category')
df_scores["student"] = df_scores["student"].astype('category')
df["Family"]=df["Family"].str.capitalize()

Let's get the sequences by students in blocks of actions.

In [3]:
def get_blocks(df, students):
    '''gets blocks of sequences a list of students'''
    def convert(action):
        if action == 'Reset':
            return 'X'
#         if action == "Organize" or action == "Build":
#             return 'M'
        else: 
            return action[0]
    
    blocks = {student:'S' for student in students}
    for student in students:
        sequence =  list(df[df['student']==student]['Family'])
        blocks[student] += re.sub(r'(.)\1+', r'\1',''.join([convert(action) for action in sequence]))
    return blocks

def get_frequencies(blocks, shortest=3, longest=11):
    frequencies = {student:Counter() for student in blocks.keys()}
    for student,sequence in blocks.iteritems():
        for seq_length in range(shortest, longest+1):  # loops through different possible sequence lengths
            frequencies[student] += Counter(sequence[i:i+seq_length] for i in range(len(sequence)-seq_length-1))  # counts string matches for every string of the current length
    return frequencies

def remove_rare_frequencies(frequencies, N=2):
    for k in list(frequencies):
            if frequencies[k] < N:
                del frequencies[k]
    return None

def remove_omni_frequencies(frequencies, N=96):
    for k in list(frequencies):
            if frequencies[k] == N:
                del frequencies[k]
    return None

def clean_student_frequencies(all_frequencies, frequencies):
    new_frequencies = {}
    for student, freqs in frequencies.iteritems():
        new_frequencies[student] = freqs & all_frequencies
    return new_frequencies

def keep_frequencies_with(frequencies,keep='P'):
    for k in list(frequencies):
        if keep not in k:
            del frequencies[k]
    return None

def select_frequencies(frequencies, attribute, level):
    new_frequencies = {}
    '''gets frequencies of students given an attribute of the student'''
    relevant_students =  set(df_scores[df_scores[attribute]==level]['student'])
    for student, f in frequencies.iteritems():
        if student in relevant_students:
            new_frequencies[student] = f
    return new_frequencies

def mega_process(activity='a1', level='scaffolding', value1='scaff', value2='not', shortest=3, longest=10):
    df2 = df[df.Activity == activity]
    blocks = get_blocks(df2,get_students())
    frequencies = get_frequencies(blocks, shortest=shortest, longest=longest)

    sum_frequencies = sum(frequencies.values(), Counter())
    counts_frequencies = Counter({f:sum([ 1 if f in freq else 0 for freq in frequencies.values()]) for f in list(sum_frequencies)})

    # Remove frequencies done by few and all students
    remove_rare_frequencies(counts_frequencies,3)
    remove_omni_frequencies(counts_frequencies,len(get_students()))

    # update student frequencies by removing those removed in the collection of all frequencies
    frequencies = clean_student_frequencies(counts_frequencies, frequencies)
    
    f1 = select_frequencies(frequencies,level,value1)
    f2 = select_frequencies(frequencies,level,value2)

    sum_frequencies = sum(f1.values(), Counter())
    count1 = Counter({f:sum([ 1 if f in freq else 0 for freq in f1.values()]) for f in list(sum_frequencies)})
    sum_frequencies = sum(f2.values(), Counter())
    count2 = Counter({f:sum([ 1 if f in freq else 0 for freq in f2.values()]) for f in list(sum_frequencies)})

    return sum_frequencies, count1, count2

def difference(seqs1,seqs2,N=10):
    diff = (seqs1-seqs2).most_common(N)
    print "Sequence: count = seq1 - seq2\n-----------------------------"
    for seq,count in diff:
        print "{0}: \t {1} = {2} - {3} ".format(seq, count, seqs1[seq], seqs2[seq])
    
    diff = (seqs2-seqs1).most_common(N)
    print "\nSequence: count = seq2 - seq1\n----------------------------"
    for seq,count in diff:
        print "{0}: \t {1} = {2} - {3} ".format(seq, count, seqs2[seq], seqs1[seq])

## Here is how we get sequences 

In [4]:
# df2 = df[df.Activity == 'a1']
# blocks = get_blocks(df2,get_students())
# frequencies = get_frequencies(blocks, shortest=3, longest=10)
# #Count number of time sequences occurs accross all students
# sum_frequencies = sum(frequencies.values(), Counter())
# #Count number of students that conducted that frequency
# counts_frequencies = Counter({f:sum([ 1 if f in freq else 0 for freq in frequencies.values()]) for f in list(sum_frequencies)})

## Here is how to select certain sequences

In [5]:
# ## Clean up sequences with count less than N
# N = 5
# remove_rare_frequencies(counts_frequencies,N)
# # keep_frequencies_with(counts_frequencies,keep='P')
# frequencies = clean_student_frequencies(counts_frequencies, frequencies)
# # counts_frequencies.most_common(10)

## Let's compare sequences from students in the different conditions in Activity 1
Get all frequencies of length 3 to 10, cleaning them up by removing those common to all students and the rare ones.

In [6]:
summed_seqs, seq_scaff, seq_not = mega_process(activity='a1', level='scaffolding', value1='scaff', value2='not', shortest=3, longest=10)

We can now look at the difference in number of students that conduct sequences in both groups.

In [7]:
difference(seq_scaff,seq_not,N=10)

Sequence: count = seq1 - seq2
-----------------------------
PTPT: 	 20 = 36 - 16 
PTP: 	 19 = 46 - 27 
TPTPT: 	 19 = 30 - 11 
PTPTP: 	 17 = 29 - 12 
TPTP: 	 16 = 40 - 24 
TPTPTP: 	 15 = 24 - 9 
TPX: 	 14 = 20 - 6 
RTPTPT: 	 13 = 19 - 6 
TPTPTPT: 	 13 = 15 - 2 
ITPT: 	 13 = 28 - 15 

Sequence: count = seq2 - seq1
----------------------------
ORPO: 	 21 = 29 - 8 
RPR: 	 18 = 23 - 5 
RBORP: 	 18 = 23 - 5 
BROBOR: 	 17 = 25 - 8 
BORPO: 	 17 = 22 - 5 
OPO: 	 16 = 26 - 10 
TRB: 	 15 = 26 - 11 
ORPOR: 	 15 = 19 - 4 
RBOBOBO: 	 14 = 38 - 24 
OBTO: 	 13 = 25 - 12 


## Does this difference persist in activity 2?

In [8]:
summed_seqs, seq_scaff, seq_not = mega_process(activity='a2', level='scaffolding', value1='scaff', value2='not', shortest=3, longest=10)
difference(seq_scaff,seq_not,N=10)

Sequence: count = seq1 - seq2
-----------------------------
SBOB: 	 13 = 37 - 24 
SBOBO: 	 12 = 36 - 24 
IPT: 	 12 = 17 - 5 
XBO: 	 11 = 16 - 5 
TRTP: 	 11 = 27 - 16 
TRTPT: 	 10 = 17 - 7 
PTPTR: 	 10 = 14 - 4 
TPTPTR: 	 10 = 12 - 2 
TPTPT: 	 10 = 19 - 9 
XBOBO: 	 9 = 14 - 5 

Sequence: count = seq2 - seq1
----------------------------
IBOB: 	 15 = 28 - 13 
RPR: 	 14 = 27 - 13 
IBOBO: 	 13 = 26 - 13 
RPRO: 	 12 = 13 - 1 
ORBOROR: 	 12 = 13 - 1 
BOROBR: 	 11 = 18 - 7 
RBOROR: 	 11 = 16 - 5 
BOBOROBO: 	 11 = 24 - 13 
IBO: 	 11 = 32 - 21 
BOBOROB: 	 10 = 27 - 17 


## Let's compare sequences from students from different clusters
In activity 1

In [9]:
summed_seqs, seq_know, seq_att = mega_process(activity='a1', level='Clustergroups', value1='knowledge', value2='attitude', shortest=3, longest=10)

**Note**: Here we are doubling the number of counts for the knowledge group since it's half the size

In [10]:
difference(seq_know+seq_know,seq_att,N=10)

Sequence: count = seq1 - seq2
-----------------------------
OBOBOBOBRO: 	 15 = 24 - 9 
ROPI: 	 15 = 20 - 5 
BOBT: 	 15 = 40 - 25 
TPTO: 	 14 = 30 - 16 
RBRBO: 	 14 = 34 - 20 
TPBO: 	 14 = 28 - 14 
OPI: 	 14 = 24 - 10 
TOBOBT: 	 14 = 14 - 0 
OTPTO: 	 13 = 14 - 1 
BOBTB: 	 12 = 20 - 8 

Sequence: count = seq2 - seq1
----------------------------
BRP: 	 19 = 41 - 22 
PBOB: 	 17 = 41 - 24 
TPR: 	 16 = 34 - 18 
BOROBO: 	 16 = 56 - 40 
OBORBOB: 	 15 = 47 - 32 
BOBORBOB: 	 15 = 39 - 24 
BOBRP: 	 15 = 23 - 8 
BRT: 	 15 = 47 - 32 
OBOBRP: 	 15 = 19 - 4 
ORP: 	 14 = 60 - 46 


In activity 2

In [11]:
summed_seqs, seq_know, seq_att = mega_process(activity='a2', level='Clustergroups', value1='knowledge', value2='attitude', shortest=3, longest=10)

**Note**: Here we are doubling the number of counts for the knowledge group since it's half the size

In [12]:
difference(seq_know+seq_know,seq_att,N=10)

Sequence: count = seq1 - seq2
-----------------------------
TPTO: 	 19 = 32 - 13 
PTO: 	 15 = 36 - 21 
BOROT: 	 12 = 24 - 12 
TPTOR: 	 11 = 18 - 7 
BPB: 	 11 = 20 - 9 
IPT: 	 11 = 22 - 11 
BPBO: 	 10 = 18 - 8 
PTOR: 	 10 = 24 - 14 
BRORBOB: 	 10 = 14 - 4 
SBPBO: 	 10 = 12 - 2 

Sequence: count = seq2 - seq1
----------------------------
RTPO: 	 22 = 28 - 6 
OROBOR: 	 21 = 49 - 28 
ROROB: 	 20 = 50 - 30 
TROR: 	 19 = 35 - 16 
ROROR: 	 18 = 42 - 24 
BRT: 	 18 = 44 - 26 
ROBOBOBO: 	 18 = 34 - 16 
OROROR: 	 18 = 38 - 20 
OROBOBOBO: 	 17 = 27 - 10 
RTR: 	 17 = 49 - 32 


## Let's compare sequences from students with high and low learning gain

First we need to split students by learning gains in each activity

In [13]:
def label_learning (row):
    if row['learning'] >= median_learning: return 'high'
    else: return 'low'

df_scores['learning'] = df_scores["post t1"] - df_scores["pre"]
median_learning = np.median(df_scores['learning'])
df_scores['learning1'] = df_scores.apply (lambda row: label_learning (row),axis=1)

df_scores['learning'] = df_scores["post t2"] - df_scores["pre"]
median_learning = np.median(df_scores['learning'])
df_scores['learning2'] = df_scores.apply (lambda row: label_learning (row),axis=1)
# len(df_scores[(df_scores['learning1']=='high') & (df_scores['learning2']=='high')])

Now we look at the different in sequences in activity 1

In [14]:
summed_seqs, seq_high, seq_low = mega_process(activity='a1', level='learning1', value1='high', value2='low', shortest=3, longest=10)

In [15]:
difference(seq_high,seq_low,N=10)

Sequence: count = seq1 - seq2
-----------------------------
BOBOBORO: 	 14 = 42 - 28 
OBOBOBORO: 	 13 = 38 - 25 
TRTB: 	 12 = 14 - 2 
OBOBORO: 	 11 = 46 - 35 
OROBOBOBOB: 	 11 = 24 - 13 
RORT: 	 11 = 22 - 11 
OBOBOROR: 	 11 = 32 - 21 
BOBORO: 	 10 = 47 - 37 
POTO: 	 10 = 18 - 8 
ROBOBOBOB: 	 10 = 29 - 19 

Sequence: count = seq2 - seq1
----------------------------
TPR: 	 15 = 29 - 14 
BRBOBOBO: 	 15 = 22 - 7 
TPRO: 	 14 = 16 - 2 
TROBO: 	 14 = 21 - 7 
XBOBOBO: 	 13 = 19 - 6 
XBOBOBOBO: 	 13 = 19 - 6 
XBOBOBOB: 	 13 = 19 - 6 
OBOROROBOB: 	 13 = 19 - 6 
XBOBOB: 	 12 = 19 - 7 
TROB: 	 12 = 24 - 12 


And activity 2

In [16]:
summed_seqs, seq_high, seq_low = mega_process(activity='a2', level='learning1', value1='high', value2='low', shortest=3, longest=10)

In [17]:
difference(seq_high,seq_low,N=10)

Sequence: count = seq1 - seq2
-----------------------------
PRP: 	 9 = 19 - 10 
TPRT: 	 8 = 19 - 11 
TPRTO: 	 8 = 9 - 1 
TPTOR: 	 8 = 12 - 4 
TOBOBOBO: 	 8 = 8 - 0 
OTPT: 	 8 = 12 - 4 
OROBOBOB: 	 8 = 24 - 16 
RTOR: 	 8 = 24 - 16 
OPT: 	 8 = 13 - 5 
ROP: 	 8 = 14 - 6 

Sequence: count = seq2 - seq1
----------------------------
RBRB: 	 18 = 33 - 15 
ORBRB: 	 15 = 22 - 7 
ORBR: 	 15 = 35 - 20 
RBT: 	 15 = 26 - 11 
BRBRB: 	 15 = 17 - 2 
RBRBO: 	 15 = 21 - 6 
BOBORBR: 	 14 = 20 - 6 
ORBOR: 	 13 = 30 - 17 
OBORBRB: 	 13 = 16 - 3 
OBORBR: 	 13 = 22 - 9 



Now what?

