In [112]:
import numpy as np
import pandas as pd
import sys
import getpass
import re
from functions import *
from collections import Counter
import copy
np.set_printoptions(precision=2)
pd.set_option('precision', 2)
%matplotlib inline
matplotlib.style.use('ggplot')

In [113]:
df = pd.read_csv('C:\Users\\'+getpass.getuser()+'\\Google Drive\Sarah Ido folder\data\CCK\MATCHING_phet_cck_user_actions+sophistication_WITHPAUSE_anonymized.txt')
df_scores = pd.read_csv('C:\Users\\'+getpass.getuser()+'\\Google Drive\Sarah Ido folder\data\CCK\MATCHING_phet_cck_user_data_anonymized.txt')
df["student"] = df["student"].astype('category')
df_scores["student"] = df_scores["student"].astype('category')
df["Family"]=df["Family"].str.capitalize()

Let's get the sequences by students in blocks of actions.

In [144]:
def get_blocks(df, students):
    '''gets blocks of sequences a list of students'''
    def convert(action):
        if action == 'Reset':
            return 'X'
#         if action == "Organize" or action == "Build":
#             return 'M'
        else: 
            return action[0]
    
    blocks = {student:'S' for student in students}
    for student in students:
        sequence =  list(df[df['student']==student]['Family'])
        blocks[student] += re.sub(r'(.)\1+', r'\1',''.join([convert(action) for action in sequence]))
    return blocks

def get_frequencies(blocks, shortest=3, longest=11):
    frequencies = {student:Counter() for student in blocks.keys()}
    for student,sequence in blocks.iteritems():
        for seq_length in range(shortest, longest+1):  # loops through different possible sequence lengths
            frequencies[student] += Counter(sequence[i:i+seq_length] for i in range(len(sequence)-seq_length-1))  # counts string matches for every string of the current length
    return frequencies

def remove_rare_frequencies(frequencies, N=2):
    new_frequencies = copy.copy(frequencies)
    for k in list(new_frequencies):
            if new_frequencies[k] < N:
                del new_frequencies[k]
    return new_frequencies

def remove_omni_frequencies(frequencies, N=96):
    new_frequencies = copy.copy(frequencies)
    for k in list(new_frequencies):
            if new_frequencies[k] == N:
                del new_frequencies[k]
    return new_frequencies

def clean_student_frequencies(all_frequencies, frequencies):
    new_frequencies = {}
    for student, freqs in frequencies.iteritems():
        new_frequencies[student] = freqs & all_frequencies
    return new_frequencies

def keep_frequencies_with(frequencies,keep='P'):
    new_frequencies = copy.copy(frequencies)
    for k in list(new_frequencies):
        if keep not in k:
            del new_frequencies[k]
    return new_frequencies

def select_frequencies(frequencies, attribute, level):
    new_frequencies = {}
    '''gets frequencies of students given an attribute of the student'''
    relevant_students =  set(df_scores[df_scores[attribute]==level]['student'])
    for student, f in frequencies.iteritems():
        if student in relevant_students:
            new_frequencies[student] = f
    return new_frequencies

def mega_process(activity='a1', level='scaffolding', value1='scaff', value2='not', shortest=3, longest=10, keep = None):
    df2 = df[df.Activity == activity]
    blocks = get_blocks(df2,get_students())
    frequencies = get_frequencies(blocks, shortest=shortest, longest=longest)

    sum_frequencies = sum(frequencies.values(), Counter())
    counts_frequencies = Counter({f:sum([ 1 if f in freq else 0 for freq in frequencies.values()]) for f in list(sum_frequencies)})

#     # Remove frequencies done by few and all students
#     counts_frequencies = remove_rare_frequencies(counts_frequencies,3)
#     counts_frequencies = remove_omni_frequencies(counts_frequencies,len(get_students()))

#     # update student frequencies by removing those removed in the collection of all frequencies
#     frequencies = clean_student_frequencies(counts_frequencies, frequencies)
    
    if keep:
        counts_frequencies = keep_frequencies_with(counts_frequencies,keep=keep)
        frequencies = clean_student_frequencies(counts_frequencies, frequencies)
    
    f1 = select_frequencies(frequencies,level,value1)
    f2 = select_frequencies(frequencies,level,value2)

    sum_frequencies = sum(f1.values(), Counter())
    count1 = Counter({f:sum([ 1 if f in freq else 0 for freq in f1.values()]) for f in list(sum_frequencies)})
    sum_frequencies = sum(f2.values(), Counter())
    count2 = Counter({f:sum([ 1 if f in freq else 0 for freq in f2.values()]) for f in list(sum_frequencies)})

    return sum_frequencies, count1, count2

def difference(seqs1,seqs2,N=10):
    diff = (seqs1-seqs2).most_common(N)
    print "Sequence: count = seq1 - seq2\n-----------------------------"
    for seq,count in diff:
        print "{0}: \t {1} = {2} - {3} ".format(seq, count, seqs1[seq], seqs2[seq])
    
    diff = (seqs2-seqs1).most_common(N)
    print "\nSequence: count = seq2 - seq1\n----------------------------"
    for seq,count in diff:
        print "{0}: \t {1} = {2} - {3} ".format(seq, count, seqs2[seq], seqs1[seq])

## Here is how we get sequences 

In [115]:
# df2 = df[df.Activity == 'a1']
# blocks = get_blocks(df2,get_students())
# frequencies = get_frequencies(blocks, shortest=3, longest=10)
# #Count number of time sequences occurs accross all students
# sum_frequencies = sum(frequencies.values(), Counter())
# #Count number of students that conducted that frequency
# counts_frequencies = Counter({f:sum([ 1 if f in freq else 0 for freq in frequencies.values()]) for f in list(sum_frequencies)})

## Here is how to select certain sequences

In [116]:
# ## Clean up sequences with count less than N
# N = 5
# remove_rare_frequencies(counts_frequencies,N)
# # keep_frequencies_with(counts_frequencies,keep='P')
# frequencies = clean_student_frequencies(counts_frequencies, frequencies)
# # counts_frequencies.most_common(10)

## Let's compare sequences from students in the different conditions in Activity 1
Get all frequencies of length 3 to 10, cleaning them up by removing those common to all students and the rare ones.

In [117]:
# summed_seqs, seq_scaff, seq_not = mega_process(activity='a1', level='scaffolding', value1='scaff', value2='not', shortest=3, longest=10)
# difference(seq_scaff,seq_not,N=10)

We can now look at the difference in number of students that conduct sequences in both groups.

In [118]:
## Does this difference persist in activity 2?

In [119]:
# summed_seqs, seq_scaff, seq_not = mega_process(activity='a2', level='scaffolding', value1='scaff', value2='not', shortest=3, longest=10)
# difference(seq_scaff,seq_not,N=10)

## Let's compare sequences from students from different clusters
In activity 1

In [120]:
# summed_seqs, seq_know, seq_att = mega_process(activity='a1', level='Clustergroups', value1='knowledge', value2='attitude', shortest=3, longest=10)
# difference(seq_know+seq_know,seq_att,N=10)

**Note**: Here we are doubling the number of counts for the knowledge group since it's half the size

In activity 2

In [121]:
# summed_seqs, seq_know, seq_att = mega_process(activity='a2', level='Clustergroups', value1='knowledge', value2='attitude', shortest=3, longest=10)
# difference(seq_know+seq_know,seq_att,N=10)

**Note**: Here we are doubling the number of counts for the knowledge group since it's half the size

## Let's compare sequences from students with high and low learning gain

First we need to split students by learning gains in each activity

In [122]:
def label_learning (row,column):
    if row[column] >= median_learning: return 'high'
    else: return 'low'

df_scores['learning1score'] = df_scores["post t1"] - df_scores["pre"]
median_learning = np.median(df_scores['learning1score'])
df_scores['learning1'] = df_scores.apply (lambda row: label_learning (row,"learning1score"),axis=1)

df_scores['learning2score'] = df_scores["post t2"] - df_scores["pre"]
median_learning = np.median(df_scores['learning2score'])
df_scores['learning2'] = df_scores.apply (lambda row: label_learning (row,"learning2score"),axis=1)
# len(df_scores[(df_scores['learning1']=='high') & (df_scores['learning2']=='high')])

Now we look at the different in sequences in activity 1

In [123]:
# summed_seqs, seq_high, seq_low = mega_process(activity='a1', level='learning1', value1='high', value2='low', shortest=3, longest=10)
# difference(seq_high,seq_low,N=10)



And activity 2

In [124]:
# summed_seqs, seq_high, seq_low = mega_process(activity='a2', level='learning2', value1='high', value2='low', shortest=3, longest=10)
# difference(seq_high,seq_low,N=10)

## Well, one thing we are really interested in, is what happens before/after students pause. So let's look at those sequences, and compare low/high learners.

In [125]:
# summed_seqs, seq_high, seq_low = mega_process(activity='a1', level='learning1', value1='high', 
#                                               value2='low', shortest=3, longest=10, keep = "P")
# difference(seq_high,seq_low,N=10)

In [126]:
# summed_seqs, seq_high, seq_low = mega_process(activity='a2', level='learning2', value1='high', 
#                                               value2='low', shortest=3, longest=10, keep = "P")
# difference(seq_high,seq_low,N=10)

#### Now let's pick a sequence visualize how much it's used by students

In [197]:
def plot_seq_gains(activity, seqs):
    df2 = df[df.Activity == activity]
    if activity == 'a1': score_column = "learning1score"
    else: score_column = "learning2score"
        
    blocks = get_blocks(df2,get_students())
    frequencies = get_frequencies(blocks, shortest=3, longest=6)
    sum_frequencies = sum(frequencies.values(), Counter())
    counts_frequencies = Counter({f:sum([ 1 if f in freq else 0 for freq in frequencies.values()]) for f in list(sum_frequencies)})
    totals = {student:float(sum(count.values())) for student,count in frequencies.iteritems()}

    fig, axs = plt.subplots(nrows=len(seqs),ncols=1,figsize=(15,3*len(seqs)))
    fig.suptitle('Normalize frequency of usage of sequences versus learning gain. Students are colored depending on their scaffolding.',fontsize=16)
    for seq,ax in zip(seqs,axs):
        counts = keep_frequencies_with(counts_frequencies,keep=seq)
        cleaned_frequencies = clean_student_frequencies(counts, frequencies)

        f1 = select_frequencies(cleaned_frequencies,"scaffolding","scaff")
        f2 = select_frequencies(cleaned_frequencies,"scaffolding","not")

        total1 = {student:float(sum(count.values()))/totals[student] for student,count in f1.iteritems()}
        values1 = {k:(v, float(df_scores[df_scores["student"]==k][score_column]) ) for k,v in total1.iteritems()}
        total2 = {student:float(sum(count.values()))/totals[student] for student,count in f2.iteritems()}
        values2 = {k:(v, float(df_scores[df_scores["student"]==k][score_column]) ) for k,v in total2.iteritems()}
        x,y = zip(*values1.values())
        rec1 = ax.scatter(x,y, c='r',s=100, alpha=0.4)
        x,y = zip(*values2.values())
        rec2 = ax.scatter(x,y, c='b',s=100, alpha=0.4)
        ax.axhline(0, color='darkgrey')
        ax.set_ylabel(seq)
        ax.legend((rec1, rec2), ('Scaffolded', 'Not scaffolded'))
        
#         ax.set_xlabel('Normalized frequency of usage')
#         ax.set_ylabel('Learning gain in activity 1 (post-pre)')
#         ax.set_title('Normalize frequency of usage of {0} versus learning gain. Students are colored depending on their scaffolding.'.format(seq))

### Want to make a matrix where rows are sequences, columns students, cells represent number of times sequence was performed by student

In [240]:
df2 = df[df.Activity == 'a1']
blocks = get_blocks(df2,get_students())
frequencies = get_frequencies(blocks, shortest=3, longest=6)
sum_frequencies = sum(frequencies.values(), Counter())
counts_frequencies = Counter({f:sum([ 1 if f in freq else 0 for freq in frequencies.values()]) for f in list(sum_frequencies)})
totals = {student:float(sum(count.values())) for student,count in frequencies.iteritems()}

students, sequences = frequencies.keys(), sum_frequencies.keys()
A = np.zeros((len(students), len(sequences)))
A.shape

for st,seqs in frequencies.iteritems():
    i = students.index(st)
    for sq,count in seqs.iteritems():
        j = sequences.index(sq)
        A[i][j] = count

We normalize the matrix

In [236]:
B = (A.T / np.sum(A,axis=1)).T

We calculate correlations with learning outcomes for each sequence

In [237]:
gains = [round(float(df_scores[df_scores["student"]==student]['learning1score']),5) for student in students]

from scipy.stats import pearsonr
correlation_measure = pearsonr
corr = []
for sq in sequences:
    j = sequences.index(sq)
    c = correlation_measure(matrix[:,j],gains)
    corr.append((sq, c[0], c[1]))    

Now let's keep only the significant sequences.

**Note**: we still need to do some boostrapping and permutation test to confirm that these are significant results.

In [238]:
sig_corr = []
for s,c,p in corr:
    if p <= 0.05 and abs(c) > 0.39:
        sig_corr.append((s,c,p))
        print s,c,p

IBITPR 0.39226281515 7.7284363845e-05
RPOPT 0.39226281515 7.7284363845e-05
BOROEI 0.39226281515 7.7284363845e-05
TBRPTR 0.39226281515 7.7284363845e-05
TPOTPO 0.39226281515 7.7284363845e-05
TIEBO 0.39226281515 7.7284363845e-05
TRBTRT 0.39226281515 7.7284363845e-05
POTBRO 0.39226281515 7.7284363845e-05
OBIROR 0.39226281515 7.7284363845e-05
RERIPX 0.39226281515 7.7284363845e-05
ORITBO 0.39226281515 7.7284363845e-05
BOPBO 0.39226281515 7.7284363845e-05
OETR 0.39226281515 7.7284363845e-05
RTPTI 0.39226281515 7.7284363845e-05
TOBRBT 0.39226281515 7.7284363845e-05
BOPXB 0.39226281515 7.7284363845e-05
XTRIBO 0.39226281515 7.7284363845e-05
RBRETR 0.39226281515 7.7284363845e-05
OIEOBO 0.39226281515 7.7284363845e-05
TIPITR 0.39226281515 7.7284363845e-05
IXPBOB 0.39226281515 7.7284363845e-05
BTRBT 0.39226281515 7.7284363845e-05
PTPBRO 0.39226281515 7.7284363845e-05


In [241]:
# plot_seq_gains('a1',zip(*sig_corr)[0])

Clearly, how much students did an action doesn't matter as much as who did it at least once (or more?)
Let's repeat the analysis using # of students who did the sequence instead of frequency.

In [245]:
df2 = df[df.Activity == 'a1']
blocks = get_blocks(df2,get_students())
frequencies = get_frequencies(blocks, shortest=3, longest=6)
sum_frequencies = sum(frequencies.values(), Counter())
counts_frequencies = Counter({f:sum([ 1 if f in freq else 0 for freq in frequencies.values()]) for f in list(sum_frequencies)})
totals = {student:float(sum(count.values())) for student,count in frequencies.iteritems()}

students, sequences = frequencies.keys(), sum_frequencies.keys()
A = np.zeros((len(students), len(sequences)))
A.shape

for st,seqs in frequencies.iteritems():
    i = students.index(st)
    for sq,count in seqs.iteritems():
        j = sequences.index(sq)
        if count > 3: x =1
        else: x = 0
        A[i][j] = x

In [251]:
from scipy.stats import ttest_ind
gains = np.array([round(float(df_scores[df_scores["student"]==student]['learning1score']),5) for student in students])

tests = []
for sq in sequences:
    j = sequences.index(sq)
    a = gains[np.where(A[:,j]==1)]
    b = gains[np.where(A[:,j]==0)]
    t = ttest_ind(a, b, equal_var=False)
    tests.append((sq, t[0], t[1]))

In [254]:
sig_tests = []
for s,t,p in tests:
    if p <= 0.05:
        sig_corr.append((s,t,p))
        print s, '\t', round(t,2),'\t', round(p,3)

IBOBO 	4.62 	0.001
RPTP 	5.61 	0.0
RBO 	-2.8 	0.007
PTI 	6.12 	0.003
RBORBO 	3.5 	0.003
IBOB 	4.62 	0.001
TOROB 	-6.48 	0.013
OBOBRB 	4.47 	0.001
BTB 	2.47 	0.028
TPO 	2.11 	0.05
ITPO 	13.17 	0.0
ROBT 	-19.76 	0.0
BORT 	-2.21 	0.044
