In [1]:
import math
import os
import re
import sys
from collections import Counter

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages

from functions import *
from mining_functions import *
# Show floats with two decimals in numpy and pandas output.
np.set_printoptions(precision=2)
# Use the fully qualified option name: the bare 'precision' pattern is
# ambiguous (or rejected) on newer pandas releases, while
# 'display.precision' is accepted by old and new versions alike.
pd.set_option('display.precision', 2)
# %matplotlib inline
matplotlib.style.use('ggplot')

In [2]:
PATH = '/Google Drive/Jonathan Sarah Ido folder/data/CCK/'
def get_path(path = PATH):
    """Return ``path`` appended to the current user's home directory.

    Args:
        path: Location relative to the home directory, written with forward
            slashes and a leading slash. Defaults to ``PATH``.

    Returns:
        ``$HOME + path`` on POSIX systems, or ``%USERPROFILE%`` followed by
        ``path`` with every ``/`` turned into a backslash on Windows.

    Raises:
        KeyError: if the home-directory environment variable is not set.
        Exception: if ``os.name`` is neither ``'posix'`` nor ``'nt'``.
    """
    if os.name == 'posix':
        return os.environ['HOME'] + path
    elif os.name == 'nt':
        # Windows paths use backslashes; convert the separators of `path`.
        return os.environ['USERPROFILE'] + path.replace('/', '\\')
    else:
        raise Exception('OS not recognized. I\'m confused.')
# Load the parsed user-action log from the local git checkout.
gitpath = '/git/Phet-log-analyzer/cck/raw_data_parsing_check/'
df = pd.read_csv(get_path(path = gitpath) + 'phet_cck_user_actions+sophistication_WITHPAUSE_more_circuit_info.txt',index_col=False)
df["student"] = df["student"].astype('category')
# Give every family-label column the same capitalisation.
for family_column in ["Family", "Family_tool", "Family_default", "Family_both"]:
    df[family_column] = df[family_column].str.capitalize()

# `data_path` is not defined in any visible cell, so on a fresh kernel the
# read below would fail with a NameError unless a star-import provides it.
# Fall back to the default data folder only when the name is missing.
# NOTE(review): confirm the scores file really lives under get_path().
try:
    data_path
except NameError:
    data_path = get_path()

df_scores = pd.read_csv(data_path + 'MATCHING_phet_cck_user_data_anonymized.txt')
df_scores["student"] = df_scores["student"].astype('category')

### Functions used to calculate information gain, plot use, etc...

In [3]:
def add_text(attribute,family_category,N, shortest_seq_length, longest_seq_length,B):
    """Create a text-only figure summarising the parameters of one run.

    Args:
        attribute: Name of the attribute the students are split by.
        family_category: Name of the action-category set in use.
        N: Usage cut-off reported in the "Removed sequences" line.
        shortest_seq_length: Shortest sequence length reported.
        longest_seq_length: Longest sequence length reported.
        B: Number of time bins reported.

    Returns:
        The new 10x4 inch matplotlib figure, with the axes hidden and the
        summary text centred at (0.5, 0.5).
    """
    text = """Showing sequences for students split by {0}, using the categories {1}.
            Removed sequences used by less than {2}.
            Found sequences of length {3} to {4}.
            Using {5} time bins""".format(attribute,family_category,N, shortest_seq_length, longest_seq_length,B)
    fig = plt.figure(figsize=(10, 4))
    # Work on this figure's own axes rather than pyplot's implicit state.
    ax = fig.gca()
    ax.text(0.5, 0.5, text,
            horizontalalignment='center',
            verticalalignment='center',
            fontsize=15)
    ax.axis('off')
    return fig

In [4]:
def rank_sequences(sequence_counts,B,axesnum=None):
    """Score every sequence with ``calc_infogain`` and sort by that score.

    Args:
        sequence_counts: Mapping from sequence to its counts; each counts
            value is handed unchanged to ``calc_infogain``.
        B: Number of time bins, forwarded to ``calc_infogain``.
        axesnum: Axis selector forwarded to ``calc_infogain``.

    Returns:
        A list of ``(sequence, score)`` tuples sorted by ascending score, so
        the highest-scoring sequences come last.
    """
    # .items() works on Python 2 and 3 dicts (and on pandas Series), unlike
    # the Python-2-only .iteritems().
    ranks = [(seq, calc_infogain(counts, B, axesnum))
             for seq, counts in sequence_counts.items()]
    return sorted(ranks, key=lambda tup: tup[1])

def get_top_seqs(ranks,N):
    """Return the last ``N`` entries of ``ranks``.

    ``ranks`` is expected to be sorted in ascending order (as produced by
    ``rank_sequences``), so the result holds the ``N`` best-ranked entries,
    still in ascending order. If ``ranks`` has fewer than ``N`` entries, all
    of them are returned.

    A non-positive ``N`` yields an empty list. A plain ``ranks[-N:]`` would
    return the *whole* list for ``N == 0`` because ``-0 == 0``.
    """
    if N <= 0:
        return []
    return ranks[-N:]

### First we split the students

In [5]:
# Histogram of the 'pre' column, used to judge where to split the students.
df_scores.hist(column='pre')

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000000AA3EE48>]], dtype=object)

#### Since the pretest has a bimodal distribution, we split the students in two: high and low pre. We then split the students by their performance on the resistor activity, using as threshold the median post score of the low-pre students only.

In [6]:
# Students are labelled against this pretest threshold.
pre_threshold = 0.6
df_scores['split pre'] = df_scores.apply (lambda row: label_learning (pre_threshold,row,"pre"),axis=1)

# The post threshold is the median 'z post t2' score of the students below
# the pretest threshold. Reuse `pre_threshold` rather than repeating the
# literal so the two can never drift apart when the threshold is tuned.
low_pre_mask = df_scores["pre"] < pre_threshold
post_threshold = np.median(df_scores[low_pre_mask]['z post t2'])
df_scores['split post t2'] = df_scores.apply (lambda row: label_learning (post_threshold,row,"z post t2"),axis=1)

In [7]:
# Number of distinct students in each (split pre, split post t2) group.
for x in ['high','low']:
    for y in ['high','low']:
        in_group = (df_scores['split pre']==x)&(df_scores['split post t2']==y)
        n_students = len(set(df_scores[in_group]['student']))
        # A single parenthesised argument prints the same text on Python 2
        # and Python 3; the double spaces reproduce the original output.
        print("{0} pre,  {1} post:  {2}".format(x, y, n_students))

high pre,  high post:  19
high pre,  low post:  3
low pre,  high post:  38
low pre,  low post:  36


#### In each group, how many had used the phet before?

In [8]:
# Sum of the 'used this circuit sim before?' column within each
# (split pre, split post t2) group.
for x in ['high','low']:
    for y in ['high','low']:
        in_group = (df_scores['split pre']==x)&(df_scores['split post t2']==y)
        n_prior_users = sum(df_scores[in_group]['used this circuit sim before?'])
        # A single parenthesised argument prints the same text on Python 2
        # and Python 3; the double spaces reproduce the original output.
        print("{0} pre,  {1} post:  {2}".format(x, y, n_prior_users))

high pre,  high post:  5
high pre,  low post:  3
low pre,  high post:  15
low pre,  low post:  14


Since prior users of the sim are spread roughly evenly across the groups, we can disregard this factor as a covariate.

### We find sequences for each student per group per time bin.
Using the following parameters

In [9]:
# Parameters for the sequence search; they are read by the report cell below.
students = get_students()
CUT_OFF_SEQ_USE = 0.2 # fraction of students: keep only sequences used at least once by at least 20% of students
# Same cut-off expressed as a number of students (truncated to an int).
N = int(CUT_OFF_SEQ_USE*len(students))
shortest_seq_length = 2
longest_seq_length = 12
B = 4  #number of bins

### PARAMETERS
# Each entry is (column of df_scores to split on, first level, second level).
attributes = [('split pre','high', 'low'),
              ('split post t2','high', 'low')]

# Columns of df that provide the action categories to analyse.
family_categories = ["Family","Family_tool","Family_default"]
# family_categories = ["Family"]

# Debugging snippet kept for reference. It needs family_category, attribute,
# level1 and level2 to be set first (they are loop variables further down).
# sequence_counts = get_sequence_use_by_timebin(df,students,family_category,B,attribute,level1,level2,shortest_seq_length,longest_seq_length,N)
# for k,v in sequence_counts.iteritems():
#     print k,v, np.sum(v)

### We can calculate the information gain of each sequence by time bin

In [10]:
# Human-readable label for each axis argument passed to rank_sequences.
parse_axis = {0:'time',1:'group',None:'time and group'}

# Write one PDF: for every (attribute, family category) pair a parameter
# summary page followed by heat maps of the top-ranked sequences per axis.
pdf = PdfPages('infogain_results.pdf')
try:
    for attribute,level1,level2 in attributes:
        for family_category in family_categories:
            # Single parenthesised argument: same output on Python 2 and 3.
            print("For attribute {0}, categories {1}".format(attribute,family_category))
            title_page = add_text(attribute,family_category,N, shortest_seq_length, longest_seq_length,B)
            pdf.savefig(title_page)
            # Close each figure once saved so they do not pile up in memory.
            plt.close(title_page)
            sequence_counts = get_sequence_use_by_timebin(df,students,family_category,B,attribute,level1,level2,shortest_seq_length,longest_seq_length,N)
            ylabels = [level1,level2]
            for axis in [0,1,None]:
                # Keep the three highest-ranked sequences for this axis.
                tops = get_top_seqs(rank_sequences(sequence_counts,B,axis),3)
                for seq,infogain in tops:
                    title = '{0}: infogain {1} by {2}'.format(seq,round(infogain,3),parse_axis[axis])
                    plot = plot_heat_map(sequence_counts[seq],title, ylabels)
                    pdf.savefig( plot )
                    plot.clf()
                    plt.close(plot)
finally:
    # Always finalise the PDF, even if a step above raised, so the file
    # handle is released and the pages already saved are written out.
    pdf.close()

For attribute split pre, categories Family
Getting sequence use over 4 time bins for 96 students split by split pre. 
            Keeping only sequences used once by at least 19 students.
For attribute split pre, categories Family_tool
Getting sequence use over 4 time bins for 96 students split by split pre. 
            Keeping only sequences used once by at least 19 students.




For attribute split pre, categories Family_default
Getting sequence use over 4 time bins for 96 students split by split pre. 
            Keeping only sequences used once by at least 19 students.
For attribute split post t2, categories Family
Getting sequence use over 4 time bins for 96 students split by split post t2. 
            Keeping only sequences used once by at least 19 students.
For attribute split post t2, categories Family_tool
Getting sequence use over 4 time bins for 96 students split by split post t2. 
            Keeping only sequences used once by at least 19 students.
For attribute split post t2, categories Family_default
Getting sequence use over 4 time bins for 96 students split by split post t2. 
            Keeping only sequences used once by at least 19 students.
