## Setup

Testing CAA Library

In [1]:
import sys
sys.path.append("../lib")

In [2]:
import math
import random
import uuid
import os
import copy
import itertools
from collections.abc import Iterable
from datetime import datetime as dt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import Bounds
from scipy.optimize import minimize
from scipy import optimize

In [3]:
import logging

#logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)
#logging.basicConfig(level=logging.WARNING)

logger = logging.getLogger("main")

In [4]:
#logging.getLogger().setLevel(logging.DEBUG)
logging.getLogger().setLevel(logging.INFO)
#logging.getLogger().setLevel(logging.WARNING)
logger.debug("Test debug")
logger.info("Test info")
logger.warning("Test warning")

INFO:main:Test info


In [5]:
from tutor.domain import Domain
from tutor.curriculum_factory import CurriculumFactory
from tutor.simple_curriculum import SimpleCurriculum
from tutor.tutor import SimpleTutor
from tutor.action import Attempt, HintRequest

from learner.selfeff_learner import SelfEfficacyLearner
from learner.modular_learner import ModularLearner
from learner.binary_skill_cog import BinarySkillCognition
from learner.decider import *

from simulate.modlearner_simulation import ModLearnerSimulation
from simulate.simulation import SimulationBatch

from analytics.batch import BatchCalculator
from analytics.student_stats import StudentStatCalc

from log_db import mongo
from log_db.curriculum_mapper import DB_Curriculum_Mapper

In [6]:
from CanonicalAutocorrelationAnalysis.model.caa import CAAComputation

In [7]:
from CanonicalAutocorrelationAnalysis.model.caa import *
from CanonicalAutocorrelationAnalysis.model.caaObject import *
from CanonicalAutocorrelationAnalysis.model.utils import l1Norm, l2Norm, r2Compute

In [8]:
# Get path to current project directory
cwd = os.path.abspath(".")
base_dir = os.path.abspath(os.path.join(cwd, os.pardir))
logger.debug("Base directory for the project:\n%s" % base_dir)

In [9]:
# Setup connection to database
data_out = "sim-%s" % str(uuid.uuid4())
data_path = os.path.join(base_dir,"test", "data", data_out)
logger.info("Writing simulation results to directory: %s" % data_path)
db_name = "motivsim"
db_params  = mongo.get_db_params(db_name)
logger.info("got db params: %s" % str(db_params))
db_util = mongo.Data_Utility(data_path, db_params)
db = db_util.db

INFO:learner.decider:Writing simulation results to directory: /rdata/Sandbox/MotivSim/test/data/sim-190871b3-acf7-4c5b-af9f-2b325f979de5
INFO:learner.decider:got db params: {'settingId': 'motivsim', 'url': 'localhost', 'port': '27017', 'name': 'motivsim', 'user': '', 'pswd': ''}


In [None]:
logger.info("Clearing database before starting new simulation")
db_util.clear_db()

## 2. EDA of simulated Data

In [10]:
# Test db connection
db_util.peak()

INFO:log_db.mongo:collection name, kcs, has 1126 documents
INFO:log_db.mongo:collection name, simbatches, has 4 documents
INFO:log_db.mongo:collection name, domains, has 56 documents
INFO:log_db.mongo:collection name, students, has 707 documents
INFO:log_db.mongo:collection name, units, has 9 documents
INFO:log_db.mongo:collection name, decisions, has 3144520 documents
INFO:log_db.mongo:collection name, curriculums, has 3 documents
INFO:log_db.mongo:collection name, steps, has 25107 documents
INFO:log_db.mongo:collection name, finalsimstudents, has 697 documents
INFO:log_db.mongo:collection name, problems, has 6178 documents
INFO:log_db.mongo:collection name, actions, has 3144520 documents
INFO:log_db.mongo:collection name, sections, has 44 documents
INFO:log_db.mongo:collection name, tutor_events, has 2835296 documents


In [14]:
# Get available batches of simulated students
batch_list = [batch for batch in db.simbatches.find()]
batch_desc = ["Simple diligent students",
              "Diligent Students with variable values",
              "Diligent Students with domain-level self-efficacy",
              "Test BIRT Batch"
             ]
batches = {}
for i, batch in enumerate(batch_list):
    logger.info(f"batch #{i}: \tID: {batch['_id']}\tdesc: {batch['desc']}")
    
    if batch['desc'] not in batches:
        batches[batch['desc']] = [batch]
    else:
        batches[batch['desc']].append(batch)
    logger.info(f"{len(batches[batch['desc']])} batch(s) with description: {batch['desc']}")

INFO:learner.decider:batch #0: 	ID: 21fe12a8-82d3-46c6-8638-ee1ee80961e2	desc: Simple diligent students
INFO:learner.decider:1 batch(s) with description: Simple diligent students
INFO:learner.decider:batch #1: 	ID: de394fe3-bd4d-4dc0-a165-020ff208bba2	desc: Diligent Students with variable values
INFO:learner.decider:1 batch(s) with description: Diligent Students with variable values
INFO:learner.decider:batch #2: 	ID: 9d8a46e8-b52c-406e-8b59-b595b443e64a	desc: Diligent Students with domain-level self-efficacy
INFO:learner.decider:1 batch(s) with description: Diligent Students with domain-level self-efficacy
INFO:learner.decider:batch #3: 	ID: 4e54dd98-7ec5-455e-81c4-7fc643e0d37f	desc: Test BIRT Batch
INFO:learner.decider:1 batch(s) with description: Test BIRT Batch


In [13]:
# Calculating stats with analytic methods

calc = StudentStatCalc(db)
batcher = BatchCalculator()

In [21]:
sid = batches[batch_desc[3]][0]['student_ids']
logger.info(f"Got {len(sid)} student IDs")

INFO:learner.decider:Got 2 student IDs


In [26]:
sim_students, runtime = batcher.time_batch(calc.get_stu_parameters, sid, 2)
logger.info(f"Calculated student params: {sim_students.shape}\tRuntime: {runtime} seconds")

INFO:learner.decider:Calculated student params: (2, 23)	Runtime: 0.024755 seconds


In [27]:
action_dist, runtime = batcher.time_batch(calc.action_stats, sid, 2)
logger.info(f"Calculated student action stats: {action_dist.shape}\tRuntime: {runtime} seconds")

INFO:learner.decider:Calculated student action stats: (2, 9)	Runtime: 2.972665 seconds


In [25]:
tx_stats, runtime = batcher.time_batch(calc.total_tx_stats, sid, 2)
logger.info(f"Calculated student activity stats: {tx_stats.shape}\tRuntime: {runtime} seconds")

ValueError: Wrong number of items passed 0, placement implies 1

In [46]:
    
stu_stats = pd.concat([sim_students, action_dist, tx_stats], axis=1)
logger.info(f"Merged new stats together: {stu_stats.shape}")

INFO:learner.decider:Calculating for student set 0
INFO:learner.decider:Calculated student params: (100, 23)	Runtime: 0.330774 seconds
INFO:learner.decider:Calculated student action stats: (100, 9)	Runtime: 206.061496 seconds
INFO:learner.decider:Calculated student activity stats: (100, 9)	Runtime: 200.856345 seconds
INFO:learner.decider:Merged new stats together: (100, 41)
INFO:learner.decider:Calculating for student set 1
INFO:learner.decider:Calculated student params: (100, 23)	Runtime: 0.272403 seconds
INFO:learner.decider:Calculated student action stats: (100, 9)	Runtime: 239.062265 seconds
INFO:learner.decider:Calculated student activity stats: (100, 9)	Runtime: 214.651635 seconds
INFO:learner.decider:Merged new stats together: (100, 41)
INFO:learner.decider:Calculating for student set 2
INFO:learner.decider:Calculated student params: (100, 23)	Runtime: 0.295801 seconds
INFO:learner.decider:Calculated student action stats: (100, 9)	Runtime: 200.429601 seconds
INFO:learner.decider

In [19]:
import itertools

In [53]:
def get_tx_fields(sids, fields):
    tx = pd.DataFrame(db.tutor_events.find({"stu_id": {'$in': sids}}))
    # Add kc field that reduces list of kcs to 1 kc
    tx['kc'] = tx.apply(lambda x: x['kcs'][0]['_id'], axis=1)
    return tx.loc[:, fields]

In [54]:
fields = ["_id", 'stu_id', 'kc', 'unit_id', 'section_id', 'prob_id', 'step_id', "duration"]
tx, runtime = batcher.time_batch(get_tx_fields, sid[:2], 1, fields)
logger.info(f"Extracted tx for set of students in {runtime} seconds: {tx.shape}")

INFO:learner.decider:Extracted tx for set of students in 4.580158 seconds: (37023, 8)


In [55]:
stu_id = random.choice(sid)
logger.info(f"getting tx for student with id: {stu_id}")
tx = pd.DataFrame(db.tutor_events.find({"stu_id": stu_id}))
print(tx.shape)
print(tx.columns)
tx.index = tx["_id"]

INFO:learner.decider:getting tx for student with id: 009f2978-63e8-4a1c-ae5f-bb6c0627fd19


(18256, 17)
Index(['_id', 'type', 'time', 'curric_id', 'unit_id', 'section_id', 'prob_id',
       'step_id', 'stu_id', 'duration', 'outcome', 'kcs', 'plt', 'plt1',
       'hints_used', 'hints_avail', 'attempt'],
      dtype='object')


In [56]:
cols = ["duration", "outcome", "plt", "plt1", "hints_used", "hints_avail", "attempt"]
d = tx.loc[:, cols] 

In [57]:
#Change time field to time since first tx
min_time = d["time"].min()
tts = (d["time"] - min_time).apply(lambda x: x.total_seconds())
d["time"] = tts


KeyError: 'time'

In [58]:
# One-hot encode "outcome"
print(d.shape)
outcome_cols = pd.get_dummies(d['outcome'], drop_first=True)
d = pd.concat([d, outcome_cols], axis=1)
print(d.shape)
d.drop(columns=["outcome"], inplace=True)
print(d.shape)

(18256, 7)
(18256, 9)
(18256, 8)


In [59]:
d.sort_values(by="duration").head()

Unnamed: 0_level_0,duration,plt,plt1,hints_used,hints_avail,attempt,Hint,Incorrect
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
02f737e4-bb7b-4145-a270-26b26c55c70b,0.25,0.980979,0.980979,0,3,0,0,0
45f31967-1eb2-4f74-81ab-1ddddf74b388,0.25,0.976403,0.976403,1,2,2,0,1
23be2baa-8a65-4d86-83ae-3d1e6169d5dc,0.25,0.347849,0.347849,0,3,1,0,0
9ea6aa42-2ebc-456d-991a-3096d82dd373,0.25,0.947491,0.947491,0,3,0,0,0
c6788fde-69a4-426a-b429-041853ca1fa3,0.25,0.415144,0.415144,0,3,1,0,0


### Notes



* Matrix must me all numerical
** Convert categorical columns to one-hot encoded
** Ensure one-hot encoding drops one value to remove multi-colinearity
* Convert Dataframe to numpy array (DataFrame.to_numpy())


## Test CAA Code

In [62]:
d.shape

(18256, 8)

In [63]:
d.head()

Unnamed: 0_level_0,duration,plt,plt1,hints_used,hints_avail,attempt,Hint,Incorrect
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6b55c262-9caa-4ff6-a9f1-94375314c9b5,21.18302,0.297174,0.747215,0,3,0,0,0
62f10773-0a66-40a9-9aba-8db48957f369,5.89601,0.242252,0.317893,0,3,0,0,1
bf73be9e-229f-47ae-8c72-595d6e741b45,4.781048,0.317893,0.317893,0,3,1,0,1
3a4c0baf-356f-4e87-bff3-8db3fcb3a635,7.516365,0.317893,0.317893,0,3,2,0,1
6a5062a2-60af-4ee9-b84e-d25de82143f2,2.406777,0.317893,0.317893,0,3,3,1,0


In [60]:
caa = CAAComputation(d.to_numpy(), 0.35, 0.35)

In [66]:
for key in caa.__dict__:
    print(key)

US
VS
projections
ds
rs
penalty1
penalty2
trainingData
mean
std


In [61]:
for i, proj in enumerate(caa.projections):
    logger.info(f"Projection #{i}")
    logger.info("---- U ----")
    for col, val in zip(d.columns, proj.u.tolist()[0]):
        logger.info(f"Column: {col}\t{val}")
        
    logger.info("---- V ----")
    for col, val in zip(d.columns, proj.v.tolist()[0]):
        logger.info(f"Column: {col}\t{val}")


INFO:learner.decider:Projection #0
INFO:learner.decider:---- U ----
INFO:learner.decider:Column: duration	-0.0
INFO:learner.decider:Column: plt	0.0
INFO:learner.decider:Column: plt1	7.325756281035037e-06
INFO:learner.decider:Column: hints_used	-0.9559014333753064
INFO:learner.decider:Column: hints_avail	0.0
INFO:learner.decider:Column: attempt	-0.29368767358772113
INFO:learner.decider:Column: Hint	0.0
INFO:learner.decider:Column: Incorrect	0.0
INFO:learner.decider:---- V ----
INFO:learner.decider:Column: duration	-0.0
INFO:learner.decider:Column: plt	0.0
INFO:learner.decider:Column: plt1	0.0
INFO:learner.decider:Column: hints_used	-0.0
INFO:learner.decider:Column: hints_avail	1.0
INFO:learner.decider:Column: attempt	-0.0
INFO:learner.decider:Column: Hint	0.0
INFO:learner.decider:Column: Incorrect	0.0
INFO:learner.decider:Projection #1
INFO:learner.decider:---- U ----
INFO:learner.decider:Column: duration	0.0
INFO:learner.decider:Column: plt	1.0
INFO:learner.decider:Column: plt1	0.0
INF

In [121]:
Projection.distance(caa.projections[0], caa.projections[1])

2.8284208332643277

## Test CAA Implementation

In [76]:
dataPoints = d.to_numpy()

In [77]:
penalty1 = 0.35
penalty2 = 0.35
maxProj = None
minr2 = None
scale = True
doubleInit = True
orthogonality = False

In [78]:
row, features = dataPoints.shape


In [79]:
d.head()

Unnamed: 0_level_0,time,plt,plt1,hints_used,hints_avail,attempt,Hint,Incorrect,Hint,Incorrect
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2e0f5ecf-47de-4cc3-9df1-39a90d924387,0.0,0.445192,0.284751,0,3,0,1,0,1,0
3dba0c56-d692-4934-9854-c9364eeba6a4,16.982,0.284751,0.284751,1,2,1,0,0,0,0
998b39d9-e9d6-458a-bc4b-9ee72f76dc2d,21.078,0.333106,0.6646,0,3,0,0,0,0,0
c47e1913-77b0-493e-9132-f51fa9c76104,23.704,0.6646,0.874762,0,3,0,0,0,0,0
adfd1a8a-45ef-4a8b-945c-3117edc683f6,27.32,0.874762,0.9598,0,3,0,0,0,0,0


In [80]:

assert(1./features <= penalty1 and penalty1 <= 1)
assert(1./features <= penalty2 and penalty2 <= 1)
assert(minr2 is None or minr2 <= 1)

if maxProj is None:
    maxProj = features

if scale:
    std = np.std(dataPoints, axis = 0)
    std[std == 0] = 1
    X = (dataPoints - np.mean(dataPoints, axis = 0)) / std
else:
    X = dataPoints


In [81]:
np.matmul(X.T, X)

array([[ 31092.        ,  -1516.57698521,  -1461.46821528,
          -810.88843529,    810.88843529,   3210.69349333,
         -1062.32213109,   2372.14097455,  -1062.32213109,
          2372.14097455],
       [ -1516.57698521,  31092.        ,  30256.6237758 ,
         -3184.94173363,   3184.94173363, -11203.40741957,
           507.48670871, -11155.51382826,    507.48670871,
        -11155.51382826],
       [ -1461.46821528,  30256.6237758 ,  31092.        ,
         -3658.96062114,   3658.96062114, -11556.04766753,
          -750.50939885, -12162.01882203,   -750.50939885,
        -12162.01882203],
       [  -810.88843529,  -3184.94173363,  -3658.96062114,
         31092.        , -31092.        ,   8740.81697747,
         -3332.15589369,  -2527.94222145,  -3332.15589369,
         -2527.94222145],
       [   810.88843529,   3184.94173363,   3658.96062114,
        -31092.        ,  31092.        ,  -8740.81697747,
          3332.15589369,   2527.94222145,   3332.15589369,
          2

In [82]:
Co = np.matmul(X.T, X) / row
uList, vList, rSquare, dList = [], [], [], []

In [83]:
# Remove diagonal values to avoid max
Co[np.diag_indices_from(Co)] = 0


In [89]:

    
for _ in range(maxProj):
    maxCorr = np.unravel_index(np.argmax(np.abs(Co), axis=None), Co.shape)

    u, v = np.zeros((1, features)), np.zeros((1, features))
    u[0, maxCorr[0]] = 1.
    v[0, maxCorr[1]] = 1.

    if doubleInit:
        c1 = c2 = 0.5 * np.sqrt(features)
        try:
            u, v = computeProjection(Co, u, v, c1, c2)
        except:
            logger.info("Return CAA 1")

            #return CAA(uList, vList, dList, rSquare, penalty1, penalty2, dataPoints)

    c1 = penalty1 * np.sqrt(features)
    c2 = penalty2 * np.sqrt(features)
    try:
        u, v = computeProjection(Co, u, v, c1, c2)
    except:
        logger.info("Return CAA 2")
        # return CAA(uList, vList, dList, rSquare, penalty1, penalty2, dataPoints)
    d = np.dot(np.dot(u,Co),v.T).flatten()
    r = r2Compute(u, v, X).flatten()

    # Append values to the list
    if minr2 is None or r >= minr2:
        uList.append(u)
        vList.append(v)
        dList.append(d)
        rSquare.append(r)
        
    # Update Correlation Matrix
    Co -= d * (np.matmul(u.T,v) + np.matmul(v.T,u))
    if orthogonality:
        selection = np.ones_like(Co)
        notNull = (np.abs(u) + np.abs(v) != 0).flatten()
        selection[notNull,:] = 0
        selection[:,notNull] = 0
        Co[selection == 0] = 0

INFO:learner.decider:Return CAA 2
INFO:learner.decider:Return CAA 1
INFO:learner.decider:Return CAA 1
INFO:learner.decider:Return CAA 2
INFO:learner.decider:Return CAA 1
INFO:learner.decider:Return CAA 2
