## Setup

Testing CAA Library

In [1]:
import sys
sys.path.append("../lib")

In [2]:
import math
import random
import uuid
import os
import copy
import itertools
from collections.abc import Iterable
from datetime import datetime as dt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.optimize import Bounds
from scipy.optimize import minimize
from scipy import optimize

from sklearn.cluster import DBSCAN

import statsmodels.api as sm


In [3]:
import logging

#logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)
#logging.basicConfig(level=logging.WARNING)

logger = logging.getLogger("main")

In [4]:
# logging.getLogger().setLevel(logging.DEBUG)
logging.getLogger().setLevel(logging.INFO)
#logging.getLogger().setLevel(logging.WARNING)
logger.debug("Test debug")
logger.info("Test info")
logger.warning("Test warning")

INFO:main:Test info


In [5]:
from tutor.domain import Domain
from tutor.curriculum_factory import CurriculumFactory
from tutor.simple_curriculum import SimpleCurriculum
from tutor.tutor import SimpleTutor
from tutor.action import Attempt, HintRequest

from learner.selfeff_learner import SelfEfficacyLearner
from learner.modular_learner import ModularLearner
from learner.binary_skill_cog import BinarySkillCognition
from learner.decider import *

from simulate.modlearner_simulation import ModLearnerSimulation
from simulate.simulation import SimulationBatch
from simulate.self_eff_simulation import SelfEffSimulation
from simulate.modlearner_simulation import ModLearnerSimulation

from log_db import mongo
from log_db.curriculum_mapper import DB_Curriculum_Mapper
from log_db.learner_mapper import DBLearnerMapper

from analytics.batch import *
from analytics.student_stats import StudentStatCalc
from analytics.featurization import *
from analytics.cae import *

In [6]:
from CanonicalAutocorrelationAnalysis.model.caa import CAAComputation
from CanonicalAutocorrelationAnalysis.model.caaObject import *
from CanonicalAutocorrelationAnalysis.model.utils import l1Norm, l2Norm, r2Compute

In [7]:
# Get path to current project directory
cwd = os.path.abspath(".")
base_dir = os.path.abspath(os.path.join(cwd, os.pardir))
logger.debug("Base directory for the project:\n%s" % base_dir)

In [8]:
# Setup connection to database
data_out = "sim-%s" % str(uuid.uuid4())
data_path = os.path.join(base_dir,"test", "data", data_out)
logger.info("Writing simulation results to directory: %s" % data_path)
db_name = "motivsim"
db_params  = mongo.get_db_params(db_name)
logger.info("got db params: %s" % str(db_params))
db_util = mongo.Data_Utility(data_path, db_params)
db = db_util.db

INFO:analytics.cae:Writing simulation results to directory: /rdata/Sandbox/MotivSim/test/data/sim-00664f3a-5bf6-4d52-b06c-f329cf0d7f1e
INFO:analytics.cae:got db params: {'settingId': 'motivsim', 'url': 'localhost', 'port': '27017', 'name': 'motivsim', 'user': '', 'pswd': ''}


In [9]:
clear_db = False
if clear_db:
    logger.info("Clearing database before starting new simulation")
    #db_util.clear_db()
else:
    logger.info("Skipping Clearing database")

INFO:analytics.cae:Skipping Clearing database


## 1. Simulate Data

In [10]:
run_sim = False
num_students=20
sim_batch_desc = "Test BIRT Batch"

In [11]:
def gen_test_curric(db, db_params):
    domain_params = {'m_l0': 0.45,
                     'sd_l0': 0.155,
                     'm_t': 0.25,
                     'sd_t': 0.13,#0.03,
                     'm_s': 0.155,
                     'sd_s': 0.055,
                     'm_g': 0.15,#0.6,
                     'sd_g': 0.105
                    }
    curric_params = {'num_units': 2,
                     'mean_sections': 4,
                     'stdev_sections': 2,
                     'mean_unit_kcs': 22,
                     'stdev_unit_kcs': 23,
                     'section_kcs_lambda': 6,
                     'mean_steps': 10,
                     'stdev_steps': 4,
                     'mean_prob_kcs': 6,
                     'stdev_prob_kcs': 3,
                     'num_practice': 100
                    }

    domain, curric = CurriculumFactory.gen_curriculum(domain_params, curric_params)
    db.domains.insert_one(domain.to_dict())
    db.kcs.insert_many([kc.__dict__ for kc in domain.kcs])
    curric_util = DB_Curriculum_Mapper(db_params)
    curric_util.write_to_db(curric)

    return domain, curric

In [12]:
def gen_students(num_students, domain, curric, persist=True):
    stus = []
    for i in range(num_students):
        cog = BinarySkillCognition(domain)
        ev_decider = EVDecider()
        decider = DiligentDecider(ev_decider)
        stu = ModularLearner(domain, cog, decider)
        stus.append(stu)

    return stus


def sim_students(db, num_students, domain, curric):
    students = gen_students(num_students, domain, curric)
    logger.info(f"Persisting {len(students)} initialized students to db")
    db.students.insert_many([stu.to_dict() for stu in students])
    # Init simulation batch
    batch = SimulationBatch(sim_batch_desc)

    # Simulate Students
    for i, stu in enumerate(students):
        logger.info("Simulating student #%i" % i)
        sim = ModLearnerSimulation(domain, curric, stu)
        batch.add_sim(sim)
        sim.run()

    logger.info("Inserting %i simulated students to db" % len(students))
    result = db.finalsimstudents.insert_many([stu.to_dict() for stu in students])
    logger.info("Db insert success: %s" % result.acknowledged)

    logger.info("Inserting simulation batch to db")
    result = db.simbatches.insert_one(batch.to_dict())
    logger.info("Db insert success: %s" % result.acknowledged)

    return batch, students

In [13]:
simbatch = db.simbatches.find_one({"desc": sim_batch_desc})
if simbatch is None:
    logger.info("Generating new simulation. None found in db")

    # generate simualted data for test
    domain, curric = gen_test_curric(db, db_params)
    batch, students = sim_students(db, num_students, domain, curric)
    logger.info(f"Simulated {len(students)} in batch with id: {batch._id}")

else:
    logger.info(f"Found simulation batch: {str(simbatch)}")
    lmapper = DBLearnerMapper(db)
    students = [lmapper.get_modlearner_from_db(sid) for sid in simbatch['student_ids']]
    batch = SimulationBatch.from_dict(simbatch)
    logger.info(f"Recovered {len(students)} students from batch with id: {batch._id}")


INFO:analytics.cae:Found simulation batch: {'_id': '95273d3e-f9b2-4edc-8ed9-117c62dea350', 'run_time': datetime.datetime(2020, 12, 20, 22, 59, 19, 77000), 'desc': 'Test BIRT Batch', 'student_ids': ['f97f9d07-5048-4434-81f2-18bc23331c32', '367f9ea2-9c2a-46aa-b747-172f00cc994f', '5bc27c57-33e7-4479-84d0-744c3e38096c', 'e486a130-8382-4a24-86d0-e26081651938', 'fb149dff-426a-4d1f-9cc5-2b4ea8d2178d', '8ff912ee-f2a2-4224-970f-1d29cf213f44', 'fd5d46fb-8bb0-480b-8fa1-4b66c3d71db8', '3384137f-26f4-477b-83b8-cead72b10646', '6d1e7c67-7127-45e3-88f4-462446be123a', '18b22ae5-534d-4616-bb88-07847a3e9d1b', 'a3badde6-1754-4360-84a0-042e4ba64350', '40709376-ccdf-44dc-868a-ed6f06bf9461', 'a88c1ff5-9e6b-4194-9b1e-edbf78f93260', 'fa5deb56-4126-4a97-9708-7e7c52f0e8d7', '5e4b9b36-52ec-4af5-862d-2de1aeaeaccf', 'a72add58-1751-46fb-b70b-1c1a5512aa03', '809b51a7-069a-47be-adac-a93dfa8f100d', '8d2fbcdf-60f3-4ea8-9a0a-ed16566ebc06', 'f852d44e-a2db-42f0-ac14-06830becb6ae', '88504e70-a0b4-4eec-b6b2-f563623bb071']}
I

## 2. EDA of simulated Data

In [14]:
# Test db connection
db_util.peak()

INFO:log_db.mongo:collection name, kcs, has 62 documents
INFO:log_db.mongo:collection name, caa_batches, has 3 documents
INFO:log_db.mongo:collection name, simbatches, has 1 documents
INFO:log_db.mongo:collection name, caa_models, has 60 documents
INFO:log_db.mongo:collection name, domains, has 1 documents
INFO:log_db.mongo:collection name, students, has 20 documents
INFO:log_db.mongo:collection name, units, has 2 documents
INFO:log_db.mongo:collection name, decisions, has 77972 documents
INFO:log_db.mongo:collection name, curriculums, has 1 documents
INFO:log_db.mongo:collection name, steps, has 7173 documents
INFO:log_db.mongo:collection name, finalsimstudents, has 20 documents
INFO:log_db.mongo:collection name, problems, has 1774 documents
INFO:log_db.mongo:collection name, actions, has 77972 documents
INFO:log_db.mongo:collection name, sections, has 10 documents
INFO:log_db.mongo:collection name, tutor_events, has 72892 documents


In [15]:
# Get available batches of simulated students
batch_list = [batch for batch in db.simbatches.find()]
batch_desc = ["Simple diligent students",
              "Diligent Students with variable values",
              "Diligent Students with domain-level self-efficacy",
              "Test BIRT Batch"
             ]
batches = {}
for i, batch in enumerate(batch_list):
    logger.info(f"batch #{i}: \tID: {batch['_id']}\tdesc: {batch['desc']}")
    
    if batch['desc'] not in batches:
        batches[batch['desc']] = [batch]
    else:
        batches[batch['desc']].append(batch)
    logger.info(f"{len(batches[batch['desc']])} batch(s) with description: {batch['desc']}")

INFO:analytics.cae:batch #0: 	ID: 95273d3e-f9b2-4edc-8ed9-117c62dea350	desc: Test BIRT Batch
INFO:analytics.cae:1 batch(s) with description: Test BIRT Batch


In [16]:
# Calculating stats with analytic methods

calc = StudentStatCalc(db)
batcher = BatchCalculator()

In [17]:
sid = batches[batch_desc[3]][0]['student_ids']
logger.info(f"Got {len(sid)} student IDs")

INFO:analytics.cae:Got 20 student IDs


In [18]:
sim_students, runtime = batcher.time_batch(calc.get_stu_parameters, sid, 2)
logger.info(f"Calculated student params: {sim_students.shape}\tRuntime: {runtime} seconds")

INFO:analytics.cae:Calculated student params: (20, 23)	Runtime: 0.191179 seconds


In [19]:
action_dist, runtime = batcher.time_batch(calc.action_stats, sid, 2)
logger.info(f"Calculated student action stats: {action_dist.shape}\tRuntime: {runtime} seconds")

INFO:analytics.cae:Calculated student action stats: (20, 9)	Runtime: 3.094064 seconds


In [20]:
tx_stats, runtime = batcher.time_batch(calc.total_tx_stats, sid, 2)
logger.info(f"Calculated student activity stats: {tx_stats.shape}\tRuntime: {runtime} seconds")

ValueError: Wrong number of items passed 0, placement implies 1

In [None]:
    
stu_stats = pd.concat([sim_students, action_dist, tx_stats], axis=1)
logger.info(f"Merged new stats together: {stu_stats.shape}")

In [21]:
import itertools

In [22]:
def get_tx_fields(sids, fields):
    tx = pd.DataFrame(db.tutor_events.find({"stu_id": {'$in': sids}}))
    # Add kc field that reduces list of kcs to 1 kc
    tx['kc'] = tx.apply(lambda x: x['kcs'][0]['_id'], axis=1)
    return tx.loc[:, fields]

In [23]:
fields = ["_id", 'stu_id', 'kc', 'unit_id', 'section_id', 'prob_id', 'step_id', "duration"]
tx, runtime = batcher.time_batch(get_tx_fields, sid[:2], 1, fields)
logger.info(f"Extracted tx for set of students in {runtime} seconds: {tx.shape}")

INFO:analytics.cae:Extracted tx for set of students in 0.386329 seconds: (7913, 8)


In [24]:
stu_id = random.choice(sid)
logger.info(f"getting tx for student with id: {stu_id}")
tx = pd.DataFrame(db.tutor_events.find({"stu_id": stu_id}))
print(tx.shape)
print(tx.columns)
tx.index = tx["_id"]

INFO:analytics.cae:getting tx for student with id: 8d2fbcdf-60f3-4ea8-9a0a-ed16566ebc06


(3783, 17)
Index(['_id', 'type', 'time', 'curric_id', 'unit_id', 'section_id', 'prob_id',
       'step_id', 'stu_id', 'duration', 'outcome', 'kcs', 'plt', 'plt1',
       'hints_used', 'hints_avail', 'attempt'],
      dtype='object')


In [25]:
cols = ["duration", "outcome", "plt", "plt1", "hints_used", "hints_avail", "attempt"]
d = tx.loc[:, cols] 

In [26]:
#Change time field to time since first tx
min_time = d["time"].min()
tts = (d["time"] - min_time).apply(lambda x: x.total_seconds())
d["time"] = tts


KeyError: 'time'

In [30]:
# One-hot encode "outcome"
print(d.shape)
outcome_cols = pd.get_dummies(d['outcome'], drop_first=True)
d = pd.concat([d, outcome_cols], axis=1)
print(d.shape)
d.drop(columns=["outcome"], inplace=True)
print(d.shape)

(3783, 7)
(3783, 9)
(3783, 8)


In [31]:
d.sort_values(by="duration").head()

Unnamed: 0_level_0,duration,plt,plt1,hints_used,hints_avail,attempt,Hint,Incorrect
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
d1d02b37-9232-48b5-810f-79a639a46491,0.25,0.957973,0.957973,2,1,3,0,0
d07b1abf-3a84-4fae-87b1-62975ca3a50b,0.25,0.424861,0.424861,0,3,1,0,1
74a84638-1bd5-4ce2-9da7-6fe7c84d0ddc,0.25,0.974121,0.974121,1,2,1,0,1
c06b154d-55ef-4877-86b2-5bd0f8442c12,0.25,0.424861,0.386858,0,3,0,0,1
d992ddab-bc64-4b0c-a295-b3e28f893f2c,0.25,0.366144,0.366144,1,2,1,0,0


### Notes



* Matrix must me all numerical
** Convert categorical columns to one-hot encoded
** Ensure one-hot encoding drops one value to remove multi-colinearity
* Convert Dataframe to numpy array (DataFrame.to_numpy())


## Test CAA Code

In [27]:
d.shape

(3783, 7)

In [32]:
d.head()

Unnamed: 0_level_0,duration,plt,plt1,hints_used,hints_avail,attempt,Hint,Incorrect
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5579b5c5-8b39-4538-ad59-3fd34a98a526,14.463594,0.779622,0.60097,0,3,0,0,1
77224cf6-5243-491d-b9a4-e30e70398434,17.837305,0.60097,0.60097,0,3,1,0,1
725359d3-9cf1-4104-85a9-8b059fbcbac0,19.041699,0.60097,0.60097,0,3,2,0,1
0cba3f0a-2eba-4bfa-bb9e-e1b076522449,19.514657,0.60097,0.60097,0,3,3,0,1
4c71e19a-7081-486b-a752-87f33224ccb8,11.875565,0.60097,0.60097,0,3,4,0,1


In [33]:
caa = CAAComputation(d.to_numpy(), 0.35, 0.35)

In [34]:
for key in caa.__dict__:
    print(key)

US
VS
projections
ds
rs
penalty1
penalty2
trainingData
mean
std


In [35]:
for i, proj in enumerate(caa.projections):
    logger.info(f"Projection #{i}")
    logger.info("---- U ----")
    for col, val in zip(d.columns, proj.u.tolist()[0]):
        logger.info(f"Column: {col}\t{val}")
        
    logger.info("---- V ----")
    for col, val in zip(d.columns, proj.v.tolist()[0]):
        logger.info(f"Column: {col}\t{val}")


INFO:analytics.cae:Projection #0
INFO:analytics.cae:---- U ----
INFO:analytics.cae:Column: duration	-0.03955730300691882
INFO:analytics.cae:Column: plt	0.0
INFO:analytics.cae:Column: plt1	8.926952765082996e-06
INFO:analytics.cae:Column: hints_used	-0.9860974353247222
INFO:analytics.cae:Column: hints_avail	0.0
INFO:analytics.cae:Column: attempt	-0.16139103985393224
INFO:analytics.cae:Column: Hint	0.0
INFO:analytics.cae:Column: Incorrect	0.0
INFO:analytics.cae:---- V ----
INFO:analytics.cae:Column: duration	-0.0
INFO:analytics.cae:Column: plt	0.0
INFO:analytics.cae:Column: plt1	0.0
INFO:analytics.cae:Column: hints_used	-0.0
INFO:analytics.cae:Column: hints_avail	1.0
INFO:analytics.cae:Column: attempt	-0.0
INFO:analytics.cae:Column: Hint	0.0
INFO:analytics.cae:Column: Incorrect	0.0
INFO:analytics.cae:Projection #1
INFO:analytics.cae:---- U ----
INFO:analytics.cae:Column: duration	-0.0
INFO:analytics.cae:Column: plt	1.0
INFO:analytics.cae:Column: plt1	0.0
INFO:analytics.cae:Column: hints_u

In [36]:
Projection.distance(caa.projections[0], caa.projections[1])

2.828420812423267

## Test CAA Implementation

In [37]:
dataPoints = d.to_numpy()

In [None]:
penalty1 = 0.35
penalty2 = 0.35
maxProj = None
minr2 = None
scale = True
doubleInit = True
orthogonality = False

In [None]:
row, features = dataPoints.shape


In [None]:
d.head()

In [None]:

assert(1./features <= penalty1 and penalty1 <= 1)
assert(1./features <= penalty2 and penalty2 <= 1)
assert(minr2 is None or minr2 <= 1)

if maxProj is None:
    maxProj = features

if scale:
    std = np.std(dataPoints, axis = 0)
    std[std == 0] = 1
    X = (dataPoints - np.mean(dataPoints, axis = 0)) / std
else:
    X = dataPoints


In [None]:
np.matmul(X.T, X)

In [None]:
Co = np.matmul(X.T, X) / row
uList, vList, rSquare, dList = [], [], [], []

In [None]:
# Remove diagonal values to avoid max
Co[np.diag_indices_from(Co)] = 0


In [None]:

    
for _ in range(maxProj):
    maxCorr = np.unravel_index(np.argmax(np.abs(Co), axis=None), Co.shape)

    u, v = np.zeros((1, features)), np.zeros((1, features))
    u[0, maxCorr[0]] = 1.
    v[0, maxCorr[1]] = 1.

    if doubleInit:
        c1 = c2 = 0.5 * np.sqrt(features)
        try:
            u, v = computeProjection(Co, u, v, c1, c2)
        except:
            logger.info("Return CAA 1")

            #return CAA(uList, vList, dList, rSquare, penalty1, penalty2, dataPoints)

    c1 = penalty1 * np.sqrt(features)
    c2 = penalty2 * np.sqrt(features)
    try:
        u, v = computeProjection(Co, u, v, c1, c2)
    except:
        logger.info("Return CAA 2")
        # return CAA(uList, vList, dList, rSquare, penalty1, penalty2, dataPoints)
    d = np.dot(np.dot(u,Co),v.T).flatten()
    r = r2Compute(u, v, X).flatten()

    # Append values to the list
    if minr2 is None or r >= minr2:
        uList.append(u)
        vList.append(v)
        dList.append(d)
        rSquare.append(r)
        
    # Update Correlation Matrix
    Co -= d * (np.matmul(u.T,v) + np.matmul(v.T,u))
    if orthogonality:
        selection = np.ones_like(Co)
        notNull = (np.abs(u) + np.abs(v) != 0).flatten()
        selection[notNull,:] = 0
        selection[:,notNull] = 0
        Co[selection == 0] = 0

## Test CAE classification

### Student-level CAE

#### Featurizing

In [None]:
# Add Off-task labels per transaction


#### Generating Embeddings

In [38]:
logger.info("Testing workflow for building CAA embedding for each data segment")

batch_calc = BatchCalculator()

col = "tutor_events"
base_query = {"stu_id": {"$in": [stu._id for stu in students]},
              "type": "TutorInput"
             }
logger.info(f"{col} collection has {db[col].count_documents(base_query)} documents associated with {len(students)} students using query: {base_query}")
segmenter = Segmenter(db[col], base_query)
idx_fields = ['stu_id']
batches = segmenter.get_batches(idx_fields, 1)
caa_mdls = []
col_names = []
caa_batch = CAABatch("Test CAA Embedding", col_names)

INFO:analytics.cae:Testing workflow for building CAA embedding for each data segment
INFO:analytics.cae:tutor_events collection has 72852 documents associated with 20 students using query: {'stu_id': {'$in': ['f97f9d07-5048-4434-81f2-18bc23331c32', '367f9ea2-9c2a-46aa-b747-172f00cc994f', '5bc27c57-33e7-4479-84d0-744c3e38096c', 'e486a130-8382-4a24-86d0-e26081651938', 'fb149dff-426a-4d1f-9cc5-2b4ea8d2178d', '8ff912ee-f2a2-4224-970f-1d29cf213f44', 'fd5d46fb-8bb0-480b-8fa1-4b66c3d71db8', '3384137f-26f4-477b-83b8-cead72b10646', '6d1e7c67-7127-45e3-88f4-462446be123a', '18b22ae5-534d-4616-bb88-07847a3e9d1b', 'a3badde6-1754-4360-84a0-042e4ba64350', '40709376-ccdf-44dc-868a-ed6f06bf9461', 'a88c1ff5-9e6b-4194-9b1e-edbf78f93260', 'fa5deb56-4126-4a97-9708-7e7c52f0e8d7', '5e4b9b36-52ec-4af5-862d-2de1aeaeaccf', 'a72add58-1751-46fb-b70b-1c1a5512aa03', '809b51a7-069a-47be-adac-a93dfa8f100d', '8d2fbcdf-60f3-4ea8-9a0a-ed16566ebc06', 'f852d44e-a2db-42f0-ac14-06830becb6ae', '88504e70-a0b4-4eec-b6b2-f56362

In [39]:
penalty1 = 0.35
penalty2 = 0.35
for query, batch in batches:
    logger.info(f"Got batch with shape {batch.shape}")# using query: {str(query)}")
    sid = batch['stu_id'][0]
    logger.info(f"Student_id: {sid}")
    data_proc = SimpleCAEPreprocessor(batch)
    d = data_proc.process_data()
    if len(col_names) == 0:
        col_names = d.columns.tolist()
    logger.info(f"computing cae on dataframe: {d.shape}")
    #caa = compute_cae(d, data_proc, batch['_id'])
    data_idx = batch['_id']
    caa = StudentCAAModel.from_caa_obj(CAAComputation(d.to_numpy(), penalty1, penalty2), data_proc, data_idx, sid)
    #caa.student_id = sid

    caa_batch.add(caa)
caa_batch.col_names = col_names

INFO:analytics.cae:Got batch with shape (3552, 17)
INFO:analytics.cae:Student_id: a72add58-1751-46fb-b70b-1c1a5512aa03
INFO:analytics.cae:computing cae on dataframe: (3552, 8)
INFO:analytics.cae:Got batch with shape (3950, 17)
INFO:analytics.cae:Student_id: f97f9d07-5048-4434-81f2-18bc23331c32
INFO:analytics.cae:computing cae on dataframe: (3950, 8)
INFO:analytics.cae:Got batch with shape (2778, 17)
INFO:analytics.cae:Student_id: 40709376-ccdf-44dc-868a-ed6f06bf9461
INFO:analytics.cae:computing cae on dataframe: (2778, 8)
INFO:analytics.cae:Got batch with shape (3619, 17)
INFO:analytics.cae:Student_id: e486a130-8382-4a24-86d0-e26081651938
INFO:analytics.cae:computing cae on dataframe: (3619, 8)
INFO:analytics.cae:Got batch with shape (5219, 17)
INFO:analytics.cae:Student_id: 5e4b9b36-52ec-4af5-862d-2de1aeaeaccf
INFO:analytics.cae:computing cae on dataframe: (5219, 8)
INFO:analytics.cae:Got batch with shape (3802, 17)
INFO:analytics.cae:Student_id: 88504e70-a0b4-4eec-b6b2-f563623bb071
I

In [40]:
for mdl in caa_batch.mdls:
    logger.info(f"mdl id: {mdl._id}")

INFO:analytics.cae:mdl id: f2eb58ec-757e-48b3-89d4-eba7bda52611
INFO:analytics.cae:mdl id: 006033cf-2168-4306-9a54-6f1b278ab148
INFO:analytics.cae:mdl id: fc538d8b-5787-4377-ab08-0046fa95e84f
INFO:analytics.cae:mdl id: e612a489-c00f-4e4c-abe2-508848ea0ca6
INFO:analytics.cae:mdl id: 5f04e977-fa49-47ba-a6fd-dc742e424d3d
INFO:analytics.cae:mdl id: d1cbbe43-ca7a-4b9f-93c3-e78d23462c9b
INFO:analytics.cae:mdl id: 2d4c3e5b-bfea-46b5-b04f-4cdb33538750
INFO:analytics.cae:mdl id: 36f3dd5a-d254-43e1-a4c1-1ab1cab6c5cd
INFO:analytics.cae:mdl id: 6df98987-7075-41bf-b62e-971b600ceb22
INFO:analytics.cae:mdl id: 58d52459-5656-40dc-bf48-ccf6ef378740
INFO:analytics.cae:mdl id: d4dc001f-29b6-44b8-bd83-8587ea69a853
INFO:analytics.cae:mdl id: 0999b69c-42e4-4441-98e6-1a02eaf9ee1a
INFO:analytics.cae:mdl id: 27861d13-424f-4dae-8e45-72bde9e333cd
INFO:analytics.cae:mdl id: 79d6b3fe-6bf2-4afe-9f11-020cb73eb76e
INFO:analytics.cae:mdl id: 34fbb377-e371-45c7-9ab9-bd6a79192ddb
INFO:analytics.cae:mdl id: 4a72d246-1ceb

In [41]:
db.caa_models.insert_many([mdl.to_dict() for mdl in caa_batch.mdls])
db.caa_batches.insert_one(caa_batch.to_dict())

<pymongo.results.InsertOneResult at 0x7f6158c77f00>

#### Clustering CAE

In [42]:
logger.info("**** Testing CAA Batch Operations ****")
logger.debug(f"Projection index: {caa_batch.get_index()}")

m = caa_batch.get_distances()
#logger.debug(f"cae distances: {m.head()}")
X = m.to_numpy()
m.head()


INFO:analytics.cae:**** Testing CAA Batch Operations ****


Unnamed: 0,1ad26555-30b0-4d95-8332-a2ebe3fecd19,30379396-7cbb-407c-a062-4c76b2d1fb4d,96afd0a4-574b-4def-9d8c-39635dad90b8,193f3b50-8535-400e-a102-7cce890c5cfa,808f6506-f04e-4a02-b5cc-c49f763bd084,431c400d-2071-408f-a17a-562f537e433d,a877595f-9815-418b-9241-55a2ea7b6be4,8970ec88-436a-45d8-8e5f-e0b0b4f35784,8c062649-3a02-489c-8b01-bfb0dd328a1d,ffff8d33-c816-4049-8461-3a4e0ba8fe59,...,4948ede6-5d2a-4634-91f5-09e09cad88d7,b3324b06-6206-40b6-b69f-f2b84545f008,ea14fa25-ca36-440f-8125-6204132450d2,9803d9ab-a88f-498f-9d44-ecb9d708a7d2,12808be4-67ee-44d7-b27d-384dd0edf403,458f71d2-99af-4649-acd5-5c311d248b2d,02842c78-e977-4f22-a5ee-7f40803326d4,5060fe4a-83c7-4cf6-94f8-655e87782973,9d7f92df-bd6d-4da5-952a-3a2c2452c7f3,57650ebe-6ad9-45c5-867d-bbc805aae281
1ad26555-30b0-4d95-8332-a2ebe3fecd19,0.0,2.82842,3.08041,2.82841,2.82843,2.82843,2.82843,2.82838,0.311069,2.82842,...,2.82841,2.82843,2.82848,0.0879752,2.82842,3.08041,2.82841,2.82843,2.82843,3.01566
30379396-7cbb-407c-a062-4c76b2d1fb4d,2.82842,0.0,2.82843,2.82843,3.25445,3.18442,2.82843,1.41495,2.82842,0.0,...,2.82843,3.25034,2.82843,2.82842,4.54511e-06,2.82843,2.82841,3.2423,3.20967,3.19243
96afd0a4-574b-4def-9d8c-39635dad90b8,3.08041,2.82843,0.0,2.82843,1.41422,2.82843,2.82841,2.82843,2.88847,2.82843,...,2.82843,1.41422,1.41427,3.03013,2.82843,1.60702e-05,2.82843,1.41422,2.82841,1.69944
193f3b50-8535-400e-a102-7cce890c5cfa,2.82841,2.82843,2.82843,0.0,2.82843,2.82847,3.20014,2.04048,2.81714,2.82843,...,0.626278,2.82843,2.82849,2.82841,2.82843,2.82843,0.102639,2.82843,2.82843,2.82846
808f6506-f04e-4a02-b5cc-c49f763bd084,2.82843,3.25445,1.41422,2.82843,0.0,1.57737,2.82841,2.82844,2.82843,3.25445,...,2.82843,0.0104278,1.41433,2.82843,3.25445,1.41423,2.82843,0.0304683,1.52176,1.73465


In [128]:
# Test clustering

#eps = 0.3
min_samples = 2
for eps in np.arange(1,3,0.1):
#    for min_samples in range(2,6):
    logger.info(f"******Running DBSCAN with EPS={eps} and min_sample={min_samples}******")
    clusterer = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")
    clusters = clusterer.fit(X)
    labels = clusters.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    logger.info('Estimated number of clusters: %d' % n_clusters_)
    logger.info('Estimated number of noise points: %d' % n_noise_)
    logger.info(f"Cluster labels: {clusters.labels_}")

INFO:analytics.cae:******Running DBSCAN with EPS=1.0 and min_sample=2******
INFO:analytics.cae:Estimated number of clusters: 10
INFO:analytics.cae:Estimated number of noise points: 2
INFO:analytics.cae:Cluster labels: [ 0  1  2  3  4  5  6  7  0  1  0  1  0  1  0  1  0  0  1  3  5  4  0  1
  8  7  3  6  9  0  1  0  1  8  3  4  7  6  9  0  1  5  4  3  2  6  7  0
  1  4  5  0  1  3  5  4  0  1  0  1  3  4  5  0  1  5  3  4  0  1  3  0
  1  3  5  4  0  1  5  3  4 -1  0  1  2  3  4  5 -1]
INFO:analytics.cae:******Running DBSCAN with EPS=1.1 and min_sample=2******
INFO:analytics.cae:Estimated number of clusters: 10
INFO:analytics.cae:Estimated number of noise points: 2
INFO:analytics.cae:Cluster labels: [ 0  1  2  3  4  5  6  7  0  1  0  1  0  1  0  1  0  0  1  3  5  4  0  1
  8  7  3  6  9  0  1  0  1  8  3  4  7  6  9  0  1  5  4  3  2  6  7  0
  1  4  5  0  1  3  5  4  0  1  0  1  3  4  5  0  1  5  3  4  0  1  3  0
  1  3  5  4  0  1  5  3  4 -1  0  1  2  3  4  5 -1]
INFO:analytics.cae:*

In [43]:
# Get cluster labels for EDA
# 3 clusters over 20 students
#eps = 2
#min_samples = 2

# 10 cluster over 20 students
eps = 1
min_samples = 2

logger.info(f"******Running DBSCAN with EPS={eps} and min_sample={min_samples}******")
clusterer = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")
clusters = clusterer.fit(X)
labels = clusters.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
logger.info('Estimated number of clusters: %d' % n_clusters_)
logger.info('Estimated number of noise points: %d' % n_noise_)
logger.info(f"Cluster labels: {clusters.labels_}")

INFO:analytics.cae:******Running DBSCAN with EPS=1 and min_sample=2******
INFO:analytics.cae:Estimated number of clusters: 10
INFO:analytics.cae:Estimated number of noise points: 2
INFO:analytics.cae:Cluster labels: [ 0  1  2  3  4  5  6  7  0  1  0  1  0  1  0  1  0  0  1  3  5  4  0  1
  8  7  3  6  9  0  1  0  1  8  3  4  7  6  9  0  1  5  4  3  2  6  7  0
  1  4  5  0  1  3  5  4  0  1  0  1  3  4  5  0  1  5  3  4  0  1  3  0
  1  3  5  4  0  1  5  3  4 -1  0  1  2  3  4  5 -1]


#### EDA of projection Labels

##### Cluster Distribution

In [73]:
# Map cluster labels with projection ids and original student ids
y = pd.DataFrame({"proj_id": caa_batch.get_index(), "cluster": clusters.labels_})
pid_map = {pid: proj.caa_model_id for pid, proj in caa_batch.projections.items()}
caa_map = {mdl._id: mdl for mdl in caa_batch.mdls}
pids = pid_map.keys()
caa_mids = [pid_map[pid] for pid in pids]
sids = [caa_map[mid].student_id for mid in caa_mids]
pid_map = pd.DataFrame({"proj_id": pids, "caa_model_id": caa_mids, "student_id": sids})

logger.info(f"premerge shape: {y.shape}")
y = pd.merge(y, pid_map, on="proj_id", how="inner")
logger.info(f"post-merge shape: {y.shape}")
cluster_counts = y['cluster'].value_counts()
for lbl in cluster_counts.index: 
    logger.info(f"Cluster label: {lbl}\t count: {cluster_counts[lbl]}")

INFO:analytics.cae:premerge shape: (89, 2)
INFO:analytics.cae:post-merge shape: (89, 4)
INFO:analytics.cae:Cluster label: 0	 count: 20
INFO:analytics.cae:Cluster label: 1	 count: 19
INFO:analytics.cae:Cluster label: 3	 count: 12
INFO:analytics.cae:Cluster label: 4	 count: 11
INFO:analytics.cae:Cluster label: 5	 count: 10
INFO:analytics.cae:Cluster label: 7	 count: 4
INFO:analytics.cae:Cluster label: 6	 count: 4
INFO:analytics.cae:Cluster label: 2	 count: 3
INFO:analytics.cae:Cluster label: -1	 count: 2
INFO:analytics.cae:Cluster label: 9	 count: 2
INFO:analytics.cae:Cluster label: 8	 count: 2


###### Cluster Centers

In [74]:
#logger.info(pd.pivot_table(y, values=['cluster'].count))
cluster_counts = y['cluster'].value_counts()
#logger.info(f"Counts of each cluster label\n {cluster_counts}")
y.loc[:, 'US'] = y.apply(lambda x: caa_batch.projections[x['proj_id']].u, axis=1)
y.loc[:, 'VS'] = y.apply(lambda x: caa_batch.projections[x['proj_id']].v, axis=1)
y.loc[:, 'd'] = y.apply(lambda x: caa_batch.projections[x['proj_id']].d[0], axis=1)

us = y.apply(lambda x: pd.Series(x['US'][0].tolist(), index=caa_batch.col_names), axis=1)
new_colnames = {col: f"u-{col}" for col in us.columns}
u_cols = new_colnames.values()
us.rename(columns=new_colnames, inplace=True)
vs = y.apply(lambda x: pd.Series(x['VS'][0].tolist(), index=caa_batch.col_names), axis=1)
new_colnames = {col: f"v-{col}" for col in vs.columns}
v_cols = new_colnames.values()
vs.rename(columns=new_colnames, inplace=True)
y1 = pd.concat([y, us, vs], axis=1)
logger.info(y1.columns)

cols = us
cluster_center = y1.groupby('cluster')[list(u_cols) + list(v_cols)].mean()
cluster_center.head(15)


INFO:analytics.cae:Index(['proj_id', 'cluster', 'caa_model_id', 'student_id', 'US', 'VS', 'd',
       'u-duration', 'u-plt', 'u-plt1', 'u-hints_used', 'u-hints_avail',
       'u-attempt', 'u-Hint', 'u-Incorrect', 'v-duration', 'v-plt', 'v-plt1',
       'v-hints_used', 'v-hints_avail', 'v-attempt', 'v-Hint', 'v-Incorrect'],
      dtype='object')


Unnamed: 0_level_0,u-duration,u-plt,u-plt1,u-hints_used,u-hints_avail,u-attempt,u-Hint,u-Incorrect,v-duration,v-plt,v-plt1,v-hints_used,v-hints_avail,v-attempt,v-Hint,v-Incorrect
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
-1,8.3e-05,-0.290516,-0.3787,0.0,0.0,0.5,-0.1489519,0.0,0.0,0.0,0.0,0.479705,-0.141156,0.0,0.0,0.5
0,-0.009407,0.0,5e-06,-0.978399,0.0,-0.181005,4.900299e-07,3.660838e-06,0.0,1e-06,3.844038e-07,0.0,1.0,0.0,2.350795e-07,0.0
1,0.0,1.0,0.0,0.0,0.0,-7.298047e-07,0.0,-6.139717e-07,-4.453342e-07,0.0,1.0,0.0,0.0,-1e-06,0.0,-2e-06
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.820755e-05,0.0,0.0,-6.595132e-06,0.0,0.0,1.0,0.0,0.0
3,-0.816841,0.0,0.0,-2.2e-05,1.9e-05,-0.0006068102,0.0,-0.4905022,0.0,0.0,3.589555e-06,0.0,0.0,0.0,1.0,-2e-06
4,9e-06,-0.69657,-0.677054,4.6e-05,0.0,0.0,0.0,4.985184e-06,4.436163e-06,0.0,0.0,0.0,-5e-06,1.0,0.0,5.8e-05
5,2.8e-05,-0.5877,-0.71937,0.0,0.0,7.589852e-06,-0.2228513,0.0,2.470073e-06,0.0,0.0,1e-06,-3e-06,4e-06,0.0,1.0
6,1.0,0.0,0.0,0.0,0.0,8.910013e-06,0.0,0.0,0.0,0.0,0.0,2e-06,-2e-06,6e-06,0.0,1.0
7,-0.825613,0.0,0.0,-2.1e-05,2.1e-05,-0.1486372,1.802847e-05,-0.3111315,0.0,1.0,0.0003327132,0.0,0.0,-4e-06,0.0,0.0
8,-0.505219,0.0,0.0,-1.1e-05,1.4e-05,-0.609168,0.0,-0.6108352,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


##### CAE Regression on Diligence

In [97]:
# Build dataframe with CAE and diligence measures
stu_dict = {stu._id: stu for stu in students}

y2 = y1.groupby("student_id")['cluster'].value_counts().reset_index(name="cluster_count").pivot(index="student_id", columns="cluster", values='cluster_count').fillna(0)
y2.rename(columns={col: f"cluster_{col}" for col in y2.columns}, inplace=True)

y3 = y1.pivot(index="student_id", columns="cluster", values='d').fillna(0)
y3.rename(columns={col: f"cluster_{col}" for col in y3.columns}, inplace=True)

stu_cae = y2.multiply(y3)
stu_cae['diligence'] = pd.Series([stu_dict[sid].decider.diligence for sid in stu_cae.index.tolist()], index=stu_cae.index)

# Drop the noise cluster (-1)
stu_cae.drop(columns=stu_cae.columns[0], inplace=True) 
logger.info(stu_cae.columns)



stu_cae.head(20)

INFO:analytics.cae:Index(['cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4',
       'cluster_5', 'cluster_6', 'cluster_7', 'cluster_8', 'cluster_9',
       'diligence'],
      dtype='object', name='cluster')


cluster,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,diligence
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
18b22ae5-534d-4616-bb88-07847a3e9d1b,1.029739,0.9742,0.0,0.556747,0.515276,0.527902,0.0,0.0,0.0,0.0,2.291546
3384137f-26f4-477b-83b8-cead72b10646,1.092725,0.98014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.838617
367f9ea2-9c2a-46aa-b747-172f00cc994f,1.035056,0.980793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.064241
40709376-ccdf-44dc-868a-ed6f06bf9461,1.013789,0.972943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.218027
5bc27c57-33e7-4479-84d0-744c3e38096c,1.026154,0.981919,0.0,0.64143,0.58998,0.621623,0.0,0.0,0.0,0.0,1.85169
5e4b9b36-52ec-4af5-862d-2de1aeaeaccf,1.010476,0.984437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.407868
6d1e7c67-7127-45e3-88f4-462446be123a,1.016217,0.98503,0.0,0.431586,0.521105,0.769438,0.0,0.0,0.0,0.0,1.91343
809b51a7-069a-47be-adac-a93dfa8f100d,1.010969,0.986007,0.0,0.642887,0.0,0.0,0.411744,0.733909,0.745133,0.300937,2.157396
88504e70-a0b4-4eec-b6b2-f563623bb071,1.018548,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.947434
8d2fbcdf-60f3-4ea8-9a0a-ed16566ebc06,1.036437,0.982865,0.0,0.423346,0.452667,0.696214,0.0,0.0,0.0,0.0,2.590934


In [98]:
xcols = [col for col in stu_cae.columns.tolist() if "cluster" in col]
print(xcols)

['cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4', 'cluster_5', 'cluster_6', 'cluster_7', 'cluster_8', 'cluster_9']


In [99]:

xcols = [col for col in stu_cae.columns.tolist() if "cluster" in col]
ycol = "diligence"
X = stu_cae.loc[:, xcols]
Y = stu_cae.loc[:, ycol]
mdl = sm.OLS(Y, X).fit()
logger.info(mdl.summary())


INFO:analytics.cae:                                 OLS Regression Results                                
Dep. Variable:              diligence   R-squared (uncentered):                   0.990
Model:                            OLS   Adj. R-squared (uncentered):              0.979
Method:                 Least Squares   F-statistic:                              94.36
Date:                Wed, 06 Jan 2021   Prob (F-statistic):                    1.54e-08
Time:                        20:21:18   Log-Likelihood:                          3.2885
No. Observations:                  20   AIC:                                      13.42
Df Residuals:                      10   BIC:                                      23.38
Df Model:                          10                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------