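Exploratory data analysis (EDA) of the new simulation, validating that the expected relationships between learner attributes and behaviour still hold after the recent simulation changes.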

# Setup

In [1]:
import sys
sys.path.append("../lib")

In [2]:
import math
import random
import uuid
import os
import copy
import itertools
from collections.abc import Iterable
import datetime as dt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.optimize import Bounds
from scipy.optimize import minimize
from scipy import optimize
from scipy.stats import pearsonr

from sklearn.cluster import DBSCAN
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import FactorAnalysis

from factor_analyzer.factor_analyzer import calculate_kmo
from factor_analyzer import FactorAnalyzer

import statsmodels.api as sm



In [3]:
import logging

# Root logging configuration for the notebook. INFO is the active level; the
# commented lines are the DEBUG / WARNING alternatives kept for manual switching.
#logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)
#logging.basicConfig(level=logging.WARNING)

# Notebook-level logger used by the cells below.
logger = logging.getLogger("main")

In [4]:
# Set the root logger level explicitly, then emit one message at each level as
# a quick check of which messages are actually shown.
#logging.getLogger().setLevel(logging.DEBUG)
logging.getLogger().setLevel(logging.INFO)
#logging.getLogger().setLevel(logging.WARNING)
logger.debug("Test debug")
logger.info("Test info")
logger.warning("Test warning")

INFO:main:Test info


In [5]:
# from tutor.domain import Domain
from tutor.curriculum_factory import CurriculumFactory
from tutor.simple_curriculum import SimpleCurriculum
from tutor.tutor import SimpleTutor
from tutor.action import Attempt, HintRequest

In [6]:
from learner.selfeff_learner import SelfEfficacyLearner
from learner.modular_learner import ModularLearner
from learner.cognition import *
from learner.decider import *

In [7]:
from simulate.modlearner_simulation import ModLearnerSimulation
from simulate.simulation import *
from simulate.script_helpers import SimHelper

In [8]:
from analytics.student_stats import *
from analytics.cae import *
from analytics.featurization import *
from analytics.batch import *
from analytics.step import *
from analytics.transaction import *
from analytics.session import *

In [9]:
from log_db import mongo
from log_db.curriculum_mapper import DB_Curriculum_Mapper
from log_db.learner_mapper import DBLearnerMapper

In [10]:
from CanonicalAutocorrelationAnalysis.model.caa import CAAComputation
from CanonicalAutocorrelationAnalysis.model.caaObject import *
from CanonicalAutocorrelationAnalysis.model.utils import l1Norm, l2Norm, r2Compute

In [11]:
# Get path to current project directory
# (taken to be the parent of the directory the notebook is running in)
cwd = os.path.abspath(".")
base_dir = os.path.abspath(os.path.join(cwd, os.pardir))
logger.info("Base directory for the project:\n%s" % base_dir)

INFO:analytics.session:Base directory for the project:
/rdata/Sandbox/MotivSim


In [12]:
# Setup connection to database
# Every run gets a fresh output directory name (random UUID) under <base_dir>/test/data.
data_out = "sim-%s" % str(uuid.uuid4())
data_path = os.path.join(base_dir,"test", "data", data_out)
logger.info("Writing simulation results to directory: %s" % data_path)
db_name = "motivsim"
db_params  = mongo.get_db_params(db_name)
# NOTE(review): the recorded output shows db_params carrying 'user' and 'pswd'
# fields; logging the whole dict would expose real credentials if they were set.
logger.info("got db params: %s" % str(db_params))
db_util = mongo.Data_Utility(data_path, db_params)
# Database handle used as a module-level global by the functions below.
db = db_util.db

INFO:analytics.session:Writing simulation results to directory: /rdata/Sandbox/MotivSim/test/data/sim-0efd8ff0-9653-4218-a763-68af93323c3a
INFO:analytics.session:got db params: {'settingId': 'motivsim', 'url': 'localhost', 'port': '27017', 'name': 'motivsim', 'user': '', 'pswd': ''}


In [13]:
# Test db connection
# (the recorded output shows one log line per collection with its document count)
db_util.peak()

INFO:log_db.mongo:collection name, kcs, has 489 documents
INFO:log_db.mongo:collection name, caa_batches, has 0 documents
INFO:log_db.mongo:collection name, simbatches, has 0 documents
INFO:log_db.mongo:collection name, sessions, has 0 documents
INFO:log_db.mongo:collection name, caa_models, has 0 documents
INFO:log_db.mongo:collection name, domains, has 1 documents
INFO:log_db.mongo:collection name, students, has 10 documents
INFO:log_db.mongo:collection name, units, has 20 documents
INFO:log_db.mongo:collection name, decisions, has 21021 documents
INFO:log_db.mongo:collection name, curriculums, has 1 documents
INFO:log_db.mongo:collection name, steps, has 57347 documents
INFO:log_db.mongo:collection name, finalsimstudents, has 0 documents
INFO:log_db.mongo:collection name, problems, has 13703 documents
INFO:log_db.mongo:collection name, actions, has 21021 documents
INFO:log_db.mongo:collection name, sections, has 70 documents
INFO:log_db.mongo:collection name, tutor_events, has 19019

In [14]:
# WARNING: with clear_db = True this cell calls db_util.clear_db() before any
# simulation runs (presumably wiping the existing collections — confirm in
# log_db.mongo). Set clear_db = False to keep previously simulated data.
clear_db = True
if clear_db:
    logger.info("Clearing database before starting new simulation")
    db_util.clear_db()
else:
    logger.info("Skipping Clearing database")

INFO:analytics.session:Clearing database before starting new simulation


# 1. Simulating learners

In [15]:
# Simulation size. Both are read as module-level globals further down:
# num_students by get_sim_batch(), num_sessions by simulate_students().
num_students = 5
num_sessions = 20

In [16]:
# Helper bound to the database handle (not referenced again in the cells visible here).
sim_helper = SimHelper(db)

In [17]:
def get_cog_params():
    """Sample cognition parameters for one learner.

    Draws `ability` from a normal distribution (mean 0, sd 0.6) and re-draws
    until the value lies inside [-1, 1].

    Returns:
        dict with the single key "ability".
    """
    while True:
        draw = np.random.normal(0, 0.6)
        if -1 <= draw <= 1:
            return {"ability": draw}

def gen_students(num_students, domain, curric, 
                 cog_mod, cog_params, dec_mod, dec_params):
    """Build `num_students` ModularLearner instances.

    Args:
        num_students: number of learners to create.
        domain: domain passed to the cognition module and to ModularLearner.
        curric: accepted for signature compatibility; not used here.
        cog_mod: cognition class, called as cog_mod(domain, **cog_params()).
        cog_params: zero-argument callable returning cognition kwargs.
        dec_mod: decision class, called as dec_mod(**dec_params()).
        dec_params: zero-argument callable returning decision kwargs.

    Returns:
        list of ModularLearner, each wrapping its decision module in a
        DiligentDecider.
    """
    def _build_learner():
        # Same construction order as before: cognition first, then decision.
        cognition = cog_mod(domain, **cog_params())
        decision_model = dec_mod(**dec_params())
        return ModularLearner(domain, cognition, DiligentDecider(decision_model))

    return [_build_learner() for _ in range(num_students)]

def simulate_students(curric, students, batch):    
    """Simulate every student against the curriculum in one shared environment.

    For each student a SimpleTutor and a SingleStudentSim are created, the sim
    is registered with `batch`, and its run() process is scheduled. After the
    environment has run, the students are written to `db.finalsimstudents` and
    the batch to `db.simbatches`.

    Reads the module-level globals `db` and `num_sessions`. `simpy` is not
    imported explicitly in the visible cells — presumably it arrives through
    one of the wildcard imports; confirm.

    Args:
        curric: curriculum handed to each SimpleTutor.
        students: learners to simulate; each must expose `_id` and `to_dict()`.
        batch: batch object exposing `add_sim()` and `to_dict()`.

    Returns:
        The tuple (batch, students).
    """
    
    env = simpy.Environment()

    # Tutor mastery threshold; note the analysis cells further down use 0.9.
    mastery_thres = 0.95
    # Session-length settings passed to SingleStudentSim
    # (presumably mean / sd / max in minutes — TODO confirm units).
    m_ses_len = 45
    sd_ses_len = 8
    max_ses_len = 60
    sim_start = dt.datetime.now()

#     mod = round(len(students) / 1)
    # With mod = 1 the progress message below is logged for every student.
    mod = 1
    for i, stu in enumerate(students):
        if i % mod == 0:
            logger.info("Simulating student #%i" % i)
        # Create associated tutor
        tutor = SimpleTutor(curric, stu._id, mastery_thres)

        # Initialize simulation processes
        sim = SingleStudentSim(db, env, sim_start, stu, tutor,
                               num_sessions, m_ses_len, sd_ses_len, max_ses_len)
        batch.add_sim(sim)

        env.process(sim.run())

    # Run all scheduled student processes.
    env.run()
                
    logger.info("Inserting %i simulated students to db" % len(students))
    result = db.finalsimstudents.insert_many([stu.to_dict() for stu in students])
    logger.info("Db insert success: %s" % result.acknowledged)

    logger.info("Inserting simulation batch to db")
    result = db.simbatches.insert_one(batch.to_dict())
    logger.info("Db insert success: %s" % result.acknowledged)

    return batch, students


In [18]:
# Module-level cache for the generated curriculum and domain. gen_test_curric()
# fills these in on first use so that later batches reuse the same curriculum.
new_curric = None
new_domain = None

In [19]:
def get_domain_params(params=None):
    """Pair a list of domain parameter values with their names.

    Args:
        params: optional sequence of values in the order of the keys below;
            when None, the notebook's default values are used.

    Returns:
        dict mapping parameter name to value (zip semantics: surplus keys or
        values are dropped).
    """
    param_keys = ('m_l0', 'sd_l0', 'm_l0_sd', 'sd_l0_sd', 'm_t',
                  'sd_t', 'm_s', 'sd_s', 'm_g', 'sd_g')
    # Earlier default set, kept for reference:
    # [0.7 , 0.05, 0.05, 0.01, 0.15, 0.05, 0.3 , 0.1 , 0.5 , 0.02]
    default_values = [0.4, 0.1, 0.1, 0.03, 0.1, 0.05, 0.1, 0.05, 0.1, 0.07]
    values = default_values if params is None else params
    return dict(zip(param_keys, values))

def gen_test_curric(db, db_params):
    """Return (domain, curriculum), generating and persisting them on first call.

    The result is cached in the module-level `new_domain` / `new_curric`, so
    only the first call generates a curriculum and writes it to the database.

    Args:
        db: database handle; the domain and its KCs are inserted into
            `db.domains` and `db.kcs`.
        db_params: connection parameters handed to DB_Curriculum_Mapper, which
            writes the curriculum itself.

    Returns:
        The tuple (domain, curriculum).
    """
    global new_curric, new_domain

    # Cached from an earlier call: nothing to generate.
    if new_curric is not None:
        logger.info("New curriculum already generated")
        return new_domain, new_curric

    logger.info("Generating new curriculum")
    domain_params = get_domain_params()
    curric_params = dict(
        num_units=20,
        mean_sections=4,
        stdev_sections=2,
        mean_unit_kcs=22,
        stdev_unit_kcs=23,
        section_kcs_lambda=6,
        mean_steps=10,
        stdev_steps=4,
        mean_prob_kcs=6,
        stdev_prob_kcs=3,
        num_practice=100,
    )

    domain, curric = CurriculumFactory.gen_curriculum(domain_params, curric_params)

    # Persist the domain, its KCs, then the curriculum.
    db.domains.insert_one(domain.to_dict())
    db.kcs.insert_many([kc.__dict__ for kc in domain.kcs])
    DB_Curriculum_Mapper(db_params).write_to_db(curric)

    new_curric, new_domain = curric, domain
    return domain, curric

In [20]:
def get_sim_batch(desc):
    """Return (students, batch) for `desc`, reusing a stored batch when one exists.

    If `db.simbatches` holds a batch with this description, its students are
    re-loaded from the database. Otherwise a curriculum and students are
    generated from the module-level configuration (`num_students`, `cog_mod`,
    `dec_mod`, `get_cog_params`, `get_dec_params`), persisted and simulated.

    Note: a second `get_sim_batch` is defined in the next cell and replaces
    this one in the notebook namespace once that cell has run.

    Args:
        desc: batch description used as the lookup key.

    Returns:
        The tuple (students, batch).
    """
    stored = db.simbatches.find_one({"desc": desc})

    if stored is not None:
        # Recover the previously simulated batch.
        logger.info(f"Found simulation batch: {str(stored['desc'])}")
        lmapper = DBLearnerMapper(db)
        students = [lmapper.get_modlearner_from_db(sid) for sid in stored['student_ids']]
        batch = SimulationBatch.from_dict(stored)
        logger.info(f"Recovered {len(students)} students from batch with id: {batch._id}")
        return students, batch

    logger.info("Generating new simulation. None found in db")

    # Generate simulated data for the test.
    domain, curric = gen_test_curric(db, db_params)
    students = gen_students(num_students, domain, curric,
                            cog_mod, get_cog_params,
                            dec_mod, get_dec_params)
    logger.info(f"Persisting {len(students)} initialized students to db")
    db.students.insert_many([stu.to_dict() for stu in students])
    batch = SimulationBatch(desc)
    simulate_students(curric, students, batch)
    logger.info(f"Simulated {len(students)} in batch with id: {batch._id}")
    return students, batch

In [21]:
def get_sim_batch(desc):
    """Always generate, persist and simulate a fresh batch for `desc`.

    This later definition replaces the earlier `get_sim_batch` in the notebook
    namespace: it never reuses a stored batch. The database is still queried so
    that the log states truthfully whether a batch with this description
    already exists (previously "None found in db" was logged unconditionally).

    Reads the module-level configuration `db`, `db_params`, `num_students`,
    `cog_mod`, `dec_mod`, `get_cog_params` and `get_dec_params`.

    Args:
        desc: description stored on the new SimulationBatch.

    Returns:
        The tuple (students, batch).
    """
    existing = db.simbatches.find_one({"desc": desc})
    if existing is None:
        logger.info("Generating new simulation. None found in db")
    else:
        # Regenerating anyway leaves two batches with the same description.
        logger.warning(f"Simulation batch '{desc}' already exists in db "
                       f"(id: {existing['_id']}); generating a new one anyway")

    # Generate simulated data for the test.
    domain, curric = gen_test_curric(db, db_params)
    students = gen_students(num_students, domain, curric, 
                            cog_mod, get_cog_params, 
                            dec_mod, get_dec_params)   
    logger.info(f"Persisting {len(students)} initialized students to db")
    db.students.insert_many([stu.to_dict() for stu in students])
    batch = SimulationBatch(desc)
    simulate_students(curric, students, batch)    
    logger.info(f"Simulated {len(students)} in batch with id: {batch._id}")
        
    return students, batch

## Simple Diligent students

In [22]:
# Batch configuration read as module-level globals by get_sim_batch():
# the batch description plus the cognition and decision classes to instantiate.
sim_batch_desc = "Simple diligent students"
cog_mod = BiasSkillCognition
dec_mod = EVDecider

def get_cog_params():
    """Sample BiasSkillCognition parameters for one learner and log the draw.

    Returns:
        dict with key 'ability', drawn from random.triangular(-1, 1).
    """
    sampled = {'ability': random.triangular(-1, 1)}
    logger.info(f"Generating student with ability: {sampled['ability']}")
    return sampled

def get_dec_params():
    """Decision-module kwargs for this batch: none are needed."""
    no_params = {}
    return no_params

In [None]:
# Run the simulation for the batch configured in the previous cell; get_sim_batch
# reads cog_mod, dec_mod, get_cog_params and get_dec_params from module scope.
students, batch = get_sim_batch(sim_batch_desc)

INFO:analytics.session:Generating new simulation. None found in db
INFO:analytics.session:Generating new curriculum
INFO:tutor.cogtutor_curriculum:Generated 20 units with with a total of 598 kcs
INFO:log_db.curriculum_mapper:Writing curriculum with id, 1c95a66f-7d47-48a9-9c19-874e916ea141, to db
INFO:log_db.curriculum_mapper:Writing 16889 problem to db
INFO:log_db.curriculum_mapper:Writing 70560 steps to db
INFO:analytics.session:Generating student with ability: -0.5418028628238127
INFO:analytics.session:Generating student with ability: -0.0217932452004288
INFO:analytics.session:Generating student with ability: -0.34936832087362546
INFO:analytics.session:Generating student with ability: -0.5352304091609776
INFO:analytics.session:Generating student with ability: 0.13224013003494228
INFO:analytics.session:Persisting 5 initialized students to db
INFO:analytics.session:Simulating student #0
DEBUG:simulate.simulation:Class start hour: 11	minute: 0
INFO:analytics.session:Simulating student #

## Self Efficacy Diligent students

In [None]:
# Batch configuration read as module-level globals by get_sim_batch():
# same cognition class as before, with a self-efficacy decision class.
sim_batch_desc = "Self-Efficacy students"
cog_mod = BiasSkillCognition
dec_mod = DomainSelfEffDecider

def get_cog_params():
    """Sample BiasSkillCognition parameters for one learner.

    Returns:
        dict with key 'ability', drawn from random.triangular(-1, 1).
    """
    ability = random.triangular(-1, 1)
    return {'ability': ability}

def get_dec_params():
    """Sample decision-module kwargs with a self-efficacy attribute.

    Self-efficacy is drawn from random.gauss(0.5, 0.2) and re-drawn until it
    lies in the interval (0, 1].

    Returns:
        dict of the form {'attr': {'self_eff': value}}.
    """
    while True:
        draw = random.gauss(0.5, 0.2)
        if 0 < draw <= 1:
            return {'attr': {'self_eff': draw}}

In [None]:
# Run the simulation for the batch configured in the previous cell; get_sim_batch
# reads cog_mod, dec_mod, get_cog_params and get_dec_params from module scope.
students, batch = get_sim_batch(sim_batch_desc)

## Intrinsic Interest Diligent students

In [None]:
# Batch configuration read as module-level globals by get_sim_batch():
# same cognition class as before, with an interest-based decision class.
sim_batch_desc = "Intrinsic Interest students"
cog_mod = BiasSkillCognition
dec_mod = MathInterestDecider

def get_cog_params():
    """Sample BiasSkillCognition parameters for one learner.

    Returns:
        dict with key 'ability', drawn from random.triangular(-1, 1).
    """
    ability = random.triangular(-1, 1)
    return {'ability': ability}

def get_dec_params():
    """Sample decision-module kwargs with an interest attribute.

    Returns:
        dict of the form {'attr': {'interest': value}}, where value is a
        standard-normal draw from np.random.normal(0, 1).
    """
    interest = np.random.normal(0, 1)
    return {'attr': {'interest': interest}}

In [None]:
# Run the simulation for the batch configured in the previous cell; get_sim_batch
# reads cog_mod, dec_mod, get_cog_params and get_dec_params from module scope.
students, batch = get_sim_batch(sim_batch_desc)

## Self-eff/Interest Diligent students

In [None]:
# Batch configuration read as module-level globals by get_sim_batch():
# same cognition class as before, with a combined interest/self-efficacy decision class.
sim_batch_desc = "Combo students"
cog_mod = BiasSkillCognition
dec_mod = MathIntSelfEffDecider

def get_cog_params():
    """Sample BiasSkillCognition parameters for one learner.

    Returns:
        dict with key 'ability', drawn from random.triangular(-1, 1).
    """
    ability = random.triangular(-1, 1)
    return {'ability': ability}

def get_dec_params():
    """Sample decision-module kwargs with self-efficacy and interest attributes.

    Self-efficacy is drawn from random.gauss(0.5, 0.2) and re-drawn until it
    lies in (0, 1]; interest is then a single draw from np.random.normal(0, 1).

    Returns:
        dict of the form {'attr': {'self_eff': ..., 'interest': ...}}.
    """
    self_eff = random.gauss(0.5, 0.2)
    while not (0 < self_eff <= 1):
        self_eff = random.gauss(0.5, 0.2)
    interest = np.random.normal(0, 1)
    return {'attr': {'self_eff': self_eff, 'interest': interest}}

In [None]:
# Run the simulation for the batch configured in the previous cell; get_sim_batch
# reads cog_mod, dec_mod, get_cog_params and get_dec_params from module scope.
students, batch = get_sim_batch(sim_batch_desc)

# Validation EDA

## Setup

In [None]:
def get_stu_parameters(sids, mastery_thres):
    """Combine student attributes and mastery stats into one frame.

    Uses the module-level `calc` (defined in a later cell of this notebook).

    Args:
        sids: student ids to look up.
        mastery_thres: threshold forwarded to calc.get_mastery.

    Returns:
        Column-wise concatenation of calc.get_stu_attributes(sids) and
        calc.get_mastery(sids, mastery_thres).
    """
    attributes = calc.get_stu_attributes(sids)
    mastery = calc.get_mastery(sids, mastery_thres)
    return pd.concat([attributes, mastery], axis=1)


In [None]:
def get_tx_fields(sids, fields=None):
    """Load TutorInput events for the given students, optionally a column subset.

    Reads from the module-level `db`. A 'kc' column is added holding the `_id`
    of the first entry in each row's 'kcs' list.

    Args:
        sids: student ids to match on 'stu_id'.
        fields: optional list of column names to return; None returns all.

    Returns:
        DataFrame of matching events.
    """
    query = {"stu_id": {'$in': sids}, "type": "TutorInput"}
    tx = pd.DataFrame(db.tutor_events.find(query))
    # Reduce the list of KCs on each transaction to its first KC.
    tx['kc'] = tx.apply(lambda row: row['kcs'][0]['_id'], axis=1)
    return tx if fields is None else tx.loc[:, fields]

In [None]:
def get_tx(sids):
    """Load TutorInput events for the given students, indexed by event `_id`.

    Reads from the module-level `db`.

    Fixes relative to the previous version:
      * the query now uses the `sids` argument; it used to read the unrelated
        global `sid`, so the argument was ignored (or a NameError was raised);
      * 'kc' is the `_id` of the first KC on each row, as in get_tx_fields().
        The previous explode-based assignment produced duplicate index labels
        for rows with several KCs and failed on assignment.

    Args:
        sids: student ids to match on 'stu_id'.

    Returns:
        DataFrame of matching events with index set to '_id' and an added
        'kc' column.
    """
    tx = pd.DataFrame(db.tutor_events.find({"stu_id": {'$in': sids}, 'type': "TutorInput"}))
    tx.index = tx['_id']
    tx['kc'] = tx['kcs'].apply(lambda kcs: kcs[0]['_id'])
    return tx

def lbl_nondil_tx(tx):
    """Add ground-truth and detector labels for non-diligent behaviour to `tx`.

    Four columns are added to `tx` in place — 'is_offtask', 'is_guess',
    'detect_offtask' and 'detect_guess' — and the same frame is returned.

    Args:
        tx: transaction DataFrame (e.g. as returned by get_tx).

    Returns:
        `tx`, with the four label columns added.
    """
    # Both helpers are constructed on the module-level `db`.
    detector = Detector(db) 
    gt_lblr = TransactionAnnotator(db)
    
    # Cutoff statistics are computed before any label column is added
    # (presumably per-KC duration cutoffs — confirm in Detector).
    kc_long_tx = detector.get_kc_long_cutoff(tx)
    kc_short_tx = detector.get_kc_short_cutoff(tx)

    # Add Ground truth labels (using global db var)

    tx['is_offtask'] = gt_lblr.label_offtask_tx(tx)
    tx['is_guess'] = gt_lblr.label_guess_tx(tx)

    # Add detector labels
    tx['detect_offtask'] = detector.is_off_task(tx, kc_stats=kc_long_tx)
    tx['detect_guess'] = detector.is_guess(tx, kc_stats=kc_short_tx)
    return tx
    


In [None]:
def calc_accuracy(tx_stats):
    """Share of correct transactions per row of `tx_stats`.

    Args:
        tx_stats: frame with 'Correct' and 'Total Tx' columns.

    Returns:
        Series named "accuracy": 'Correct' divided by 'Total Tx'.
    """
    return (tx_stats['Correct'] / tx_stats['Total Tx']).rename("accuracy")

def calc_err_on_1_opp(tx):
    """Per-student share of "Correct" outcomes on the first opportunity of each KC.

    NOTE(review): the returned Series is named "err_on_1_opp", but the value
    computed is the fraction of first opportunities whose outcome is "Correct",
    not an error rate — confirm which is intended before relying on it.

    Args:
        tx: transaction DataFrame with at least 'stu_id', 'kc', 'attempt' and
            'outcome' columns.

    Returns:
        Series indexed by 'stu_id', named "err_on_1_opp".
    """
    # Keep rows with attempt == 0 (presumably the first attempt on a step — confirm).
    d = tx[tx['attempt'] == 0]
    # Get 1st opportunity
    cols = [col for col in d.columns if col not in ['stu_id', 'kc']]
    # First remaining row of every (student, KC) group, in the frame's current row order.
    opp1 = d.groupby(['stu_id', 'kc']).apply(lambda x: x.iloc[0][cols]).reset_index()
    opp1['is_correct'] = opp1['outcome'] == "Correct"
    ## Calculate P(Correct) per student
    opp1 =  opp1.groupby('stu_id')['is_correct'].apply(lambda x: np.sum(x) / len(x))
    opp1.rename("err_on_1_opp", inplace=True)
    return opp1

def calc_avg_prac_opp(tx, mastery_thres=0.9):
    """Average number of practice steps per mastered KC, for each student.

    A (student, KC) pair counts as mastered when the last 'plt1' value of any
    of its steps reaches `mastery_thres`. Only mastered pairs are counted.

    Fixes relative to the previous version:
      * the step count is now taken from the transactions filtered to mastered
        pairs; it used to be recomputed from the unfiltered `tx`, so the
        mastery filter had no effect;
      * the last value of a step is read positionally with `.iloc[-1]`
        (`x[-1]` is a label lookup on integer-indexed frames).

    Args:
        tx: transaction DataFrame with 'stu_id', 'kc', 'step_id' and 'plt1'.
        mastery_thres: knowledge level at or above which a KC is mastered.

    Returns:
        Series indexed by 'stu_id', named 'avg_prac_opp'. Students without any
        mastered KC do not appear in the result.
    """
    pair_cols = ['stu_id', 'kc']

    # Get the learner knowledge at the end of each practice step
    step_end = (tx.groupby(pair_cols + ['step_id'])['plt1']
                  .apply(lambda s: s.iloc[-1])
                  .reset_index())

    # Student-KC pairs that reached mastery on at least one step
    mastered_pairs = step_end.loc[step_end['plt1'] >= mastery_thres, pair_cols].drop_duplicates()

    # Keep only the transactions of mastered pairs, then count distinct steps per pair
    mastered_tx = pd.merge(tx, mastered_pairs, on=pair_cols, how='inner')
    steps_per_pair = (mastered_tx[pair_cols + ['step_id']]
                      .drop_duplicates()
                      .groupby(pair_cols)
                      .count()
                      .reset_index())

    return steps_per_pair.groupby('stu_id')['step_id'].mean().rename('avg_prac_opp')

def calc_p_guess(tx):
    """Per-student share of transactions flagged by the guess detector.

    Args:
        tx: transaction DataFrame with 'stu_id' and 'detect_guess' columns.

    Returns:
        Series indexed by 'stu_id', named "p_guess": sum of 'detect_guess'
        divided by the number of rows for that student.
    """
    flags_by_student = tx.groupby('stu_id')['detect_guess']
    share = flags_by_student.apply(lambda flags: flags.sum() / len(flags))
    return share.rename("p_guess")

def calc_detected_offtask(tx):
    """Per-student mean of the detector's off-task flag.

    If `tx` has no 'detect_offtask' column yet, it is labelled first with
    lbl_nondil_tx().

    Args:
        tx: transaction DataFrame with a 'stu_id' column.

    Returns:
        Series indexed by 'stu_id', named "mean_detect_offtask".
    """
    labelled = tx if "detect_offtask" in tx else lbl_nondil_tx(tx)
    # Student-level rate of detected off-task transactions
    per_student = labelled.groupby("stu_id")['detect_offtask'].mean()
    return per_student.rename("mean_detect_offtask")

def calc_detected_guess(tx):
    """Per-student mean of the detector's guess flag.

    If `tx` has no 'detect_guess' column yet, it is labelled first with
    lbl_nondil_tx().

    Args:
        tx: transaction DataFrame with a 'stu_id' column.

    Returns:
        Series indexed by 'stu_id', named "mean_detect_guess".
    """
    labelled = tx if "detect_guess" in tx else lbl_nondil_tx(tx)
    # Student-level rate of detected guess transactions
    per_student = labelled.groupby("stu_id")['detect_guess'].mean()
    return per_student.rename("mean_detect_guess")

def calc_time_on_task(tx):
    """Total on-task and off-task time per student, in hours.

    'duration' is summed per student and per value of 'is_offtask', then
    divided by 3600.

    Fix relative to the previous version: both output columns are always
    present. When a batch had no off-task (or no on-task) transactions, the
    pivot lacked that column and the function raised KeyError; the missing
    side is now reported as 0.

    Args:
        tx: transaction DataFrame with 'stu_id', 'is_offtask' (boolean) and
            'duration' columns.

    Returns:
        DataFrame indexed by 'stu_id' with columns 'time_on_task' and
        'time_off_task'.
    """
    seconds = tx.pivot_table(index="stu_id", columns="is_offtask", values="duration",
                             fill_value=0, aggfunc="sum")
    # Guarantee both the False (on-task) and True (off-task) columns exist.
    seconds = seconds.reindex(columns=[False, True], fill_value=0)
    hours = seconds.rename(columns={False: "time_on_task", True: "time_off_task"}) / 3600
    return hours

    
    


In [None]:
def get_student_sessions(sids):
    """Load SessionStart/SessionEnd events for the students, joined with session metadata.

    Reads from the module-level `db`.

    Fix relative to the previous version: the query now uses the `sids`
    argument; it used to read the unrelated global `sid`, so the argument was
    ignored (or a NameError was raised).

    Args:
        sids: student ids to match on 'stu_id'.

    Returns:
        DataFrame with one row per login/logout event, inner-joined on
        'session_id' with the matching documents from `db.sessions` (their
        'type' column dropped and '_id' renamed to 'session_id').
    """
    # Get login-logout transactions for students
    events = pd.DataFrame(db.tutor_events.find(
        {"stu_id": {'$in': sids}, "type": {"$in": ["SessionStart", "SessionEnd"]}}))
    ses_ids = events['session_id'].unique().tolist()

    # Append session metadata
    sessions = (pd.DataFrame(db.sessions.find({"_id": {'$in': ses_ids}}))
                .drop(columns=['type'])
                .rename(columns={"_id": "session_id"}))
    return pd.merge(events, sessions, on="session_id", how='inner')

def calc_session_stats(sids):
    """Per student-session timing stats, in minutes.

    Built from get_student_sessions(sids). 'start' / 'end' come from the
    session metadata; 'SessionStart' / 'SessionEnd' are the event times.

    Args:
        sids: student ids.

    Returns:
        DataFrame with one row per (stu_id, session_id) and the added columns
        'start speed', 'early finish', 'session length', 'class length' and
        'pct class' (session length divided by class length).
    """
    tx = get_student_sessions(sids)

    # Calc student-session stats: one column per event type, holding its time
    event_times = tx.pivot(index=['stu_id', 'session_id'], columns='type', values='time').reset_index()
    class_bounds = tx.loc[:, ['stu_id', 'session_id', 'start', 'end']].drop_duplicates()
    session_stats = pd.merge(class_bounds, event_times, on=['stu_id', 'session_id'])

    def _minutes_between(later, earlier):
        # Row-wise difference of two time columns, expressed in minutes.
        return session_stats.apply(
            lambda row: (row[later] - row[earlier]).total_seconds() / 60, axis=1)

    # Start/end speed
    session_stats['start speed'] = _minutes_between('SessionStart', 'start')
    session_stats['early finish'] = _minutes_between('end', 'SessionEnd')

    # session length
    session_stats['session length'] = _minutes_between('SessionEnd', 'SessionStart')
    session_stats['class length'] = _minutes_between('end', 'start')
    session_stats['pct class'] = session_stats['session length'] / session_stats['class length']

    return session_stats

def calc_stu_session_stats(sids):
    """Per-student mean and standard deviation of the session timing stats.

    Fixes relative to the previous version:
      * the stat columns are selected with a list; selecting several groupby
        columns with a bare tuple is not supported by current pandas;
      * both aggregations are requested as a list. `.agg('mean', 'std')`
        passed 'std' as an extra argument to `mean` instead of computing it.

    Args:
        sids: student ids.

    Returns:
        DataFrame indexed by 'stu_id' with two-level columns
        (stat name, 'mean' | 'std') for 'start speed', 'early finish',
        'session length' and 'pct class'.
    """
    session_stats = calc_session_stats(sids)
    stat_cols = ['start speed', 'early finish', 'session length', 'pct class']
    stu_session_stats = session_stats.groupby('stu_id')[stat_cols].agg(['mean', 'std'])
    return stu_session_stats


def calc_student_dil_stats(sids):
    """Per-student diligence stats: detected guessing, detected off-task, time on/off task.

    Fix relative to the previous version: transactions are loaded for the
    `sids` argument; the call used to pass the unrelated global `sid`, so the
    argument was ignored (or a NameError was raised).

    Args:
        sids: student ids.

    Returns:
        DataFrame combining, column-wise, calc_detected_guess,
        calc_detected_offtask and calc_time_on_task for the students'
        labelled transactions.
    """
    tx = get_tx(sids)
    tx = lbl_nondil_tx(tx)

    detect_offtask = calc_detected_offtask(tx)
    detect_guess = calc_detected_guess(tx)
    time_on_task = calc_time_on_task(tx)

    return pd.concat([detect_guess, detect_offtask, time_on_task], axis=1)



## Calculate Stats

In [None]:
# Calculating stats with analytic methods
# Module-level calculator instances used by the cells below. The class names
# are not imported explicitly in the visible cells — presumably they come from
# the `analytics.*` wildcard imports; confirm.

calc = StudentStatCalc(db)
batch_calc = BatchCalculator()
detector = Detector(db)
tx_lblr = TransactionAnnotator(db)
step_calc = StepCalculator(db)
tx_calc = TransactionCalculator(db)
session_calc = SessionCalculator(db)

In [None]:
# Get students batches
# Collect, per batch description, the student-id lists of every stored batch.
batches = [batch for batch in db.simbatches.find()]
batch_descs = ["Simple diligent students",
               "Self-Efficacy students",
               "Intrinsic Interest students",
               "Combo students"
              ]
# sids[desc] is a list of student-id lists: one entry per stored batch with that description.
sids = {desc: [] for desc in batch_descs}
# NOTE: the loop variable `batch` overwrites the `batch` object returned by
# get_sim_batch() in the simulation cells above.
for i, batch in enumerate(batches):
    logger.info(f"batch #{i}: \nID: {batch['_id']}\ndesc: {batch['desc']}")
    if batch['desc'] in batch_descs:
        logger.info(f"recovered {len(batch['student_ids'])} student ids for batch {batch['desc']}")
        sids[batch['desc']].append(batch['student_ids'])

In [None]:

# Per-batch result containers, keyed by batch description.
stu_stats = {desc: {} for desc in batch_descs}
mastery_thres = 0.9
stu_prob_stats = {desc: {} for desc in batch_descs}
kc_stats = {desc: {} for desc in batch_descs}
det_err = {desc: {} for desc in batch_descs}
for desc in batch_descs:

    # Use THIS batch's students. Previously `sid` was only assigned for
    # 'Simple diligent students', so every other batch silently recomputed its
    # stats on that first batch's students; a missing batch raised IndexError.
    # When several stored batches share a description, the first one is used.
    sid = sids[desc][0] if sids[desc] else []
    logger.info(f"EDA of {len(sid)} students for batch, {desc}")

    if len(sid) > 0:
        logger.info(f"Calculating for student set:\t{desc}")
        args = [get_stu_parameters, sid, 10, [mastery_thres]]
        sim_students, runtime = batch_calc.time_calc(batch_calc.batch_calc, args)
        logger.info(f"Calculated student params: {sim_students.shape}\tRuntime: {runtime} seconds")

        args = [calc.get_action_counts, sid, 10]
        action_dist, runtime = batch_calc.time_calc(batch_calc.batch_calc, args)
        logger.info(f"Calculated student action stats: {action_dist.shape}\tRuntime: {runtime} seconds")

        args = [calc.total_tx_stats, sid, 10]
        tx_stats, runtime = batch_calc.time_calc(batch_calc.batch_calc, args)
        logger.info(f"Calculated student activity stats: {tx_stats.shape}\tRuntime: {runtime} seconds")


        args = [session_calc.calc_stu_session_stats, sid, 10]
        ses_stats, runtime = batch_calc.time_calc(batch_calc.batch_calc, args)
        logger.info(f"Calculated student activity stats: {ses_stats.shape}\tRuntime: {runtime} seconds")
        
        tx = get_tx(sid)
        tx = pd.concat([tx, tx_lblr.label_nondil_tx(tx)], axis=1)
        # Merge decisions with tx
        decisions, actions = tx_lblr.get_tx_decisions(tx)
        tx = tx_lblr.merge_decisions(tx, actions, decisions)
        
        # Get step rollup
        steps = step_calc.rollup_tx(tx)
        steps = pd.concat([steps, step_calc.label_knowledge(steps)], axis=1)

        
        # Calculate student non-diligent stats
        detect_offtask = calc.calc_detected_offtask(tx)
        detect_guess = calc.calc_detected_guess(tx)
        time_on_task = calc.calc_time_on_task(tx)

        # Count opportunities at each knowledge level
        kc_prac = step_calc.count_practice_challenge(steps)
        # Share of a student's opportunities spent at the 'high_knowledge' level.
        over_prac = kc_prac.apply(lambda x: x['high_knowledge'] / np.sum(x), axis=1)
        
        # Calculate work rate:
        work_rate = calc.calc_avg_work_rate(steps)
        
        # Calculate expected work time
        d = pd.DataFrame(db.kcs.find({"_id": {"$in": steps['kc'].tolist()}})).rename(columns={'_id': "kc"})
        steps = pd.merge(steps, d, on="kc", how="outer")
        steps['expected_duration'] = steps['Attempts'] * steps['m_time']
        # Ratio of actual to expected total duration per student.
        exp_work_rate = steps.groupby('stu_id')['duration'].sum() / steps.groupby('stu_id')['expected_duration'].sum()
        
        # Calculate hint usage by knowledge level
        stu_level_hints = steps.groupby(['stu_id', 'knowledge_level'])['Hint'].mean()
        # Fit line for each student tendency to request hint
    
        
        # Calculate detector error
        # Confusion-matrix flags of detector vs ground truth, per transaction
        tx['offtask_tp'] = (tx['is_offtask'] == True) & (tx['detect_offtask'] == True)
        tx['offtask_fp'] = (tx['is_offtask'] == False) & (tx['detect_offtask'] == True)
        tx['offtask_tn'] = (tx['is_offtask'] == False) & (tx['detect_offtask'] == False)
        tx['offtask_fn'] = (tx['is_offtask'] == True) & (tx['detect_offtask'] == False)

        # Each rate is a share of ALL transactions in the batch.
        total_stats = {}
        total_stats['ot_tp'] = np.sum(tx['offtask_tp']) / tx.shape[0]
        total_stats['ot_fp'] = np.sum(tx['offtask_fp']) / tx.shape[0]
        total_stats['ot_tn'] = np.sum(tx['offtask_tn']) / tx.shape[0]
        total_stats['ot_fn'] = np.sum(tx['offtask_fn']) / tx.shape[0]

        tx['guess_tp'] = (tx['is_guess'] == True) & (tx['detect_guess'] == True)
        tx['guess_fp'] = (tx['is_guess'] == False) & (tx['detect_guess'] == True)
        tx['guess_tn'] = (tx['is_guess'] == False) & (tx['detect_guess'] == False)
        tx['guess_fn'] = (tx['is_guess'] == True) & (tx['detect_guess'] == False)

        total_stats['g_tp'] = np.sum(tx['guess_tp']) / tx.shape[0]
        total_stats['g_fp'] = np.sum(tx['guess_fp']) / tx.shape[0]
        total_stats['g_tn'] = np.sum(tx['guess_tn']) / tx.shape[0]
        total_stats['g_fn'] = np.sum(tx['guess_fn']) / tx.shape[0]
        det_err[desc] = total_stats

        # NOTE(review): over_prac and exp_work_rate are unnamed Series, so their
        # columns in the merged frame get integer labels rather than names.
        stu_stats[desc] = pd.concat([sim_students, action_dist, tx_stats, ses_stats, 
                                     detect_offtask, detect_guess, time_on_task, kc_prac, 
                                     over_prac, work_rate, exp_work_rate], axis=1)
        logger.info(f"Merged new stats together: {stu_stats[desc].shape}")
    else:
        # stu_stats[desc] and det_err[desc] stay empty for this batch.
        logger.warning(f"No students found for batch, {desc}; skipping stats")


## Simulation Noise

### Session Opportunity

In [None]:
# One histogram of 'total opportunity' per batch, side by side.
plt.figure(figsize=(15,4))
l = len(batch_descs)
for i,desc in enumerate(batch_descs):
    sstats = stu_stats[desc]
    col = 'total opportunity'
    plt.subplot(1,l, i+1)
    plt.hist(sstats[col])
    # Title each panel with its batch; previously every panel carried the same
    # title (the column name), so the batches could not be told apart.
    plt.title(desc)
    plt.xlabel(col)
plt.show()
    

### Detector error rates

In [None]:
# Log and plot the detector confusion-matrix rates of each batch. The rates in
# det_err are shares of all transactions in the batch. Previously the figure
# was created and shown without anything being drawn on it.
plt.figure(figsize=(15,4))
l = len(batch_descs)
for i,desc in enumerate(batch_descs):
    d = det_err[desc]
    logger.info(f"************** {desc} ******************")
    logger.info(f"Offtask Error Rates: \nTP: {d['ot_tp']}\tFP: {d['ot_fp']}\tTN: {d['ot_tn']}\tFN: {d['ot_fn']}")
    logger.info(f"Guess Error Rates: \nTP: {d['g_tp']}\tFP: {d['g_fp']}\tTN: {d['g_tn']}\tFN: {d['g_fn']}")
    # One bar per rate (ot_* = off-task detector, g_* = guess detector).
    plt.subplot(1,l, i+1)
    plt.bar(list(d.keys()), list(d.values()))
    plt.ylabel("share of transactions")
    plt.title(desc)
plt.show()

### Work Rate - Tutor Policy Overpractice

In [None]:

# For each batch: one figure with a histogram per knowledge-level column
# (these columns enter stu_stats through kc_prac in the stats cell).
know_lvls = ['low_knowledge', 'low-mid_knowledge','mid_knowledge', 'mid-high_knowledge', 'high_knowledge']
for i,desc in enumerate(batch_descs):
    sstats = stu_stats[desc]
    logger.info(f"************** {desc} ******************")
    plt.figure(figsize=(15,4))    
    for j, col in enumerate(know_lvls):
        plt.subplot(1,len(know_lvls), j+1)
        plt.hist(sstats[col])
        plt.xlabel(col)
        plt.title(col)

    plt.show()

## Diligence

### Total Time

In [None]:

# For each batch: scatter every column in `cols` against 'diligence' (one panel
# per column) and log the Pearson correlation and p-value.
cols = ['Total Time(hours)', 'pct class', 'start speed', 'early finish', 'session length']
for i,desc in enumerate(batch_descs):
    sstats = stu_stats[desc]
    logger.info(f"************** {desc} ******************")
    plt.figure(figsize=(20,4))    
    for j, ycol in enumerate(cols):
        plt.subplot(1,len(cols), j+1)
        xcol = 'diligence'
        x = sstats[xcol]
        y = sstats[ycol]
        plt.scatter(x,y)
        plt.xlabel(xcol)
        plt.ylabel(ycol)
        corr, pval = pearsonr(x,y)
        logger.info(f"{ycol} Correlation with {xcol}: \nR = {corr}\t pval = {pval}")


    plt.show()

### Off-task & Guessing

In [None]:

# For each batch: scatter every column in `cols` against 'diligence' (one panel
# per column) and log the Pearson correlation and p-value.
# NOTE(review): this cell spells the column 'Pct OffTask', while the Cog Ability
# and Self-Efficacy off-task cells use 'Pct Offtask'. Column lookup is
# case-sensitive, so at least one of the two spellings raises KeyError —
# confirm the name produced by the stats calculators.
cols = ['Pct OffTask', 'Pct Guess', 'mean_detect_offtask', 'mean_detect_guess']
for i,desc in enumerate(batch_descs):
    sstats = stu_stats[desc]
    logger.info(f"************** {desc} ******************")
    plt.figure(figsize=(20,4))    
    for j, ycol in enumerate(cols):
        plt.subplot(1,len(cols), j+1)
        xcol = 'diligence'
        x = sstats[xcol]
        y = sstats[ycol]
        plt.scatter(x,y)
        plt.xlabel(xcol)
        plt.ylabel(ycol)
        corr, pval = pearsonr(x,y)
        logger.info(f"{ycol} Correlation with {xcol}: \nR = {corr}\t pval = {pval}")


    plt.show()

### Time-on-task

In [None]:

# For each batch: scatter every column in `cols` against 'diligence' (one panel
# per column) and log the Pearson correlation and p-value.
cols = ['Total Time(hours)', 'time_on_task', 'time_off_task']
for i,desc in enumerate(batch_descs):
    sstats = stu_stats[desc]
    logger.info(f"************** {desc} ******************")
    plt.figure(figsize=(20,4))    
    for j, ycol in enumerate(cols):
        plt.subplot(1,len(cols), j+1)
        xcol = 'diligence'
        x = sstats[xcol]
        y = sstats[ycol]
        plt.scatter(x,y)
        plt.xlabel(xcol)
        plt.ylabel(ycol)
        corr, pval = pearsonr(x,y)
        logger.info(f"{ycol} Correlation with {xcol}: \nR = {corr}\t pval = {pval}")


    plt.show()

## Cog Ability

### Knowledge & Learning

In [None]:

# For each batch: scatter every column in `cols` against 'cog_ability' (one
# panel per column) and log the Pearson correlation and p-value.
cols = ['pre-sim total skill', 'final-sim total skill', 'final-sim total mastery', 'total learning', 'total mastered']
for i,desc in enumerate(batch_descs):
    sstats = stu_stats[desc]
    logger.info(f"************** {desc} ******************")
    plt.figure(figsize=(20,4))    
    for j, ycol in enumerate(cols):
        plt.subplot(1,len(cols), j+1)
        xcol = 'cog_ability'
        x = sstats[xcol]
        y = sstats[ycol]
        plt.scatter(x,y)
        plt.xlabel(xcol)
        plt.ylabel(ycol)
        corr, pval = pearsonr(x,y)
        logger.info(f"{ycol} Correlation with {xcol}: \nR = {corr}\t pval = {pval}")


    plt.show()

### Early Finish

In [None]:

# For each batch: scatter every column in `cols` against 'cog_ability' (one
# panel per column) and log the Pearson correlation and p-value.
cols = ['start speed', 'early finish', 'pct class', 'session length']
for i,desc in enumerate(batch_descs):
    sstats = stu_stats[desc]
    logger.info(f"************** {desc} ******************")
    plt.figure(figsize=(20,4))    
    for j, ycol in enumerate(cols):
        plt.subplot(1,len(cols), j+1)
        xcol = 'cog_ability'
        x = sstats[xcol]
        y = sstats[ycol]
        plt.scatter(x,y)
        plt.xlabel(xcol)
        plt.ylabel(ycol)
        corr, pval = pearsonr(x,y)
        logger.info(f"{ycol} Correlation with {xcol}: \nR = {corr}\t pval = {pval}")


    plt.show()

### Work Rate

### Offtask

In [None]:

# For each batch: scatter every column in `cols` against 'cog_ability' (one
# panel per column) and log the Pearson correlation and p-value.
# NOTE(review): this cell spells the column 'Pct Offtask', while the Diligence
# "Off-task & Guessing" cell uses 'Pct OffTask'. Column lookup is
# case-sensitive, so at least one of the two spellings raises KeyError —
# confirm the name produced by the stats calculators.
cols = ['Pct Offtask', 'time_off_task', 'mean_detect_offtask', 'time_on_task']
for i,desc in enumerate(batch_descs):
    sstats = stu_stats[desc]
    logger.info(f"************** {desc} ******************")
    plt.figure(figsize=(20,4))    
    for j, ycol in enumerate(cols):
        plt.subplot(1,len(cols), j+1)
        xcol = 'cog_ability'
        x = sstats[xcol]
        y = sstats[ycol]
        plt.scatter(x,y)
        plt.xlabel(xcol)
        plt.ylabel(ycol)
        corr, pval = pearsonr(x,y)
        logger.info(f"{ycol} Correlation with {xcol}: \nR = {corr}\t pval = {pval}")


    plt.show()

## Self-Efficacy

### Work Rate

### Offtask

In [None]:

# For each batch that has a self-efficacy column: scatter every column in
# `cols` against it (one panel per column) and log the Pearson correlation.
# NOTE(review): 'Pct Offtask' is spelled 'Pct OffTask' in the Diligence
# "Off-task & Guessing" cell; confirm which spelling the stats actually use.
cols = ['Pct Offtask', 'time_off_task', 'mean_detect_offtask', 'time_on_task']
xcol = 'dec_self_eff'

# `descs` was never defined, so this cell raised NameError. Restrict the plots
# to the batches whose stats contain the self-efficacy column (in this notebook
# only the self-efficacy and combo deciders are given a 'self_eff' attribute).
descs = [desc for desc in batch_descs if xcol in stu_stats[desc]]
if not descs:
    logger.warning(f"No batch has a '{xcol}' column; nothing to plot")

for i,desc in enumerate(descs):
    sstats = stu_stats[desc]
    logger.info(f"************** {desc} ******************")
    plt.figure(figsize=(20,4))
    for j, ycol in enumerate(cols):
        plt.subplot(1,len(cols), j+1)
        x = sstats[xcol]
        y = sstats[ycol]
        plt.scatter(x,y)
        plt.xlabel(xcol)
        plt.ylabel(ycol)
        corr, pval = pearsonr(x,y)
        logger.info(f"{ycol} Correlation with {xcol}: \nR = {corr}\t pval = {pval}")

    plt.show()

## Intrinsic Interest

### Total Time

### Offtask

### Time-on-task