# Tuning hyperparameters for domain

This notebook explores the hyperparameters for generating a domain. The domain as assumed to be a simply non-hierarchical structure where all kc's are independent.

In [1]:
import sys
sys.path.append("../")

In [2]:
import math
import random
import uuid
import os
import copy
from collections.abc import Iterable
from datetime import datetime as dt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import Bounds
from scipy.optimize import minimize
from scipy import optimize

In [3]:
import logging

#logging.basicConfig(level=logging.DEBUG)
#logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=logging.WARNING)

logger = logging.getLogger("main")

In [4]:
from tutor.domain import Domain
from tutor.cogtutor_curriculum import CogTutorCurriculum
from tutor.tutor import SimpleTutor
from tutor.action import Attempt, HintRequest
from learner.domain_tuner import DomainTuner

from simulate.self_eff_simulation import SelfEffSimulation
from log_db import mongo
from log_db.curriculum_mapper import DB_Curriculum_Mapper

In [5]:
# Get path to current project directory
cwd = os.path.abspath(".")
base_dir = os.path.abspath(os.path.join(cwd, os.pardir))
logger.debug("Base directory for the project:\n%s" % base_dir)

In [6]:
# Setup connection to database
data_out = "sim-%s" % str(uuid.uuid4())
data_path = os.path.join(base_dir,"test", "data", data_out)
logger.info("Writing simulation results to directory: %s" % data_path)
db_name = "motivsim"
db_params  = mongo.get_db_params(db_name)
logger.info("got db params: %s" % str(db_params))
db_util = mongo.Data_Utility(data_path, db_params)
db = db_util.db

In [7]:
logger.info("Clearing database before starting new simulation")
db_util.clear_db()

## Multi-simulation run

### Simulation Run Parameters

In [48]:
num_sims=20
stu_per_sim = 20

### Simualtion Runs

In [21]:
def gen_curriculum(domain,
                   num_units=1,
                   mean_sections=4,
                   stdev_sections=2,
                   mean_unit_kcs=5,
                   stdev_unit_kcs=5,
                   section_kcs_lambda=2,
                   mean_steps=3,
                   stdev_steps=1,
                   mean_prob_kcs=2,
                   stdev_prob_kcs=1,
                   num_practice=100,
                   
                  ):# Generating the Curriculum and domain together
#    domain = Domain()
    curric = CogTutorCurriculum(domain)
    curric.generate(num_units=num_units,
                   mean_sections=mean_sections,
                   stdev_sections=stdev_sections,
                   mean_unit_kcs=mean_unit_kcs,
                   stdev_unit_kcs=stdev_unit_kcs,
                   section_kcs_lambda=section_kcs_lambda,
                   mean_steps=mean_steps,
                   stdev_steps=stdev_steps,
                   mean_prob_kcs=mean_prob_kcs,
                   stdev_prob_kcs=stdev_prob_kcs,
                   num_practice=num_practice
                   )
    
    # Insert domain to db
    #db.domains.insert_one(domain.to_dict())
    #db.kcs.insert_many([kc.__dict__ for kc in domain.kcs])

    # Insert Curriculum to db
    #curric_util = DB_Curriculum_Mapper(db_params)
    #curric_util.write_to_db(curric)
    
    return curric

In [22]:
def gen_students(domain, num_students=2):
    students = [DomainTuner(domain) for i in range(num_students)]
    logger.info("Sample student:\n%s" % str(students[0]))
    logger.info("Inserting %i students to db" % len(students))
    result = db.students.insert_many([stu.to_dict() for stu in students])
    logger.info("Db insert success: %s" % result.acknowledged)
    return students

In [23]:
def simulate_students(domain, curric, students):    
    for i, stu in enumerate(students):
        logger.info("Simulating student #%i" % i)
        sim = SelfEffSimulation(domain, curric, stu)
        sim.run()

In [24]:
def calc_sim_stats(curric, students):

    stu_ids = [stu._id for stu in students]
    tx = pd.DataFrame(db.tutor_events.find({'type': "Tutor Input", 'stu_id': {"$in": stu_ids}}))
    logger.info("Learner Transactions: %s" % str(tx.shape))
    # Add kc field that reduces list of kcs to 1 kc
    tx['kc'] = tx.apply(lambda x: x['kcs'][0]['_id'], axis=1)
    
    # Aggregate Learner transaction stats

    # Total Transaction counts
    stu_stats = tx.groupby('stu_id').agg({'_id': 'count', 
                                          'duration': np.sum,
                                         })
    stu_stats.rename(columns={'_id': "Total Tx",
                              'duration': 'Total Time'}, 
                             inplace = True)
    stu_stats['Total Time(hours)'] = stu_stats['Total Time'].apply(lambda x: x / 3600)
    #logger.info("Number of students: %i" % stu_stats.shape[0])
    #logger.info(stu_stats["Total Tx"].describe())

    # Total of each outcome
    d = tx.groupby(['stu_id','outcome'])['_id'].count().reset_index().pivot(index='stu_id', columns='outcome', values='_id')
    # Proporation of each outcome
    if len(d.columns) > 1:
        d['Total'] = d.sum(axis=1)
    else:
        d['Total'] = d.iloc[:,0]

    for col in d.columns:
        if col != 'Total':
            d['Pct %s' % col] = d[col] / d['Total']
    stu_stats = pd.concat([stu_stats, d], axis=1)
    
    # Calculate attempts per ste
    stu_step_stats = tx.groupby(['stu_id', 'step_id'])['_id'].count().reset_index()
    stu_step_stats.rename(columns={'_id': 'step attempts'}, inplace=True)
    step_stats = stu_step_stats.groupby('stu_id').describe()
    
    # Calculate opportunities per kc per student
    stu_kc_stats = tx[['stu_id', 'kc', 'step_id']].drop_duplicates().groupby(['stu_id', 'kc']).count().reset_index()
    stu_kc_stats.rename(columns={'step_id': 'kc opportunities'}, inplace=True)
    kc_stats = stu_kc_stats.groupby('kc').describe()

    
    # consolidate distributional stats of resulting data

    stats = {}
    stats['Step attempts mean'] = step_stats[('step attempts', 'mean')].mean()
    stats['Step attempts std'] = step_stats[('step attempts', 'std')].mean()
    accuracy_dist = stu_stats['Pct Correct'].describe()
    stats['Mean Pct Correct'] = accuracy_dist['mean']
    stats['Std Pct Correct'] = accuracy_dist['std']
    stats['KC opportunity mean'] = kc_stats[('kc opportunities', 'mean')].mean()
    stats['KC opportunity std'] = kc_stats[('kc opportunities', 'std')].mean()
    return stats

In [25]:
def run_sim(num_stu, hyperparams=None):
    domain = Domain()
    # Set domain hyperparams
    if hyperparams is not None:
        domain.set_kc_hyperparams(**hyperparams)
    logger.info("*** domain has %i kcs before curric *****" % len(domain.kcs) )
    curric = gen_curriculum(domain)
    logger.info("*** domain has %i kcs *****" % len(domain.kcs) )
    students = gen_students(domain, num_stu)
    simulate_students(domain, curric, students)
    stats = calc_sim_stats(curric, students)

    return stats

### Tuning Objective

Tuning on distribution of opportunities per kc, attempts per step, and student accuracy


In [14]:
target = {
    'Step attempts mean': 1,
    'Step attempts std': 0.4,
    'Mean Pct Correct': 0.8,
    'Std Pct Correct': 0.1,
    'KC opportunity mean': 7,
    'KC opportunity std': 3
}

def target_obj(pred, target=target):
    d = pd.DataFrame([pred, target])
    err = math.sqrt(np.sum(d.apply(lambda x: (x[1] - x[0])**2, axis=0)))
    return err


In [44]:

def eval_sim_params(params, num_sims, stu_per_sim):
    start = dt.now()
    logger.error("running eval sim %i times with params: %s" % (num_sims, str(params)))
    db_util = mongo.Data_Utility(data_path, db_params)
    param_dict = {
            'm_l0': params[0],
            'sd_l0':params[1],
            'm_t':params[2],
            'sd_t':params[3],
            'm_s':params[4],
            'sd_s':params[5],
            'm_g':params[6],
            'sd_g':params[7]
    }
    errs = []
    for i in range(num_sims):
        result = run_sim(stu_per_sim, param_dict)
        errs.append(target_obj(result))
        #param_dict['error'] = err
    db_util.clear_db()
    end = dt.now()
    err = np.mean(errs)
    stdev_err = np.std(errs)
    logger.error("Run took %f seconds with mean err: %f\t stdev err: %f" % (((end - start).total_seconds(), err, stdev_err)))

    return err

    

### Run Tuning simulation runs

In [16]:
params = {
            'm_l0':0.5,
            'sd_l0':0.1,
            'm_t':0.2,
            'sd_t':0.03,
            'm_s':0.05,
            'sd_s':0.03,
            'm_g':0.8,
            'sd_g':0.15
}
bounds = Bounds([0.2, 0.01, 0.01, 0.0, 0.0, 0.0, 0.0, 0.0],
               [0.7, 0.3, 0.8, 0.05, 0.3, 0.1, 1.0, 0.2])
bounds = [(0.2,0.7), (0.01,0.3), (0.01,0.8), (0.01,0.05),
        (0.01,0.3), (0.01,0.1), (0.2,1.0),(0.01,0.2)]

In [49]:
# Test specific case
#params = [0.7,  0.3,  0.8,  0.05, 0.3,  0.1,  1.0,  0.01]
#params = [0.9,  0.3,  0.1,  0.01, 0.01, 0.01, 0.2,  0.01]
params = [0.45 , 0.155, 0.45 , 0.03 , 0.155, 0.055, 0.6  , 0.105]
reps = 10

for i in range(reps):
    start = dt.now()
    result = eval_sim_params(params, num_sims, stu_per_sim)
    end = dt.now()
    time = (end - start).total_seconds()    
print("Done")

ERROR:main:running eval sim 20 times with params: [0.45, 0.155, 0.45, 0.03, 0.155, 0.055, 0.6, 0.105]
ERROR:main:Run took 53.179712 seconds with mean err: 2.176829	 stdev err: 0.235250
ERROR:main:running eval sim 20 times with params: [0.45, 0.155, 0.45, 0.03, 0.155, 0.055, 0.6, 0.105]
ERROR:main:Run took 36.435451 seconds with mean err: 2.314155	 stdev err: 0.392584
ERROR:main:running eval sim 20 times with params: [0.45, 0.155, 0.45, 0.03, 0.155, 0.055, 0.6, 0.105]
ERROR:main:Run took 43.639701 seconds with mean err: 2.294023	 stdev err: 0.367257
ERROR:main:running eval sim 20 times with params: [0.45, 0.155, 0.45, 0.03, 0.155, 0.055, 0.6, 0.105]
ERROR:main:Run took 39.801944 seconds with mean err: 2.243192	 stdev err: 0.292613
ERROR:main:running eval sim 20 times with params: [0.45, 0.155, 0.45, 0.03, 0.155, 0.055, 0.6, 0.105]
ERROR:main:Run took 41.133298 seconds with mean err: 2.249730	 stdev err: 0.308134
ERROR:main:running eval sim 20 times with params: [0.45, 0.155, 0.45, 0.03,

Done


In [27]:
reps = 0 # Set to zero so it won't accidntally execute
times = []
for i in range(reps):
    params = [
        random.uniform(bounds[0][0], bounds[0][1]),
        random.uniform(bounds[1][0], bounds[1][1]),
        random.uniform(bounds[2][0], bounds[2][1]),
        random.uniform(bounds[3][0], bounds[3][1]),
        random.uniform(bounds[4][0], bounds[4][1]),
        random.uniform(bounds[5][0], bounds[5][1]),
        random.uniform(bounds[6][0], bounds[6][1]),
        random.uniform(bounds[7][0], bounds[7][1])
    ]
    start = dt.now()
    result = eval_sim_params(params, num_sims, stu_per_sim)
    end = dt.now()
    times.append((end - start).total_seconds())
    #print("Run took %f seconds" % times[-1])
    #print(result)
print("average run time: %f seconds" % np.mean(times))

ERROR:main:running eval sim with params: [0.35181748774918264, 0.09538464127699922, 0.7443278300947559, 0.03752100221889215, 0.15953677932157578, 0.0711925048830483, 0.7326071730675183, 0.1537402111589559]
ERROR:main:Run took 1.869538 seconds with err: 3.792628
ERROR:main:running eval sim with params: [0.2900439330216229, 0.04804394441896681, 0.10869293695652898, 0.03036798709543087, 0.23675498080350257, 0.010941541053258187, 0.9329190156266962, 0.1214169799540074]
ERROR:main:Run took 3.374368 seconds with err: 15.335938
ERROR:main:running eval sim with params: [0.21398870591739877, 0.14807752438332306, 0.03296899467837834, 0.03574692248100567, 0.2939841848873683, 0.05978041538505007, 0.9562710580560692, 0.04059258349988195]
ERROR:main:Run took 10.909906 seconds with err: 64.973859
ERROR:main:running eval sim with params: [0.4605252475537505, 0.2821039369099674, 0.07811591196597117, 0.030007804737349167, 0.20089476881950183, 0.04322572178040254, 0.9223326748653229, 0.12178478401650884]

average run time: 2.847385 seconds


In [46]:
#bounds = [(0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
#          (0.7, 0.3, 0.8, 0.05, 0.3, 0.1, 1.0, 0.2)]
bounds = [(0.2,0.7), (0.01,0.3), (0.1,0.8), (0.01,0.05),
        (0.01,0.3), (0.01,0.1), (0.2,1.0),(0.01,0.2)]
#init_params = list(params.values())
#print("Initial parameters: %s" % str(init_params))
print("Bounds: %s" % str(bounds))
start = dt.now()
min_result = optimize.shgo(eval_sim_params, bounds, args=(num_sims, stu_per_sim))
end = dt.now()
print("operation took: %s" % str((end - start)))
print("Minimize result: %s" % str(min_result))

ERROR:main:running eval sim 10 times with params: [0.2  0.01 0.1  0.01 0.01 0.01 0.2  0.01]


Bounds: [(0.2, 0.7), (0.01, 0.3), (0.1, 0.8), (0.01, 0.05), (0.01, 0.3), (0.01, 0.1), (0.2, 1.0), (0.01, 0.2)]


ERROR:main:Run took 46.068432 seconds with mean err: 11.219693	 stdev err: 2.679520
ERROR:main:running eval sim 10 times with params: [0.7  0.3  0.8  0.05 0.3  0.1  1.   0.2 ]
ERROR:main:Run took 5.810732 seconds with mean err: 4.525129	 stdev err: 0.448024
ERROR:main:running eval sim 10 times with params: [0.7  0.01 0.1  0.01 0.01 0.01 0.2  0.01]
ERROR:main:Run took 19.757907 seconds with mean err: 4.355984	 stdev err: 2.428491
ERROR:main:running eval sim 10 times with params: [0.7  0.3  0.1  0.01 0.01 0.01 0.2  0.01]
ERROR:main:Run took 13.217389 seconds with mean err: 4.709852	 stdev err: 1.575105
ERROR:main:running eval sim 10 times with params: [0.7  0.3  0.8  0.01 0.01 0.01 0.2  0.01]
ERROR:main:Run took 4.396909 seconds with mean err: 4.319794	 stdev err: 0.384686
ERROR:main:running eval sim 10 times with params: [0.7  0.3  0.8  0.05 0.01 0.01 0.2  0.01]
ERROR:main:Run took 5.298064 seconds with mean err: 4.423306	 stdev err: 0.362549
ERROR:main:running eval sim 10 times with pa

ERROR:main:Run took 127.875131 seconds with mean err: 24.110957	 stdev err: 20.046480
ERROR:main:running eval sim 10 times with params: [0.7  0.3  0.1  0.05 0.01 0.01 1.   0.01]
ERROR:main:Run took 20.201263 seconds with mean err: 16.819161	 stdev err: 10.691887
ERROR:main:running eval sim 10 times with params: [0.7  0.3  0.1  0.05 0.01 0.01 1.   0.2 ]
ERROR:main:Run took 15.972637 seconds with mean err: 11.357581	 stdev err: 6.443439
ERROR:main:running eval sim 10 times with params: [0.7  0.3  0.1  0.05 0.01 0.01 0.2  0.2 ]
ERROR:main:Run took 51.809987 seconds with mean err: 17.448186	 stdev err: 12.587684
ERROR:main:running eval sim 10 times with params: [0.7  0.3  0.1  0.01 0.3  0.01 0.2  0.01]
ERROR:main:Run took 15.857254 seconds with mean err: 6.036892	 stdev err: 2.891824
ERROR:main:running eval sim 10 times with params: [0.7  0.3  0.1  0.01 0.3  0.1  0.2  0.01]
ERROR:main:Run took 19.260125 seconds with mean err: 5.360428	 stdev err: 2.291106
ERROR:main:running eval sim 10 tim

ERROR:main:Run took 4.731955 seconds with mean err: 4.547638	 stdev err: 0.505236
ERROR:main:running eval sim 10 times with params: [0.7  0.01 0.8  0.01 0.01 0.01 1.   0.01]
ERROR:main:Run took 3.390488 seconds with mean err: 5.031869	 stdev err: 0.384954
ERROR:main:running eval sim 10 times with params: [0.7  0.01 0.8  0.01 0.01 0.01 1.   0.2 ]
ERROR:main:Run took 2.534353 seconds with mean err: 3.938398	 stdev err: 0.930641
ERROR:main:running eval sim 10 times with params: [0.7  0.01 0.8  0.01 0.01 0.01 0.2  0.2 ]
ERROR:main:Run took 4.973611 seconds with mean err: 5.235959	 stdev err: 1.352028
ERROR:main:running eval sim 10 times with params: [0.7  0.01 0.1  0.05 0.01 0.01 0.2  0.01]
ERROR:main:Run took 15.013995 seconds with mean err: 5.325220	 stdev err: 2.330600
ERROR:main:running eval sim 10 times with params: [0.7  0.01 0.1  0.05 0.3  0.01 0.2  0.01]
ERROR:main:Run took 26.336986 seconds with mean err: 10.021991	 stdev err: 5.653051
ERROR:main:running eval sim 10 times with par

ERROR:main:Run took 3.277719 seconds with mean err: 3.798864	 stdev err: 0.372685
ERROR:main:running eval sim 10 times with params: [0.2  0.3  0.8  0.05 0.01 0.1  1.   0.2 ]
ERROR:main:Run took 4.512380 seconds with mean err: 3.813964	 stdev err: 0.423776
ERROR:main:running eval sim 10 times with params: [0.2  0.3  0.8  0.05 0.01 0.1  0.2  0.2 ]
ERROR:main:Run took 7.360674 seconds with mean err: 5.173037	 stdev err: 0.903759
ERROR:main:running eval sim 10 times with params: [0.2  0.3  0.8  0.05 0.01 0.01 1.   0.01]
ERROR:main:Run took 5.107965 seconds with mean err: 4.108144	 stdev err: 0.640535
ERROR:main:running eval sim 10 times with params: [0.2  0.3  0.8  0.05 0.01 0.01 1.   0.2 ]
ERROR:main:Run took 4.944975 seconds with mean err: 3.876160	 stdev err: 0.227551
ERROR:main:running eval sim 10 times with params: [0.2  0.3  0.8  0.05 0.01 0.01 0.2  0.2 ]
ERROR:main:Run took 11.230141 seconds with mean err: 8.517354	 stdev err: 7.809644
ERROR:main:running eval sim 10 times with param

ERROR:main:Run took 22.471111 seconds with mean err: 17.058438	 stdev err: 1.947074
ERROR:main:running eval sim 10 times with params: [0.2  0.3  0.1  0.01 0.01 0.1  0.2  0.2 ]
ERROR:main:Run took 106.228483 seconds with mean err: 26.820213	 stdev err: 14.214829
ERROR:main:running eval sim 10 times with params: [0.2  0.3  0.1  0.01 0.01 0.01 1.   0.01]
ERROR:main:Run took 13.260695 seconds with mean err: 15.190043	 stdev err: 1.841884
ERROR:main:running eval sim 10 times with params: [0.2  0.3  0.1  0.01 0.01 0.01 1.   0.2 ]
ERROR:main:Run took 20.185917 seconds with mean err: 14.220025	 stdev err: 4.221118
ERROR:main:running eval sim 10 times with params: [0.2  0.3  0.1  0.01 0.01 0.01 0.2  0.2 ]
ERROR:main:Run took 170.783302 seconds with mean err: 43.741991	 stdev err: 45.218613
ERROR:main:running eval sim 10 times with params: [0.2  0.01 0.8  0.01 0.01 0.01 0.2  0.01]
ERROR:main:Run took 3.964038 seconds with mean err: 4.021434	 stdev err: 0.185613
ERROR:main:running eval sim 10 tim

ERROR:main:Run took 31.398064 seconds with mean err: 30.368338	 stdev err: 7.922965
ERROR:main:running eval sim 10 times with params: [0.2  0.01 0.1  0.05 0.01 0.1  1.   0.2 ]
ERROR:main:Run took 31.266463 seconds with mean err: 25.392280	 stdev err: 11.806228
ERROR:main:running eval sim 10 times with params: [0.2  0.01 0.1  0.05 0.01 0.1  0.2  0.2 ]
ERROR:main:Run took 233.388985 seconds with mean err: 39.366882	 stdev err: 18.625491
ERROR:main:running eval sim 10 times with params: [0.2  0.01 0.1  0.05 0.01 0.01 1.   0.01]
ERROR:main:Run took 43.987977 seconds with mean err: 41.481631	 stdev err: 22.166228
ERROR:main:running eval sim 10 times with params: [0.2  0.01 0.1  0.05 0.01 0.01 1.   0.2 ]
ERROR:main:Run took 40.652818 seconds with mean err: 33.440976	 stdev err: 17.427933
ERROR:main:running eval sim 10 times with params: [0.2  0.01 0.1  0.05 0.01 0.01 0.2  0.2 ]
ERROR:main:Run took 195.703591 seconds with mean err: 42.789520	 stdev err: 17.172679
ERROR:main:running eval sim 1

operation took: 2:02:16.521214
Minimize result:      fun: 2.5050927976898616
    funl: array([2.5050928])
 message: 'Optimization terminated successfully.'
    nfev: 266
     nit: 2
   nlfev: 9
   nlhev: 0
   nljev: 1
 success: True
       x: array([0.45 , 0.155, 0.45 , 0.03 , 0.155, 0.055, 0.6  , 0.105])
      xl: array([[0.45 , 0.155, 0.45 , 0.03 , 0.155, 0.055, 0.6  , 0.105]])


#### Optimial Result:


m_l0: 0.45 <br>
sd_l0: 0.155  <br>
m_t: 0.45  <br>
sd_t: 0.03  <br>
m_s: 0.155 <br>
sd_s: 0.055 <br>
m_g: 0.6 <br>
sd_g: 0.105 <br>

In [54]:

bnds = Bounds([0.2, 0.01, 0.1, 0.01, 0.01, 0.01, 0.2, 0.01],
               [0.7, 0.3, 0.8, 0.05, 0.3, 0.1, 1.0, 0.2])

In [55]:
# Local minima near global minima
init_params = min_result.x
print("Initial parameters: %s" % str(init_params))
print("Bounds: %s" % str(bnds))
start = dt.now()
# Commented out for now
min_result = minimize(eval_sim_params, init_params, 
                      args=(num_sims, stu_per_sim),
                      method='powell', bounds=bnds)
print("operation took: %s" % str((end - start)))
print("Minimize result: %s" % str(min_result))

ERROR:main:running eval sim 20 times with params: [0.45  0.155 0.45  0.03  0.155 0.055 0.6   0.105]


Initial parameters: [0.45  0.155 0.45  0.03  0.155 0.055 0.6   0.105]
Bounds: Bounds([0.2, 0.01, 0.1, 0.01, 0.01, 0.01, 0.2, 0.01], [0.7, 0.3, 0.8, 0.05, 0.3, 0.1, 1.0, 0.2])


ERROR:main:Run took 38.769469 seconds with mean err: 2.304904	 stdev err: 0.250643


TypeError: only integer scalar arrays can be converted to a scalar index

 <br>### EDA of Simulation Runs