# Tuning hyperparameters for domain

This notebook explores the hyperparameters for generating a domain. The domain as assumed to be a simply non-hierarchical structure where all kc's are independent.

In [1]:
import sys
sys.path.append("../lib")

In [2]:
import math
import random
import uuid
import os
import copy
from collections.abc import Iterable
from datetime import datetime as dt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import Bounds
from scipy.optimize import minimize
from scipy import optimize

In [3]:
import logging

#logging.basicConfig(level=logging.DEBUG)
#logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=logging.WARNING)

logger = logging.getLogger("main")

In [4]:
from tutor.domain import Domain
from tutor.cogtutor_curriculum import CogTutorCurriculum
from tutor.tutor import SimpleTutor
from tutor.action import Attempt, HintRequest
from learner.domain_tuner import DomainTuner

from simulate.self_eff_simulation import SelfEffSimulation
from log_db import mongo
from log_db.curriculum_mapper import DB_Curriculum_Mapper

In [5]:
# Get path to current project directory
cwd = os.path.abspath(".")
base_dir = os.path.abspath(os.path.join(cwd, os.pardir))
logger.debug("Base directory for the project:\n%s" % base_dir)

In [6]:
# Setup connection to database
data_out = "sim-%s" % str(uuid.uuid4())
data_path = os.path.join(base_dir,"test", "data", data_out)
logger.info("Writing simulation results to directory: %s" % data_path)
db_name = "motivsim"
db_params  = mongo.get_db_params(db_name)
logger.info("got db params: %s" % str(db_params))
db_util = mongo.Data_Utility(data_path, db_params)
db = db_util.db

In [7]:
logger.info("Clearing database before starting new simulation")
db_util.clear_db()

## Multi-simulation run

### Simulation Run Parameters

In [8]:
num_sims=2
stu_per_sim = 2

### Simualtion Runs

In [9]:
def gen_curriculum(domain,
                   num_units=1,
                   mean_sections=4,
                   stdev_sections=2,
                   mean_unit_kcs=5,
                   stdev_unit_kcs=5,
                   section_kcs_lambda=2,
                   mean_steps=3,
                   stdev_steps=1,
                   mean_prob_kcs=2,
                   stdev_prob_kcs=1,
                   num_practice=100,
                   
                  ):# Generating the Curriculum and domain together
#    domain = Domain()
    curric = CogTutorCurriculum(domain)
    curric.generate(num_units=num_units,
                   mean_sections=mean_sections,
                   stdev_sections=stdev_sections,
                   mean_unit_kcs=mean_unit_kcs,
                   stdev_unit_kcs=stdev_unit_kcs,
                   section_kcs_lambda=section_kcs_lambda,
                   mean_steps=mean_steps,
                   stdev_steps=stdev_steps,
                   mean_prob_kcs=mean_prob_kcs,
                   stdev_prob_kcs=stdev_prob_kcs,
                   num_practice=num_practice
                   )
    
    # Insert domain to db
    #db.domains.insert_one(domain.to_dict())
    #db.kcs.insert_many([kc.__dict__ for kc in domain.kcs])

    # Insert Curriculum to db
    #curric_util = DB_Curriculum_Mapper(db_params)
    #curric_util.write_to_db(curric)
    
    return curric

In [10]:
def gen_students(domain, num_students=2):
    students = [DomainTuner(domain) for i in range(num_students)]
    logger.info("Sample student:\n%s" % str(students[0]))
    logger.info("Inserting %i students to db" % len(students))
    result = db.students.insert_many([stu.to_dict() for stu in students])
    logger.info("Db insert success: %s" % result.acknowledged)
    return students

In [11]:
def simulate_students(domain, curric, students):    
    for i, stu in enumerate(students):
        logger.info("Simulating student #%i" % i)
        sim = SelfEffSimulation(domain, curric, stu)
        sim.run()

In [12]:
def calc_sim_stats(curric, students):

    stu_ids = [stu._id for stu in students]
    tx = pd.DataFrame(db.tutor_events.find({'type': "Tutor Input", 'stu_id': {"$in": stu_ids}}))
    logger.info("Learner Transactions: %s" % str(tx.shape))
    # Add kc field that reduces list of kcs to 1 kc
    tx['kc'] = tx.apply(lambda x: x['kcs'][0]['_id'], axis=1)
    
    # Aggregate Learner transaction stats

    # Total Transaction counts
    stu_stats = tx.groupby('stu_id').agg({'_id': 'count', 
                                          'duration': np.sum,
                                         })
    stu_stats.rename(columns={'_id': "Total Tx",
                              'duration': 'Total Time'}, 
                             inplace = True)
    stu_stats['Total Time(hours)'] = stu_stats['Total Time'].apply(lambda x: x / 3600)
    #logger.info("Number of students: %i" % stu_stats.shape[0])
    #logger.info(stu_stats["Total Tx"].describe())

    # Total of each outcome
    d = tx.groupby(['stu_id','outcome'])['_id'].count().reset_index().pivot(index='stu_id', columns='outcome', values='_id')
    # Proporation of each outcome
    if len(d.columns) > 1:
        d['Total'] = d.sum(axis=1)
    else:
        d['Total'] = d.iloc[:,0]

    for col in d.columns:
        if col != 'Total':
            d['Pct %s' % col] = d[col] / d['Total']
    stu_stats = pd.concat([stu_stats, d], axis=1)
    
    # Calculate attempts per ste
    stu_step_stats = tx.groupby(['stu_id', 'step_id'])['_id'].count().reset_index()
    stu_step_stats.rename(columns={'_id': 'step attempts'}, inplace=True)
    step_stats = stu_step_stats.groupby('stu_id').describe()
    
    # Calculate opportunities per kc per student
    stu_kc_stats = tx[['stu_id', 'kc', 'step_id']].drop_duplicates().groupby(['stu_id', 'kc']).count().reset_index()
    stu_kc_stats.rename(columns={'step_id': 'kc opportunities'}, inplace=True)
    kc_stats = stu_kc_stats.groupby('kc').describe()

    
    # consolidate distributional stats of resulting data

    stats = {}
    stats['Step attempts mean'] = step_stats[('step attempts', 'mean')].mean()
    stats['Step attempts std'] = step_stats[('step attempts', 'std')].mean()
    accuracy_dist = stu_stats['Pct Correct'].describe()
    stats['Mean Pct Correct'] = accuracy_dist['mean']
    stats['Std Pct Correct'] = accuracy_dist['std']
    stats['KC opportunity mean'] = kc_stats[('kc opportunities', 'mean')].mean()
    stats['KC opportunity std'] = kc_stats[('kc opportunities', 'std')].mean()
    return stats

In [13]:
def run_sim(num_stu, hyperparams=None):
    domain = Domain()
    # Set domain hyperparams
    if hyperparams is not None:
        domain.set_kc_hyperparams(**hyperparams)
    logger.info("*** domain has %i kcs before curric *****" % len(domain.kcs) )
    curric = gen_curriculum(domain)
    logger.info("*** domain has %i kcs *****" % len(domain.kcs) )
    students = gen_students(domain, num_stu)
    simulate_students(domain, curric, students)
    stats = calc_sim_stats(curric, students)

    return stats

### Tuning Objective

Tuning on distribution of opportunities per kc, attempts per step, and student accuracy


In [14]:
target = {
    'Step attempts mean': 1,
    'Step attempts std': 0.4,
    'Mean Pct Correct': 0.8,
    'Std Pct Correct': 0.1,
    'KC opportunity mean': 7,
    'KC opportunity std': 3
}

def target_obj(pred, target=target):
    d = pd.DataFrame([pred, target])
    err = math.sqrt(np.sum(d.apply(lambda x: (x[1] - x[0])**2, axis=0)))
    return err


In [15]:

def eval_sim_params(params, num_sims, stu_per_sim):
    start = dt.now()
    logger.error("running eval sim %i times with params: %s" % (num_sims, str(params)))
    db_util = mongo.Data_Utility(data_path, db_params)
    param_dict = {
            'm_l0': params[0],
            'sd_l0':params[1],
            'm_t':params[2],
            'sd_t':params[3],
            'm_s':params[4],
            'sd_s':params[5],
            'm_g':params[6],
            'sd_g':params[7]
    }
    errs = []
    for i in range(num_sims):
        result = run_sim(stu_per_sim, param_dict)
        errs.append(target_obj(result))
        #param_dict['error'] = err
    db_util.clear_db()
    end = dt.now()
    err = np.mean(errs)
    stdev_err = np.std(errs)
    logger.error("Run took %f seconds with mean err: %f\t stdev err: %f" % (((end - start).total_seconds(), err, stdev_err)))

    return err

    

### Run Tuning simulation runs

In [16]:
params = {
            'm_l0':0.5,
            'sd_l0':0.1,
            'm_t':0.2,
            'sd_t':0.03,
            'm_s':0.05,
            'sd_s':0.03,
            'm_g':0.8,
            'sd_g':0.15
}
bounds = Bounds([0.2, 0.01, 0.01, 0.0, 0.0, 0.0, 0.0, 0.0],
               [0.7, 0.3, 0.8, 0.05, 0.3, 0.1, 1.0, 0.2])
bounds = [(0.2,0.7), (0.01,0.3), (0.01,0.8), (0.01,0.05),
        (0.01,0.3), (0.01,0.1), (0.2,1.0),(0.01,0.2)]

In [17]:
# Test specific case
#params = [0.7,  0.3,  0.8,  0.05, 0.3,  0.1,  1.0,  0.01]
#params = [0.9,  0.3,  0.1,  0.01, 0.01, 0.01, 0.2,  0.01]
params = [0.45 , 0.155, 0.45 , 0.03 , 0.155, 0.055, 0.6  , 0.105]
reps = 10

for i in range(reps):
    start = dt.now()
    result = eval_sim_params(params, num_sims, stu_per_sim)
    end = dt.now()
    time = (end - start).total_seconds()    
print("Done")

ERROR:main:running eval sim 2 times with params: [0.45, 0.155, 0.45, 0.03, 0.155, 0.055, 0.6, 0.105]
ERROR:main:Run took 0.842972 seconds with mean err: 2.977656	 stdev err: 0.433265
ERROR:main:running eval sim 2 times with params: [0.45, 0.155, 0.45, 0.03, 0.155, 0.055, 0.6, 0.105]
ERROR:main:Run took 0.899673 seconds with mean err: 1.611873	 stdev err: 0.388029
ERROR:main:running eval sim 2 times with params: [0.45, 0.155, 0.45, 0.03, 0.155, 0.055, 0.6, 0.105]
ERROR:main:Run took 0.343649 seconds with mean err: 3.004740	 stdev err: 0.369121
ERROR:main:running eval sim 2 times with params: [0.45, 0.155, 0.45, 0.03, 0.155, 0.055, 0.6, 0.105]
ERROR:main:Run took 0.687735 seconds with mean err: 2.968252	 stdev err: 0.137057
ERROR:main:running eval sim 2 times with params: [0.45, 0.155, 0.45, 0.03, 0.155, 0.055, 0.6, 0.105]
ERROR:main:Run took 0.924283 seconds with mean err: 1.676432	 stdev err: 0.070698
ERROR:main:running eval sim 2 times with params: [0.45, 0.155, 0.45, 0.03, 0.155, 0.0

Done


In [18]:
reps = 0 # Set to zero so it won't accidntally execute
times = []
for i in range(reps):
    params = [
        random.uniform(bounds[0][0], bounds[0][1]),
        random.uniform(bounds[1][0], bounds[1][1]),
        random.uniform(bounds[2][0], bounds[2][1]),
        random.uniform(bounds[3][0], bounds[3][1]),
        random.uniform(bounds[4][0], bounds[4][1]),
        random.uniform(bounds[5][0], bounds[5][1]),
        random.uniform(bounds[6][0], bounds[6][1]),
        random.uniform(bounds[7][0], bounds[7][1])
    ]
    start = dt.now()
    result = eval_sim_params(params, num_sims, stu_per_sim)
    end = dt.now()
    times.append((end - start).total_seconds())
    #print("Run took %f seconds" % times[-1])
    #print(result)
print("average run time: %f seconds" % np.mean(times))

average run time: nan seconds


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [19]:
#bounds = [(0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
#          (0.7, 0.3, 0.8, 0.05, 0.3, 0.1, 1.0, 0.2)]
bounds = [(0.2,0.7), (0.01,0.3), (0.1,0.8), (0.01,0.05),
        (0.01,0.3), (0.01,0.1), (0.2,1.0),(0.01,0.2)]
#init_params = list(params.values())
#print("Initial parameters: %s" % str(init_params))
print("Bounds: %s" % str(bounds))
start = dt.now()
min_result = optimize.shgo(eval_sim_params, bounds, args=(num_sims, stu_per_sim))
end = dt.now()
print("operation took: %s" % str((end - start)))
print("Minimize result: %s" % str(min_result))

ERROR:main:running eval sim 2 times with params: [0.2  0.01 0.1  0.01 0.01 0.01 0.2  0.01]


Bounds: [(0.2, 0.7), (0.01, 0.3), (0.1, 0.8), (0.01, 0.05), (0.01, 0.3), (0.01, 0.1), (0.2, 1.0), (0.01, 0.2)]


ERROR:main:Run took 1.245914 seconds with mean err: 12.305095	 stdev err: 2.896014
ERROR:main:running eval sim 2 times with params: [0.7  0.3  0.8  0.05 0.3  0.1  1.   0.2 ]
ERROR:main:Run took 0.717358 seconds with mean err: 4.953019	 stdev err: 0.043291
ERROR:main:running eval sim 2 times with params: [0.7  0.01 0.1  0.01 0.01 0.01 0.2  0.01]
ERROR:main:Run took 3.211063 seconds with mean err: 6.577959	 stdev err: 1.331609
ERROR:main:running eval sim 2 times with params: [0.7  0.3  0.1  0.01 0.01 0.01 0.2  0.01]
ERROR:main:Run took 1.332894 seconds with mean err: 6.366624	 stdev err: 3.783213
ERROR:main:running eval sim 2 times with params: [0.7  0.3  0.8  0.01 0.01 0.01 0.2  0.01]
ERROR:main:Run took 0.388587 seconds with mean err: 4.359818	 stdev err: 0.111436
ERROR:main:running eval sim 2 times with params: [0.7  0.3  0.8  0.05 0.01 0.01 0.2  0.01]
ERROR:main:Run took 1.037300 seconds with mean err: 4.296689	 stdev err: 0.157219
ERROR:main:running eval sim 2 times with params: [0.

operation took: 0:13:45.968088
Minimize result:      fun: 11.917028956240312
    funl: array([11.91702896])
 message: 'Optimization terminated successfully.'
    nfev: 266
     nit: 2
   nlfev: 9
   nlhev: 0
   nljev: 1
 success: True
       x: array([0.7 , 0.3 , 0.1 , 0.05, 0.3 , 0.01, 0.2 , 0.01])
      xl: array([[0.7 , 0.3 , 0.1 , 0.05, 0.3 , 0.01, 0.2 , 0.01]])


#### Optimial Result:


m_l0: 0.45 <br>
sd_l0: 0.155  <br>
m_t: 0.45  <br>
sd_t: 0.03  <br>
m_s: 0.155 <br>
sd_s: 0.055 <br>
m_g: 0.6 <br>
sd_g: 0.105 <br>

In [None]:

bnds = Bounds([0.2, 0.01, 0.1, 0.01, 0.01, 0.01, 0.2, 0.01],
               [0.7, 0.3, 0.8, 0.05, 0.3, 0.1, 1.0, 0.2])

In [None]:
# Local minima near global minima
init_params = min_result.x
print("Initial parameters: %s" % str(init_params))
print("Bounds: %s" % str(bnds))
start = dt.now()
# Commented out for now
min_result = minimize(eval_sim_params, init_params, 
                      args=(num_sims, stu_per_sim),
                      method='powell', bounds=bnds)
print("operation took: %s" % str((end - start)))
print("Minimize result: %s" % str(min_result))

 <br>### EDA of Simulation Runs