# Canonical Autocorrelation Analysis Test Run

This notebook attempts to perform data prep and runs the CAA method on the transaction data

In [1]:
import sys
sys.path.append("../lib")

In [2]:
import logging
import random
import uuid
import os
from collections.abc import Iterable

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from log_db import mongo
from CanonicalAutocorrelationAnalysis.model.caa import *
from CanonicalAutocorrelationAnalysis.model.caaObject import CAA
from CanonicalAutocorrelationAnalysis.model.utils import l1Norm, l2Norm, r2Compute

In [4]:
#logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("main")

In [5]:
#logging.getLogger().setLevel(logging.DEBUG)
logging.getLogger().setLevel(logging.INFO)
#logging.getLogger().setLevel(logging.WARNING)
logger.debug("Test debug")
logger.info("Test info")
logger.warning("Test warning")

INFO:main:Test info


In [6]:
# Get path to current project directory
cwd = os.path.abspath(".")
base_dir = os.path.abspath(os.path.join(cwd, os.pardir))
logger.debug("Base directory for the project:\n%s" % base_dir)

In [7]:
# Setup connection to database
data_out = "sim-%s" % str(uuid.uuid4())
data_path = os.path.join(base_dir,"test", "data", data_out)
logger.info("Writing simulation results to directory: %s" % data_path)
db_name = "motivsim"
db_params  = mongo.get_db_params(db_name)
logger.info("got db params: %s" % str(db_params))
db_util = mongo.Data_Utility(data_path, db_params)
db = db_util.db

INFO:main:Writing simulation results to directory: /rdata/Sandbox/MotivSim/test/data/sim-08473ed3-61fe-411f-83e1-5687ddb781c7
INFO:main:got db params: {'settingId': 'motivsim', 'url': 'localhost', 'port': '27017', 'name': 'motivsim', 'user': '', 'pswd': ''}


## Prepping Data

In [8]:
# Get learner transactions
tx = pd.DataFrame(db.tutor_events.find({'type': "Tutor Input"}))
logger.info("Learner Transactions: %s" % str(tx.shape))
tx.head()

INFO:main:Learner Transactions: (470030, 17)


Unnamed: 0,_id,type,time,curric_id,unit_id,section_id,prob_id,step_id,stu_id,duration,outcome,kcs,plt,plt1,hints_used,hints_avail,attempt
0,0f2db992-e2ad-43d2-8831-b01ef1f83c04,Tutor Input,2020-11-13 20:04:44.568,5b7d4068-4287-4585-a705-8ea1a72f4761,28660318-8fc7-4568-8c13-aa412261927d,2649cdbc-2676-4ffe-8e62-fd8c2544444d,9f64f057-5af4-4f06-aec8-79a2852178dc,ea76e6b2-50a9-4fe5-b047-f6560142b4ec,5332323f-0421-4dd1-b901-83091a8ac9a8,4.044265,Hint,[{'_id': '219c53ea-ba56-4768-b9fb-affbf6a54aa0...,0.44252,0.515336,0,3,0
1,9952babc-01c8-44d9-898d-28f70a99c3e5,Tutor Input,2020-11-13 20:04:46.146,5b7d4068-4287-4585-a705-8ea1a72f4761,28660318-8fc7-4568-8c13-aa412261927d,2649cdbc-2676-4ffe-8e62-fd8c2544444d,9f64f057-5af4-4f06-aec8-79a2852178dc,ea76e6b2-50a9-4fe5-b047-f6560142b4ec,5332323f-0421-4dd1-b901-83091a8ac9a8,1.577746,Hint,[{'_id': '219c53ea-ba56-4768-b9fb-affbf6a54aa0...,0.515336,0.515336,1,2,1
2,02160b6e-dc3f-4ff0-b384-2ab063596443,Tutor Input,2020-11-13 20:05:10.003,5b7d4068-4287-4585-a705-8ea1a72f4761,28660318-8fc7-4568-8c13-aa412261927d,2649cdbc-2676-4ffe-8e62-fd8c2544444d,9f64f057-5af4-4f06-aec8-79a2852178dc,ea76e6b2-50a9-4fe5-b047-f6560142b4ec,5332323f-0421-4dd1-b901-83091a8ac9a8,23.857322,Correct,[{'_id': '219c53ea-ba56-4768-b9fb-affbf6a54aa0...,0.515336,0.515336,2,1,2
3,819c8b8f-7ac7-4999-bd85-a88ad7578b5c,Tutor Input,2020-11-13 20:05:25.460,5b7d4068-4287-4585-a705-8ea1a72f4761,28660318-8fc7-4568-8c13-aa412261927d,2649cdbc-2676-4ffe-8e62-fd8c2544444d,9f64f057-5af4-4f06-aec8-79a2852178dc,6c40d097-af6e-437e-8072-943cb341f983,5332323f-0421-4dd1-b901-83091a8ac9a8,15.45675,Correct,[{'_id': '219c53ea-ba56-4768-b9fb-affbf6a54aa0...,0.515336,0.845713,0,3,0
4,31d6718e-6330-4fd3-89fb-144749adbea6,Tutor Input,2020-11-13 20:05:28.448,5b7d4068-4287-4585-a705-8ea1a72f4761,28660318-8fc7-4568-8c13-aa412261927d,2649cdbc-2676-4ffe-8e62-fd8c2544444d,69bcd12f-aaf7-4dfe-a1cd-5b97e908febe,04502906-ce8e-46e6-8ce2-1fa8e702aa9f,5332323f-0421-4dd1-b901-83091a8ac9a8,2.988336,Incorrect,[{'_id': '219c53ea-ba56-4768-b9fb-affbf6a54aa0...,0.845713,0.667008,0,3,0


In [20]:
# Add kc field that reduces list of kcs to 1 kc
tx['kc'] = tx.apply(lambda x: x['kcs'][0]['_id'], axis=1)

In [21]:
# Number of transactions with negative duration
# This is a sanity check
count = np.sum(tx['duration'] < 0)
logger.info("Number of transactions with negative duration: %i out of %i(%.2f%%)" % (count, tx.shape[0], count * 100 / tx.shape[0]))

INFO:main:Number of transactions with negative duration: 0 out of 447009(0.00%)


In [22]:
for col in tx.columns:
    print(col)

_id
type
time
curric_id
unit_id
section_id
prob_id
step_id
stu_id
duration
outcome
kcs
plt
plt1
hints_used
hints_avail
attempt
kc


In [23]:
data = tx[['duration', 'plt', 'hints_used', 'hints_avail', 'attempt', 'outcome']]
outcomes = data['outcome'].unique()
for i, outcome in enumerate(outcomes):
    print(f"{outcome} = {i}")
    data.loc[data['outcome'] == outcome, 'outcome'] = i
# d = tx[['duration', 'kc', 'plt', 'hints_used', 'hints_avail', 'attempt', 'outcome']]
data.index = tx['_id']
data.head()

Correct = 0
Incorrect = 1
Hint = 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0_level_0,duration,plt,hints_used,hints_avail,attempt,outcome
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bbcd5476-b03e-45d1-99ec-50f449be0d74,16.516351,0.356412,0,3,0,0
a921cc4e-5c38-47ac-9d2a-0310d632efc2,2.339306,0.768261,0,3,0,1
82b2d930-db3a-40f0-beea-b64bfc4222f4,21.815426,0.840939,0,3,1,0
f5218ca0-e505-4ba6-83f1-5004c3486ebb,17.616393,0.840939,0,3,0,0
de188246-2d95-44c6-a2eb-4f1fcbfbeb91,22.452271,0.949197,0,3,0,1


In [24]:
## Running CAA

In [25]:
row, features = data.shape

In [26]:
std = np.std(data, axis = 0)
std[std == 0] = 1
X = (data - np.mean(data, axis = 0)) / std

In [27]:
penalty1 = 0.35
penalty2 = 0.35

maxProj = features
minr2 = None
scale = True
doubleInit = True
orthogonality = False

In [28]:
Co = X.T.dot(X) / row

In [29]:
# Remove diagonal values to avoid max
diag_rows, diag_cols = np.diag_indices_from(Co)
for tup in zip(diag_rows, diag_cols):
    Co.iloc[tup[0],tup[1]] = 0
    
Co

Unnamed: 0,duration,plt,hints_used,hints_avail,attempt,outcome
duration,0.0,0.0579804,0.0052377,-0.0052377,0.00399884,-0.408473
plt,0.0579804,0.0,-0.291047,0.291047,-0.319416,-0.21855
hints_used,0.0052377,-0.291047,0.0,-1.0,0.760946,0.0809419
hints_avail,-0.0052377,0.291047,-1.0,0.0,-0.760946,-0.0809419
attempt,0.00399884,-0.319416,0.760946,-0.760946,0.0,0.109113
outcome,-0.408473,-0.21855,0.0809419,-0.0809419,0.109113,0.0


In [30]:
uList, vList, rSquare, dList = [], [], [], []

In [31]:
for _ in range(maxProj):
    maxCorr = np.unravel_index(np.argmax(np.abs(Co), axis=None), Co.shape)

    u, v = np.zeros((1, features)), np.zeros((1, features))
    u[0, maxCorr[0]] = 1.
    v[0, maxCorr[1]] = 1.

    if doubleInit:
        c1 = c2 = 0.5 * np.sqrt(features)
        try:
            u, v = computeProjection(Co, u, v, c1, c2)
        except:
            result = CAA(uList, vList, dList, rSquare, penalty1, penalty2, data)

    c1 = penalty1 * np.sqrt(features)
    c2 = penalty2 * np.sqrt(features)
    try:
        u, v = computeProjection(Co, u, v, c1, c2)
    except:
        result = CAA(uList, vList, dList, rSquare, penalty1, penalty2, data)
    d = np.dot(np.dot(u,Co),v.T).flatten()
    r = r2Compute(u, v, X)
    

    # Append values to the list
    if minr2 is None or r >= minr2:
        uList.append(u)
        vList.append(v)
        dList.append(d)
        rSquare.append(r)

    # Update Correlation Matrix
    Co -= d * (np.matmul(u.T,v) + np.matmul(v.T,u))
    if orthogonality:
        selection = np.ones_like(Co)
        notNull = (np.abs(u) + np.abs(v) != 0).flatten()
        selection[notNull,:] = 0
        selection[:,notNull] = 0
        Co[selection == 0] = 0

result = CAA(uList, vList, dList, rSquare, penalty1, penalty2, data)

In [32]:
print(result)

CAA
Projection 
	 u => [[0.0 1.0442764297686224e-05 -0.8335308812040942 0.0 -0.5524728680850081
  0.0]]
	 v => [[0.0 0.0 0.0 0.9999999999806073 0.0 -6.227806606510826e-06]]
	 d => [1.2539365300600642]
Projection 
	 u => [[0.0 -2.3566103130557086e-06 0.9999999999972232 0.0 0.0 0.0]]
	 v => [[0.0 0.0 0.0 0.0 0.9999999999995992 8.952681107598728e-07]]
	 d => [0.7609464538572956]
Projection 
	 u => [[-0.8602344698905922 -0.509898672639855 0.0 0.0 2.1264198166580708e-05
  0.0]]
	 v => [[0.0 0.0 0.11957192241557353 -0.11955772839479199 0.0 0.985600580839379]]
	 d => [0.4905686498054277]
Projection 
	 u => [[0.0 -0.9999999994988227 0.0 0.0 0.0 3.165998652235883e-05]]
	 v => [[0.0 0.0 7.940371839386965e-05 0.0 0.9999999968475247 0.0]]
	 d => [0.31943884547107776]
Projection 
	 u => [[0.0 0.9999999999900954 0.0 0.0 0.0 -4.450762179394732e-06]]
	 v => [[0.0 0.0 0.0 1.0 0.0 0.0]]
	 d => [0.2611284950146506]
Projection 
	 u => [[0.0 -0.9999999984968208 0.0 0.0 0.0 5.483026858976103e-05]]
	 v => [[

In [33]:
p1 = result.projectPoints(data,1)

In [87]:
caa.plt(result.columns, True)

NameError: name 'caa' is not defined

In [28]:
caa = CAAComputation(d, 0.35, 0.35)
print(caa)
caa.plt(data.columns, True)

AssertionError: Number of manager items must equal union of block items
# manager items: 351323, # tot_items: 6

In [18]:
1/7

0.14285714285714285