In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

### Load data with prediction

In [2]:
# Read the tab-separated prediction file `bkt_predict.txt` into a DataFrame.
# NOTE(review): the absolute `E:/...` path is machine-specific; the file must
# exist at exactly this location for the notebook to run.
bkt = pd.read_csv('E:/Project/AL/Data/Combine/bkt_predict.txt', sep = '\t')

In [3]:
# Read the tab-separated prediction file `random_predict.txt` into a DataFrame.
# NOTE(review): the name `random` would shadow the stdlib `random` module if
# that module were ever imported in this notebook.
random = pd.read_csv('E:/Project/AL/Data/Combine/random_predict.txt', sep = '\t')

In [4]:
# Read the tab-separated prediction file `streak_predict.txt` into a DataFrame.
streak = pd.read_csv('E:/Project/AL/Data/Combine/streak_predict.txt', sep = '\t')

In [5]:
# Read the tab-separated prediction file `revise_predict.txt` into a DataFrame.
revise = pd.read_csv('E:/Project/AL/Data/Combine/revise_predict.txt', sep = '\t')

### How many KCs in each model

In [6]:
# Map each model's display label to its loaded prediction DataFrame.
# The insertion order here is the order in which the models are iterated.
dic = {
    'BKT': bkt,
    'Random': random,
    'Streak': streak,
    'Revised BKT': revise,
}

In [7]:
def numberKC(dic, colname):
    """Count how often each value of `colname` occurs in every model's frame.

    Parameters
    ----------
    dic : mapping of str -> pandas.DataFrame
        Model label -> data for that model. Every frame must contain `colname`.
    colname : str
        Column whose distinct values (the KCs) are counted.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of `colname` (sorted by value), one column
        per model label (in the mapping's iteration order). Cells are integer
        occurrence counts; a value that never occurs in a model is reported
        as 0. An empty mapping yields an empty DataFrame.
    """
    names = list(dic)
    if not names:
        # pd.concat raises on an empty list; an empty table is more useful.
        return pd.DataFrame()

    # value_counts() skips NaN, so rows without a KC are not counted.
    counts = [dic[name][colname].value_counts() for name in names]

    # `keys` labels each Series' column with its model name.
    res = pd.concat(counts, axis = 1, keys = names)

    # A KC that is absent from a model comes out of the outer join as NaN and
    # turns the whole column into floats; an absent KC simply has a count of 0.
    res = res.fillna(0).astype('int64')

    # Sort once on the combined index: the union of differing per-model
    # indexes is not guaranteed to be ordered.
    return res.sort_index()

In [8]:
# ptype_selection
numberKC(dic, 'KC (ptype_selection)')

Unnamed: 0,BKT,Random,Streak,Revised BKT
AD JCommTable4.R0C0,184,450,755,174
AD JCommTable4.R1C0,194,452,689,165
AD JCommTable5.R0C0,220,541,756,192
AD JCommTable5.R1C0,201,493,699,183
AD JCommTable6.R0C0,323,607,878,317
AD JCommTable6.R1C0,317,689,947,299
AD JCommTable8.R0C0,166,420,671,149
AD done,196,448,738,176
AS JCommTable4.R0C0,15,21,4,17
AS JCommTable4.R1C0,14,26,30,18


In [9]:
# selected
numberKC(dic, 'KC (Selected)')

Unnamed: 0,BKT,Random,Streak,Revised BKT
JCommTable4.R0C0,212,494,824,209
JCommTable4.R1C0,266,567,847,238
JCommTable5.R0C0,238,557,781,207
JCommTable5.R1C0,245,563,733,228
JCommTable6.R0C0,942,2177,1665,923
JCommTable6.R1C0,928,2252,1715,891
JCommTable8.R0C0,248,525,767,231
done,810,2005,1503,784


### Find the max Opportunity (ptype_selection) and related Prediction

For each student and KC, the prediction at the row with max(Opportunity (ptype_selection)) turns out to be max(Prediction) as well.

In [10]:
def findMaxOppPreKC(df):
    """Summarise, per KC, the highest opportunity each student reached and its prediction.

    For every (KC, student) pair the row with the largest
    'Opportunity (ptype_selection)' is selected (the first such row on ties,
    as `idxmax` does). Those rows are then aggregated per KC.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'KC (ptype_selection)', 'Anon Student Id',
        'Opportunity (ptype_selection)' and 'Prediction'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'KC (ptype_selection)' (sorted), with two-level columns:
        ('Opportunity (ptype_selection)', 'mean' / 'count') and
        ('Prediction', 'mean' / 'count').
    """
    kc_col = 'KC (ptype_selection)'
    opp_col = 'Opportunity (ptype_selection)'

    # idxmax returns index *labels*. If the incoming index is not unique
    # (e.g. the frame was concatenated or filtered), `.loc` would pull in every
    # row sharing a label and inflate the counts. A fresh positional index on a
    # copy guarantees one row per (KC, student) pair.
    rows = df.reset_index(drop = True)

    maxOppIdx = rows.groupby([kc_col, 'Anon Student Id'])[opp_col].idxmax()
    maxOppRows = rows.loc[maxOppIdx]  # one row per (KC, student): where Opportunity is max

    maxOppPre = (
        maxOppRows
        .groupby(kc_col)
        .agg({opp_col: ['mean', 'count'], 'Prediction': ['mean', 'count']})
        .sort_index()
    )
    return maxOppPre

In [11]:
findMaxOppPreKC(bkt)

Unnamed: 0_level_0,Opportunity (ptype_selection),Opportunity (ptype_selection),Prediction,Prediction
Unnamed: 0_level_1,mean,count,mean,count
KC (ptype_selection),Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AD JCommTable4.R0C0,6.133333,30,0.5563486,30
AD JCommTable4.R1C0,6.466667,30,0.6449583,30
AD JCommTable5.R0C0,7.333333,30,0.517132,30
AD JCommTable5.R1C0,6.7,30,0.5733713,30
AD JCommTable6.R0C0,10.766667,30,0.4403629,30
AD JCommTable6.R1C0,10.566667,30,0.4135227,30
AD JCommTable8.R0C0,5.533333,30,0.7865404,30
AD done,6.533333,30,0.8057804,30
AS JCommTable4.R0C0,1.25,12,1.027582e-09,12
AS JCommTable4.R1C0,1.166667,12,1.655367e-12,12


In [12]:
findMaxOppPreKC(random)

Unnamed: 0_level_0,Opportunity (ptype_selection),Opportunity (ptype_selection),Prediction,Prediction
Unnamed: 0_level_1,mean,count,mean,count
KC (ptype_selection),Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AD JCommTable4.R0C0,15.0,30,0.9400394,30
AD JCommTable4.R1C0,15.066667,30,0.9843925,30
AD JCommTable5.R0C0,18.033333,30,0.9128168,30
AD JCommTable5.R1C0,16.433333,30,0.9232358,30
AD JCommTable6.R0C0,20.233333,30,0.7418813,30
AD JCommTable6.R1C0,22.966667,30,0.7805884,30
AD JCommTable8.R0C0,14.0,30,0.9996225,30
AD done,14.933333,30,0.9995931,30
AS JCommTable4.R0C0,1.5,14,1.062012e-09,14
AS JCommTable4.R1C0,1.368421,19,1.669109e-12,19


In [13]:
findMaxOppPreKC(streak)

Unnamed: 0_level_0,Opportunity (ptype_selection),Opportunity (ptype_selection),Prediction,Prediction
Unnamed: 0_level_1,mean,count,mean,count
KC (ptype_selection),Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AD JCommTable4.R0C0,25.166667,30,0.9890989,30
AD JCommTable4.R1C0,22.966667,30,0.9954073,30
AD JCommTable5.R0C0,25.2,30,0.9678532,30
AD JCommTable5.R1C0,23.3,30,0.9685047,30
AD JCommTable6.R0C0,29.266667,30,0.8804822,30
AD JCommTable6.R1C0,31.566667,30,0.8898023,30
AD JCommTable8.R0C0,22.366667,30,0.9998819,30
AD done,24.6,30,0.9999605,30
AS JCommTable4.R0C0,1.0,4,1.529234e-09,4
AS JCommTable4.R1C0,1.428571,21,1.798616e-12,21


In [14]:
findMaxOppPreKC(revise)

Unnamed: 0_level_0,Opportunity (ptype_selection),Opportunity (ptype_selection),Prediction,Prediction
Unnamed: 0_level_1,mean,count,mean,count
KC (ptype_selection),Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AD JCommTable4.R0C0,5.8,30,0.5471949,30
AD JCommTable4.R1C0,5.5,30,0.5951867,30
AD JCommTable5.R0C0,6.4,30,0.4782522,30
AD JCommTable5.R1C0,6.1,30,0.5494748,30
AD JCommTable6.R0C0,10.566667,30,0.4322738,30
AD JCommTable6.R1C0,9.966667,30,0.3954662,30
AD JCommTable8.R0C0,4.966667,30,0.7697914,30
AD done,5.866667,30,0.7936184,30
AS JCommTable4.R0C0,1.416667,12,9.16232e-10,12
AS JCommTable4.R1C0,1.2,15,1.497129e-12,15


### How many questions of each type were given to each student in each model

In [15]:
def ptypeNumEachStudentConcat(df):
    """Average number of distinct problems of each type given to a student, per model.

    Parameters
    ----------
    df : mapping of str -> pandas.DataFrame
        Model label -> data for that model (despite the parameter name, this
        is the whole mapping, not a single frame). Every frame must contain
        the columns 'Anon Student Id' and 'Problem Name'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Problem Type' (the first two characters of
        'Problem Name', sorted), one column per model label. Each cell is the
        mean, over the students who received at least one problem of that
        type, of the number of distinct problems of that type. A type that
        does not occur in a model is NaN. An empty mapping yields an empty
        DataFrame.

    Side effects
    ------------
    A 'Problem Type' column is added to (or overwritten in) every frame of the
    mapping. This is kept from the original implementation in case later
    cells read that column.
    """
    mean_arr, names = [], []
    # Iterate the mapping that was passed in rather than a notebook global.
    for key, frame in df.items():
        frame['Problem Type'] = frame['Problem Name'].str[0:2] # create 'Problem Type' column
        problem = frame[['Anon Student Id', 'Problem Name', 'Problem Type']].drop_duplicates()
        # Distinct problems per (type, student). Only the 'Problem Name' counts
        # are selected, so the student id column never reaches the mean below
        # (string ids would raise there; numeric ids would add a bogus column).
        problem_cnt = problem.groupby(['Problem Type', 'Anon Student Id'])['Problem Name'].count()
        cnt_mean = problem_cnt.groupby(level = 'Problem Type').mean()
        mean_arr.append(cnt_mean)
        names.append(key)

    if not mean_arr:
        # pd.concat raises on an empty list.
        return pd.DataFrame()

    # `keys` labels each Series' column with its model name.
    res = pd.concat(mean_arr, axis = 1, keys = names)
    return res.sort_index()

In [16]:
ptypeNumEachStudentConcat(dic)

Unnamed: 0_level_0,BKT,Random,Streak,Revised BKT
Problem Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AD,4.466667,14,11.066667,4.066667
AS,3.2,10,5.7,2.866667
M,12.0,40,12.933333,12.233333
