## import packages

In [109]:
import numpy as np
import pandas as pd
import math
import statistics

## Load data

In [110]:
raw_df = pd.read_excel("../question_database_schema.xlsx", sheet_name="student_question_responses")
key_df = pd.read_excel("../question_database_schema.xlsx", sheet_name="answer_choices")
key_df = key_df[key_df['is_distractor'] == 0]

## Create single exam/version df for Rasch modeling

In [111]:
# Definitions to form 0/1 dataframe with each row being a student and each column being a single exam/version question
def exam_num_ver_df(num_and_ver, df):
    exam_mask = df.apply(lambda x: x['question_id'].startswith(num_and_ver), axis=1)
    exam_df = df[exam_mask]
    return exam_df

def questions_to_columns(df):
    list_of_unique_id_dicts=[]
    for unique_id in df['student_id'].unique():
        unique_id_df = df[df['student_id'] == unique_id]
        temp_dict={'id': unique_id}

        question_list = unique_id_df['question_id'].tolist()
        selection_list = unique_id_df['selected_option'].tolist()
        
        for index in range(0, len(question_list)):
            temp_dict[question_list[index]]=selection_list[index]
        
        list_of_unique_id_dicts.append(temp_dict)

    restructured_df=pd.DataFrame(list_of_unique_id_dicts)
    restructured_df=restructured_df.set_index('id', drop=True)
    bad_question=['4A01', '4B01', '4C01']
    for bad_q_label in bad_question:
        if bad_q_label in restructured_df.keys():
            restructured_df.drop(bad_q_label, axis=1, inplace=True)
    return restructured_df

def create_num_ver_key_dict(num_and_ver, key_df):
    exam_key_df = exam_num_ver_df(num_and_ver, key_df)

    question_ids = exam_key_df['question_id'].tolist()

    answer_series=exam_key_df['option_id'].replace({'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5'}) # avoids downcast warning
    answer_series=answer_series.astype(int) #manually forces downcasting
    answers = answer_series.tolist()

    key_dict = {}
    for index in range(0, len(question_ids)):
        key_dict[question_ids[index]] = answers[index]
    return key_dict

def compare_row_to_dict(row, dict_to_compare):
    return row.eq(pd.Series(dict_to_compare))

def true_false_df(df, key_dict):
    tf_df = df.apply(compare_row_to_dict, axis=1, args=(key_dict,))
    return tf_df

## Definitions to manipulate raw dfs to create dfs by exam

In [112]:
# Remove students & questions with 100% scores and 0% scores
def remove_issue_scores(df):
    temp_df = df.copy()
    len_of_key = len(df.keys())
    temp_df['score']=temp_df.sum(axis=1)

    if len_of_key in temp_df['score'].unique():
        drop_list_100s=temp_df[temp_df['score'] == len_of_key].index.to_list()
        temp_df.drop(drop_list_100s, inplace=True)
        print(f'{len(drop_list_100s)} 100% scores were dropped.')
    if 0 in temp_df['score'].unique():
        drop_list_0s=temp_df[temp_df['score'] == 0].index.to_list()
        temp_df.drop(drop_list_0s, inplace=True)
        print(f'{len(drop_list_0s)} 0% scores were dropped.')
    temp_df.drop(['score'], axis=1, inplace=True)

    question_score_series=temp_df.mean(axis=0)
    
    question_0_score_series=question_score_series.where(question_score_series == 0).dropna()
    if len(question_0_score_series) > 0:
        q0_index_list=question_0_score_series.index.to_list()
        temp_df.drop(q0_index_list, axis=1, inplace=True)

    question_100_score_series=question_score_series.where(question_score_series == 1).dropna()
    if len(question_100_score_series) > 0:
        q100_index_list=question_100_score_series.index.to_list()
        temp_df.drop(q100_index_list, axis=1, inplace=True)

    
    return temp_df

def collect_all_exam_numbers_and_forms(df):
    all_question_ids=df['question_id'].tolist()
    all_exam_numbers_and_forms=list(set([x[0:2] for x in all_question_ids]))
    return all_exam_numbers_and_forms

def create_true_false_for_all_exams(full_df, key_df, all_exam_numbers_and_forms):
    list_of_tf_dfs=[]
    for exam_num_and_form in all_exam_numbers_and_forms:
        temp_exam_df=exam_num_ver_df(exam_num_and_form, full_df)
        temp_exam_responses_df=questions_to_columns(temp_exam_df)
        temp_exam_answer_key=create_num_ver_key_dict(exam_num_and_form, key_df)
        temp_exam_tf_df=true_false_df(temp_exam_responses_df, temp_exam_answer_key).astype(int)
        list_of_tf_dfs.append({'exam_num_and_form': exam_num_and_form, 'true_false_df': temp_exam_tf_df})
    return list_of_tf_dfs

## Definitions for Rasch calculations

In [113]:
def ability_estimate(row):
    return math.log(row['avg_student_score'] / (1 - row['avg_student_score']))

def difficulty_estimate(col):
    return math.log((1 - col['avg_question_score'] ) / col['avg_question_score'])

def approximate_ability_and_difficulty(df):
    temp_df=df.copy()
    temp_df['avg_student_score']=temp_df.mean(axis=1)
    theta_s=temp_df.apply(ability_estimate, axis=1).tolist()
    temp_df.drop(['avg_student_score'], axis=1, inplace=True)

    temp_df.loc['avg_question_score'] = temp_df.mean(axis=0)
    beta_i_non_normal = temp_df.apply(difficulty_estimate, axis=0)
    temp_df.drop(['avg_question_score'], inplace=True)
    avg_beta_i = beta_i_non_normal.mean()

    beta_i = beta_i_non_normal - avg_beta_i
    beta_i_keys=beta_i.keys().tolist() # Question names for dict keys

    return {'beta_i_keys': beta_i_keys, 'beta_i': beta_i, 'theta_s': theta_s}

def iterate_variable_estimates(variable_estimates_dict, variance_df, residuals_df):
    beta_i=variable_estimates_dict['beta_i']
    beta_i_keys=variable_estimates_dict['beta_i_keys']
    theta_s=variable_estimates_dict['theta_s']

    new_beta_i=[]
    for beta_index in range(0, len(beta_i)):
        temp_key=beta_i_keys[beta_index]
        residual_col_sum=residuals_df[temp_key].sum()
        variance_col_sum=variance_df[temp_key].sum()
        temp_new_beta = beta_i[beta_index] - (residual_col_sum / variance_col_sum)
        new_beta_i.append(temp_new_beta)

    temp_new_theta_s=[]
    for theta_index in range(0, len(theta_s)):
        residual_row_sum=residuals_df.loc[theta_index].sum()
        variance_row_sum=variance_df.loc[theta_index].sum()
        temp_new_theta = theta_s[theta_index] + (residual_row_sum/variance_row_sum)
        temp_new_theta_s.append(temp_new_theta)
    theta_mean=statistics.fmean(temp_new_theta_s)
    new_theta_s=[x-theta_mean for x in temp_new_theta_s]

    return {'beta_i_keys': beta_i_keys, 'beta_i': new_beta_i, 'theta_s': new_theta_s}

def calc_expected_values(variable_estimates_dict):
    beta_i_keys=variable_estimates_dict['beta_i_keys']
    beta_i=variable_estimates_dict['beta_i']
    theta_s=variable_estimates_dict['theta_s']

    list_of_ev_dicts=[]
    for theta_index in range(0, len(theta_s)):
        temp_ev_dict={}
        for beta_index in range(0, len(beta_i)):
            exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
            temp_ev_dict[beta_i_keys[beta_index]] = exp_vars / (1 + exp_vars)
#            try:
#                exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
#                temp_ev_dict[beta_i_keys[beta_index]] = exp_vars / (1 + exp_vars)
#            except OverflowError:
#                temp_ev_dict[beta_i_keys[beta_index]] = 0
        list_of_ev_dicts.append(temp_ev_dict)

    ev_df=pd.DataFrame(list_of_ev_dicts)
    return ev_df

def calc_est_var(df):
    return df.apply(lambda x: x*(1-x))

def calc_sum_sqr_residuals(df):
    temp_series_sum = df.sum(axis=1)
    temp_series_sum = temp_series_sum.pow(2)
    sum_of_sqrs = temp_series_sum.sum()
    return sum_of_sqrs

# NO USE RIGHT NOW
def practice_multiplying_df(df):
    # Take diff_est_normalized row, create df copy for each student, then multiply df by ability_est
    beta_i=df.loc['diff_est_normalized'].copy().drop(['avg_student_score', 'ability_est'])
    theta_s=df['ability_est'].copy().drop(['avg_question_score', 'diff_est_raw', 'diff_est_normalized']).reset_index(drop=True)
    ev_df=pd.DataFrame([beta_i]*len(theta_s)).reset_index(drop=True)
    ev_df=ev_df.multiply(theta_s, axis='index')
    print(ev_df)

## Rasch calculation definition

In [123]:
def build_rasch_model(base_df):
    student_ids=base_df.index.tolist()
    first_iteration=1
    sum_sqr_res=1
    iteration_num=0
    while sum_sqr_res > 0.0001:
        if first_iteration==1:
            iteration_num=1
            first_iteration=0
            variable_estimates_dict=approximate_ability_and_difficulty(base_df)
        else:
            iteration_num+=1
            variable_estimates_dict=iterate_variable_estimates(variable_estimates_dict, est_var_ex_vals_df, residuals_df)
        expected_values_df=calc_expected_values(variable_estimates_dict)
        est_var_ex_vals_df=calc_est_var(expected_values_df)
        base_df.index=expected_values_df.index
        residuals_df=base_df-expected_values_df
        sum_sqr_res=calc_sum_sqr_residuals(residuals_df)

    fit_df=residuals_df.pow(2)/est_var_ex_vals_df
    fit_df.index=student_ids

    var_estimates_students=pd.Series(variable_estimates_dict['theta_s'], index=student_ids)
    var_estimates_items=pd.Series(variable_estimates_dict['beta_i'], index=variable_estimates_dict['beta_i_keys'])

    outfit_students=fit_df.mean(axis=1)
    outfit_students.index=student_ids
    outfit_items=fit_df.mean(axis=0)

    infit_students=residuals_df.pow(2).sum(axis=1)/est_var_ex_vals_df.sum(axis=1)
    infit_students.index=student_ids
    infit_items=residuals_df.pow(2).sum(axis=0)/est_var_ex_vals_df.sum(axis=0)

    bad_infit_items=infit_items.where(infit_items > 1.3).dropna()
    bad_infit_students=infit_students.where(infit_students > 1.3).dropna()

    bad_outfit_items=outfit_items.where(outfit_items > 1.3).dropna()
    bad_outfit_students=outfit_students.where(outfit_students > 1.3).dropna()

    return {'fit_df': fit_df, 
            'var_estimates_students': var_estimates_students, 
            'var_estimates_items': var_estimates_items, 
            'outfit_students': outfit_students, 
            'outfit_items': outfit_items,
            'infit_students': infit_students,
            'infit_items': infit_items,
            'bad_infit_items': bad_infit_items,
            'bad_infit_students': bad_infit_students, 
            'bad_outfit_items': bad_outfit_items,
            'bad_outfit_students': bad_outfit_students
            }

## Run Rasch for all exams

In [124]:
all_exam_numbers_and_forms=collect_all_exam_numbers_and_forms(raw_df)
list_of_tf_dfs=create_true_false_for_all_exams(raw_df, key_df, all_exam_numbers_and_forms)

list_of_rasch_dicts=[]
for exam_df in list_of_tf_dfs:
    no_error_exam_df=remove_issue_scores(exam_df['true_false_df'])
    rasch_dict=build_rasch_model(no_error_exam_df)
    rasch_dict['exam_num_and_form']=exam_df['exam_num_and_form']
    rasch_dict['true_false_df']=exam_df['true_false_df']
    list_of_rasch_dicts.append(rasch_dict)

2 100% scores were dropped.
5 100% scores were dropped.
4 100% scores were dropped.
6 100% scores were dropped.


  exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
  temp_new_beta = beta_i[beta_index] - (residual_col_sum / variance_col_sum)
  exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
  temp_new_beta = beta_i[beta_index] - (residual_col_sum / variance_col_sum)
  exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
  temp_new_beta = beta_i[beta_index] - (residual_col_sum / variance_col_sum)
  exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
  temp_new_beta = beta_i[beta_index] - (residual_col_sum / variance_col_sum)
  exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
  temp_new_beta = beta_i[beta_index] - (residual_col_sum / variance_col_sum)
  exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
  temp_new_beta = beta_i[beta_index] - (residual_col_sum / variance_col_sum)


4 100% scores were dropped.
1 100% scores were dropped.
4 100% scores were dropped.
1 100% scores were dropped.


  exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
  temp_new_beta = beta_i[beta_index] - (residual_col_sum / variance_col_sum)
  exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
  temp_new_beta = beta_i[beta_index] - (residual_col_sum / variance_col_sum)
  exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
  temp_new_beta = beta_i[beta_index] - (residual_col_sum / variance_col_sum)
  exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
  temp_new_beta = beta_i[beta_index] - (residual_col_sum / variance_col_sum)
  exp_vars=math.exp(theta_s[theta_index] - beta_i[beta_index])
  temp_new_beta = beta_i[beta_index] - (residual_col_sum / variance_col_sum)


In [125]:
for rasch_dict in list_of_rasch_dicts:
    print(rasch_dict['bad_infit_items'])
    print(rasch_dict['bad_infit_students'])
    print(rasch_dict['bad_outfit_items'])
    print(rasch_dict['bad_outfit_students'])
    print()

Series([], dtype: float64)
14912309    1.327676
17922917    1.364306
19361270    1.451542
26307414    1.370968
34836375    1.498880
42394523    1.321403
51941565    1.454762
79512547    1.320773
95799545    1.503108
dtype: float64
2B02    1.408174
2B12    1.400089
dtype: float64
14912309    1.363869
17922917    1.520250
19361270    1.463223
23136659    1.367501
26307414    1.409273
33984225    1.321121
34836375    1.626703
36514549    1.305103
51941565    2.179857
57172791    1.304456
65841008    1.552292
79512547    1.378853
84960036    1.437813
91579986    1.351533
91802235    2.355638
95799545    2.259570
dtype: float64

Series([], dtype: float64)
2905547     1.327377
13030101    2.641948
15300931    1.697545
17954414    1.765837
19361270    1.324288
84960036    2.683535
91579986    1.317225
96834233    2.025921
dtype: float64
3B07    1.309980
3B08    3.748434
3B10    1.446412
3B16    1.440265
dtype: float64
7913559      1.449928
11963986     2.333984
13030101     2.244683
13421997 

In [126]:
list_of_rasch_dicts[0]

{'fit_df':               2B01      2B02      2B03      2B04      2B05      2B06  \
 2905547   0.764923  0.864144  0.108332  0.216220  0.180862  0.350193   
 3204944   0.582305  0.384906  0.243214  0.485430  0.406050  1.271922   
 7493959   0.413650  3.657323  0.342379  1.463374  1.749457  1.106770   
 13030101  0.847374  0.560117  0.167134  0.333582  0.279032  1.850909   
 13036341  0.299343  0.197867  0.473119  0.944297  1.266016  1.529401   
 ...            ...       ...       ...       ...       ...       ...   
 95799545  0.764923  1.157214  0.108332  4.624928  5.529080  0.350193   
 96301836  0.847374  0.560117  0.167134  0.333582  0.279032  0.540275   
 96834233  0.159419  0.105377  0.888379  0.563980  1.483162  0.348218   
 97686953  0.442864  0.669987  0.062721  0.125184  0.104713  0.202750   
 99015981  2.258031  1.492567  0.062721  0.125184  0.104713  0.202750   
 
               2B07      2B08      2B09       2B10      2B11      2B12  \
 2905547   2.649563  0.084774  6.07387