In [13]:
import os
# Point rpy2 at the local R installation; this is set before the rpy2 extension is loaded below.
# NOTE(review): machine-specific absolute path that embeds a local user name - the cell
# will not run elsewhere unless this is changed to that machine's R install directory.
os.environ["R_HOME"] = r"C:\Users\rohit.daniel\AppData\Local\Programs\R\R-4.2.0"
%load_ext rpy2.ipython
from rpy2 import robjects

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [15]:
# Look up the object named 'pi' in the embedded R session through rpy2 and display it.
pi = robjects.r['pi']
pi

0
3.141593


In [10]:
%%R
# NOTE(review): `x` is not assigned in any cell visible in this notebook; on a fresh
# kernel this cell presumably fails unless `x` is created in R first - confirm or remove.
x

[1] 1


In [16]:
# Show the Python-side type of the object rpy2 returned for R's `pi`.
type(pi)

rpy2.robjects.vectors.FloatVector

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import regex as re
import seaborn as sns
# Remove pandas' display limits so that all columns and all rows are rendered in outputs.
# NOTE(review): with no row limit, displaying a whole frame prints every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

<h1>
    Merge raw data files
</h1>

In [5]:
# Load the two raw exports that will be merged; both use the `_id` column as their index.
up_raw_1 = pd.read_excel(io="./up_mapped_data_v2.xlsx", index_col='_id')
up_raw_2 = pd.read_csv(
    filepath_or_buffer="./CSF_FLN_2022_Grade_1_Baseline-CSF_FLN_Evaluation_UP_Baseline_2022_v2.19-1666664936961.csv",
    index_col='_id',
    low_memory=False,
)

In [6]:
# True when every column of dataset 1 also exists in dataset 2
set(up_raw_1.columns).issubset(up_raw_2.columns)

True

In [8]:
# missing_cols = set(up_raw_1.columns).difference(set(up_raw_2.columns))
# missing_cols

set()

In [62]:
# Rename columns in old CSV file to facilitate concatenation
# old_cols = set(mp_raw_1.columns).difference(set(mp_raw_2.columns)) # Create list of columns that needs to be renamed

# new_cols = [re.search('^.+(?=\.\w+$)', col).group(0) for col in old_cols] # Create list of new names for the columns

# rename_cols = {} # Initialize dictionary to store old and new column names

# for old_col, new_col in zip(old_cols, new_cols):
#     rename_cols[old_col] = new_col

# mp_raw_1.rename(columns=rename_cols, inplace=True)

In [9]:
# Stack the two raw dataframes row-wise; join="inner" keeps only the columns common to both
frames_to_merge = [up_raw_1, up_raw_2]
up_raw = pd.concat(frames_to_merge, join="inner")

In [10]:
# Persist the concatenated dataframe as an Excel workbook for further cleaning and analysis
up_raw.to_excel(excel_writer="up_raw_full.xlsx")

In [76]:
# List the distinct state labels present in the merged data.
# NOTE(review): the recorded output shows 'Uttar Pradesh', 'Uttar_Pradesh' and
# 'Madhya_Pradesh' - the labels are not normalised, and a non-UP state is present; confirm.
up_raw.loc[:, 'school_details.State_label'].unique()

array(['Uttar Pradesh', 'Uttar_Pradesh', 'Madhya_Pradesh'], dtype=object)

<h1>
    Import merged raw data file
</h1>

In [11]:
# Reload the concatenated raw data for cleaning.
# No index column is requested here, so the file's columns (including `_id`, which is
# selected by name further down) are read as ordinary columns.
up_raw = pd.read_excel(io="./up_raw_full.xlsx")

<h1>
    General Data Cleaning
</h1>
<!-- <p>
    1. Remove data collected during field practice <br>
    2. Remove data with 'orphaned' or 'undefined' variables <br>
    3. Handle missing/Nan values
</p> -->

In [12]:
# Remove practice records collected with the test version of Tangerine
test_data = up_raw.loc[up_raw['buildChannel'] == 'test']
print(f"No. of test assessments: {len(test_data)}")
up_raw.drop(index=test_data.index, inplace=True)

# Remove incomplete assessments
# NOTE(review): only the string "false" is matched; if `complete` is stored as a boolean
# after the Excel round-trip, nothing will be dropped here - confirm the column's dtype.
incomplete_data = up_raw.loc[up_raw['complete'] == "false"]
print(f"No. of incomplete assessments: {len(incomplete_data)}")
up_raw.drop(index=incomplete_data.index, inplace=True)

# Remove assessments where the child's consent was recorded as 'no'
no_consent = up_raw.loc[up_raw['consent'] == 'no']
print(f"No. of assessments where children did not give consent: {len(no_consent)}")
up_raw.drop(index=no_consent.index, inplace=True)

No. of test assessments: 0
No. of incomplete assessments: 0
No. of assessments where children did not give consent: 1


In [13]:
def fix_score(scores):
    """Blank out grid items that follow the first run of four consecutive zeros.

    The scores are scanned left to right for the first window of four adjacent
    items whose integer values sum to 0. Every item after that window is
    overwritten with the string '999'. When no such window exists (including
    when fewer than four scores are given) nothing is changed.

    Args:
        scores: mutable sequence of item scores; each inspected item must be
            convertible with ``int()``.

    Returns:
        The same ``scores`` object, modified in place.
    """
    first_zero_run = next(
        (
            start
            for start in range(len(scores) - 3)
            if sum(int(scores[start + offset]) for offset in range(4)) == 0
        ),
        None,
    )
    if first_zero_run is not None:
        # Items inside the zero window keep their value; only later items are recoded.
        for position in range(first_zero_run + 4, len(scores)):
            scores[position] = '999'
    return scores

In [14]:
def fix_counting_score(scores):
    """Recode the first '0' score and everything after it as '999'.

    The scores are scanned left to right for the first item equal to the
    string '0'. That item and every later item are overwritten with the
    string '999'. If no item equals '0', nothing is changed.

    Args:
        scores: mutable sequence of item scores.

    Returns:
        The same ``scores`` object, modified in place.
    """
    # NOTE(review): only the string '0' triggers the rule; an integer 0 is not matched.
    first_zero = next(
        (position for position, value in enumerate(scores) if value == '0'),
        None,
    )
    if first_zero is not None:
        for position in range(first_zero, len(scores)):
            scores[position] = '999'
    return scores

<h2>
    A. Literacy Sub-tasks Data Cleaning
</h2>

In [15]:
# Drop data with 'Yes' for child wants to stop literacy assessment subtask.
# These flag columns can hold a mix of strings and integers (for example both 0 and '0'
# occur in literacy3_end), so the "yes" code is matched in both forms.
end_list = [col for col in up_raw.columns if re.search(r'^lit\w*end$', col)]
total_literacy_stopped = 0
up_raw_literacy = up_raw.copy()
for col in end_list:
    assessments_stopped = up_raw_literacy[up_raw_literacy[col].isin(['1', 1])]
    total_literacy_stopped += assessments_stopped.shape[0]
    up_raw_literacy.drop(assessments_stopped.index, inplace=True)
print(f"Total no. of literacy assessments stopped: {total_literacy_stopped}")

# Drop data where a particular literacy sub-task (other than valid skipable tasks) was disabled
literacy_disabled_list = ['listening_comprehension_disabled', 'oral_vocabulary_disabled', 'initial_sounds_disabled',
                          'letter_naming_untimed_disabled', 'letter_naming_timed_disabled', 'familiar_words_untimed_disabled',
                          'orf_timed_disabled', 'dictation_untimed_letters_disabled', 'dictation_untimed_words_disabled']

total_literacy_disabled = 0
for col in literacy_disabled_list:
    # NOTE(review): matches boolean True (and numeric 1) only; confirm these flags are
    # never stored as text such as 'true'.
    disabled_assessments = up_raw_literacy[up_raw_literacy[col] == True]
    up_raw_literacy.drop(disabled_assessments.index, inplace=True)
    total_literacy_disabled += disabled_assessments.shape[0]
print(f"Total no. of literacy assessments disabled: {total_literacy_disabled}")

# Save the filtered literacy data
up_raw_literacy.to_excel('up_raw_literacy_full.xlsx')

Total no. of literacy assessments stopped: 0
Total no. of literacy assessments disabled: 0


In [16]:
# Create dataframe to store sub-task total scores and percentages
# NOTE(review): `mp_literacy` is not used anywhere in the visible cells - confirm it is still needed.
mp_literacy = pd.DataFrame()


def _columns_matching(pattern):
    """Return the names of the `up_raw` columns in which `pattern` is found."""
    return [name for name in up_raw.columns if re.search(pattern, name)]


# Identification and student background columns
general_info = [
    '_id',
    'tabletUserName',
    'assessment_date',
    'school_details.State_label',
    'school_details.District_label',
    'school_details.Block_label',
    'school_details.School_label',
    'school_details.UDISE_cd_label',
]
student_info = ['SI_std_name', 'student_age', 'student_gender']

# Column groups for each literacy sub-task
literacy1 = _columns_matching(r'^literacy1\w*')
literacy2 = _columns_matching(r'^literacy2\w*')
literacy3 = _columns_matching(r'^literacy3\w*')
literacy4_ut = _columns_matching(r'^literacy4_ut_grid\S*')
literacy4_tt = _columns_matching(r'literacy4_tt_grid\S*')
literacy5_ut = _columns_matching(r'literacy5_ut_grid\S*')
literacy5_tt = _columns_matching(r'literacy5_tt_grid\S*')
literacy6 = _columns_matching(r'literacy6_tt_grid\S*')
literacy7 = _columns_matching(r'literacy7_tt_grid\S*')
literacy8 = _columns_matching(r'literacy8_ut_\S*')
literacy9a = _columns_matching(r'literacy9a_ut_grid\S')
literacy9b = _columns_matching(r'literacy9b_ut_grid\S')

# Working copy restricted to the selected columns, in the order listed
selected_columns = []
for column_group in (general_info, student_info, literacy1, literacy2, literacy3,
                     literacy4_ut, literacy4_tt, literacy5_ut, literacy5_tt,
                     literacy6, literacy7, literacy8, literacy9a, literacy9b):
    selected_columns.extend(column_group)
up_literacy = up_raw_literacy[selected_columns].copy()

# Recode gender: '0' or 0 becomes 'Male'.
# NOTE(review): every other value, including missing ones, becomes 'Female' - confirm intended.
up_literacy.loc[:, 'student_gender'] = up_literacy.loc[:, 'student_gender'].apply(
    lambda code: 'Male' if code in ('0', 0) else 'Female'
)

<h3>
    Literacy 1: Listening Comprehension
</h3>

In [31]:
# Extract other responses to listening comprehension questions:
# one sheet per "other response" column, listing each distinct response and its frequency.
literacy1_or = [col for col in up_literacy.columns if re.search(r'^literacy1_\S*or$', col)]
with pd.ExcelWriter('up_literacy1_other_responses_vf.xlsx') as writer:
    for col in literacy1_or:
        # Name the index and the counts explicitly: the default column names produced by
        # value_counts().reset_index() differ between pandas versions.
        response_counts = (
            up_literacy[col]
            .value_counts()
            .rename_axis('Response')
            .reset_index(name='Frequency')
        )
        response_counts.to_excel(writer, sheet_name=col)

In [33]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then normalise each item to string codes.
# NaN, 'UNDEFINED', 'SKIPPED' and '.' are all recoded to '999'.
literacy1_raw = [col for col in literacy1 if re.search(r'\d$', col)]
for col in literacy1_raw:
    print(f"Unique values in {col} = {up_literacy[col].unique()}")
    print(f"No. of NaN values in {col} = {up_literacy.loc[:, col].isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {up_literacy[up_literacy[col] == 'UNDEFINED'].shape[0]}")
    print(f"No. of SKIPPED values in {col} = {up_literacy[up_literacy[col] == 'SKIPPED'].shape[0]}\n")
    # Build the cleaned Series and assign it back explicitly: in-place methods called on a
    # `.loc` selection can silently act on a temporary copy, and `astype` returns a new Series.
    cleaned = up_literacy[col].fillna('999')
    for raw_value, code in (
        ('UNDEFINED', '999'),
        ('SKIPPED', '999'),
        ('.', '999'),
        (88, '88'),
        (1, '1'),
        (0, '0'),
    ):
        cleaned = cleaned.replace(raw_value, code)
    up_literacy[col] = cleaned.astype('str')

Unique values in literacy1_q1 = ['88' '1' '0']
No. of NaN values in literacy1_q1 = 0
No. of UNDEFINED values in literacy1_q1 = 0
No. of SKIPPED values in literacy1_q1 = 0

Unique values in literacy1_q2 = [88 1 0 '1' '0' '88']
No. of NaN values in literacy1_q2 = 0
No. of UNDEFINED values in literacy1_q2 = 0
No. of SKIPPED values in literacy1_q2 = 0

Unique values in literacy1_q3 = [88 1 0 '88' '1' '0']
No. of NaN values in literacy1_q3 = 0
No. of UNDEFINED values in literacy1_q3 = 0
No. of SKIPPED values in literacy1_q3 = 0

Unique values in literacy1_q4 = [88 1 0 '0' '1' '88']
No. of NaN values in literacy1_q4 = 0
No. of UNDEFINED values in literacy1_q4 = 0
No. of SKIPPED values in literacy1_q4 = 0



<h3>
    Literacy 2: Oral Vocabulary
</h3>

In [34]:
# Extract other responses to oral vocabulary questions:
# one sheet per "other response" column, listing each distinct response and its frequency.
literacy2_or = [col for col in up_literacy.columns if re.search(r'literacy2\S*or$', col)]
with pd.ExcelWriter('up_literacy2_other_responses_vf.xlsx') as writer:
    for col in literacy2_or:
        # Name the index and the counts explicitly: the default column names produced by
        # value_counts().reset_index() differ between pandas versions.
        response_counts = (
            up_literacy[col]
            .value_counts()
            .rename_axis('Response')
            .reset_index(name='Frequency')
        )
        response_counts.to_excel(writer, sheet_name=col)

In [35]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then normalise each item to string codes.
# NaN, 'UNDEFINED', 'SKIPPED' and '.' are all recoded to '999'.
literacy2_raw = [col for col in literacy2 if re.search(r'\d$', col)]
for col in literacy2_raw:
    print(f"Unique values in {col} = {up_literacy[col].unique()}")
    print(f"No. of NaN values in {col} = {up_literacy.loc[:, col].isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {up_literacy[up_literacy[col] == 'UNDEFINED'].shape[0]}")
    print(f"No. of SKIPPED values in {col} = {up_literacy[up_literacy[col] == 'SKIPPED'].shape[0]}\n")
    # Build the cleaned Series and assign it back explicitly: in-place methods called on a
    # `.loc` selection can silently act on a temporary copy, and `astype` returns a new Series.
    cleaned = up_literacy[col].fillna('999')
    for raw_value, code in (
        ('UNDEFINED', '999'),
        ('SKIPPED', '999'),
        ('.', '999'),
        (88, '88'),
        (1, '1'),
        (0, '0'),
    ):
        cleaned = cleaned.replace(raw_value, code)
    up_literacy[col] = cleaned.astype('str')

Unique values in literacy2_q1 = [1 88 0 '1' '0' '88']
No. of NaN values in literacy2_q1 = 0
No. of UNDEFINED values in literacy2_q1 = 0
No. of SKIPPED values in literacy2_q1 = 0

Unique values in literacy2_q2 = [1 0 88 '1' '0' '88']
No. of NaN values in literacy2_q2 = 0
No. of UNDEFINED values in literacy2_q2 = 0
No. of SKIPPED values in literacy2_q2 = 0

Unique values in literacy2_q3 = [1 88 0 '1' '88' '0']
No. of NaN values in literacy2_q3 = 0
No. of UNDEFINED values in literacy2_q3 = 0
No. of SKIPPED values in literacy2_q3 = 0

Unique values in literacy2_q4 = [1 88 0 '1' '0' '88']
No. of NaN values in literacy2_q4 = 0
No. of UNDEFINED values in literacy2_q4 = 0
No. of SKIPPED values in literacy2_q4 = 0

Unique values in literacy2_q5 = [1 88 0 '1' '88' '0']
No. of NaN values in literacy2_q5 = 0
No. of UNDEFINED values in literacy2_q5 = 0
No. of SKIPPED values in literacy2_q5 = 0

Unique values in literacy2_q6 = [1 88 0 '88' '1' '0']
No. of NaN values in literacy2_q6 = 0
No. of UNDEFI

<h3>
    Literacy 3: Initial Sound Identification
</h3>

In [36]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then normalise each column to string codes.
# NaN, 'UNDEFINED', 'SKIPPED' and '.' are all recoded to '999'.
# Every literacy3 column is processed, not only the scored items.
for col in literacy3:
    print(f"Unique values in {col} = {up_literacy[col].unique()}")
    print(f"No. of NaN values in {col} = {up_literacy.loc[:, col].isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {up_literacy[up_literacy[col] == 'UNDEFINED'].shape[0]}")
    print(f"No. of SKIPPED values in {col} = {up_literacy[up_literacy[col] == 'SKIPPED'].shape[0]}\n")
    # Build the cleaned Series and assign it back explicitly: in-place methods called on a
    # `.loc` selection can silently act on a temporary copy, and `astype` returns a new Series.
    cleaned = up_literacy[col].fillna('999')
    for raw_value, code in (
        ('UNDEFINED', '999'),
        ('SKIPPED', '999'),
        ('.', '999'),
        (88, '88'),
        (1, '1'),
        (0, '0'),
    ):
        cleaned = cleaned.replace(raw_value, code)
    up_literacy[col] = cleaned.astype('str')

Unique values in literacy3_p_q1 = [1 0 '0' '1']
No. of NaN values in literacy3_p_q1 = 0
No. of UNDEFINED values in literacy3_p_q1 = 0
No. of SKIPPED values in literacy3_p_q1 = 0

Unique values in literacy3_p_q2 = [0 1 '0' '1']
No. of NaN values in literacy3_p_q2 = 0
No. of UNDEFINED values in literacy3_p_q2 = 0
No. of SKIPPED values in literacy3_p_q2 = 0

Unique values in literacy3_end = [0 '0']
No. of NaN values in literacy3_end = 0
No. of UNDEFINED values in literacy3_end = 0
No. of SKIPPED values in literacy3_end = 0

Unique values in literacy3_q1 = [0 1 88 '88' '0' '1']
No. of NaN values in literacy3_q1 = 0
No. of UNDEFINED values in literacy3_q1 = 0
No. of SKIPPED values in literacy3_q1 = 0

Unique values in literacy3_q2 = [0 1 88 '88' '0' '1']
No. of NaN values in literacy3_q2 = 0
No. of UNDEFINED values in literacy3_q2 = 0
No. of SKIPPED values in literacy3_q2 = 0

Unique values in literacy3_q3 = [1 88 0 '88' '0' '1']
No. of NaN values in literacy3_q3 = 0
No. of UNDEFINED values

<h3>
    Literacy 4: Letter Recognition (Untimed)
</h3>

In [37]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then normalise each grid item to string codes.
# NaN, 'UNDEFINED', 'SKIPPED' and '.' are all recoded to '999'.
literacy4_ut_raw = [col for col in literacy4_ut if re.search(r'literacy4_ut_grid_\d*$', col)]
for col in literacy4_ut_raw:
    print(f"Unique values in {col} = {up_literacy[col].unique()}")
    print(f"No. of NaN values in {col} = {up_literacy.loc[:, col].isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {up_literacy[up_literacy[col] == 'UNDEFINED'].shape[0]}")
    print(f"No. of SKIPPED values in {col} = {up_literacy[up_literacy[col] == 'SKIPPED'].shape[0]}\n")
    # Build the cleaned Series and assign it back explicitly: in-place methods called on a
    # `.loc` selection can silently act on a temporary copy, and `astype` returns a new Series.
    cleaned = up_literacy[col].fillna('999')
    for raw_value, code in (
        ('UNDEFINED', '999'),
        ('SKIPPED', '999'),
        ('.', '999'),
        (88, '88'),
        (1, '1'),
        (0, '0'),
    ):
        cleaned = cleaned.replace(raw_value, code)
    up_literacy[col] = cleaned.astype('str')

Unique values in literacy4_ut_grid_1 = [1 0 '0' '1']
No. of NaN values in literacy4_ut_grid_1 = 0
No. of UNDEFINED values in literacy4_ut_grid_1 = 0
No. of SKIPPED values in literacy4_ut_grid_1 = 0

Unique values in literacy4_ut_grid_2 = ['1' '0' '.']
No. of NaN values in literacy4_ut_grid_2 = 0
No. of UNDEFINED values in literacy4_ut_grid_2 = 0
No. of SKIPPED values in literacy4_ut_grid_2 = 0

Unique values in literacy4_ut_grid_3 = ['1' '0' '.']
No. of NaN values in literacy4_ut_grid_3 = 0
No. of UNDEFINED values in literacy4_ut_grid_3 = 0
No. of SKIPPED values in literacy4_ut_grid_3 = 0

Unique values in literacy4_ut_grid_4 = ['1' '0' '.']
No. of NaN values in literacy4_ut_grid_4 = 0
No. of UNDEFINED values in literacy4_ut_grid_4 = 0
No. of SKIPPED values in literacy4_ut_grid_4 = 0

Unique values in literacy4_ut_grid_5 = ['1' '.' '0']
No. of NaN values in literacy4_ut_grid_5 = 0
No. of UNDEFINED values in literacy4_ut_grid_5 = 0
No. of SKIPPED values in literacy4_ut_grid_5 = 0

Uniqu

In [38]:
# Apply fix_score to each row's untimed letter-recognition items (items after the first
# run of four consecutive zeros become '999') and write the corrected values back.
literacy4_ut_fixed = up_literacy.apply(
    lambda row: fix_score([row[item] for item in literacy4_ut_raw]),
    axis=1,
).to_list()
up_literacy.loc[:, literacy4_ut_raw] = pd.DataFrame(
    literacy4_ut_fixed,
    index=up_literacy.index,
    columns=literacy4_ut_raw,
)

<h3>
    Literacy 4: Letter Recognition (Timed)
</h3>

In [39]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then normalise each grid item to string codes.
# NaN, 'UNDEFINED', 'SKIPPED' and '.' are all recoded to '999'.
literacy4_tt_raw = [col for col in literacy4_tt if re.search(r'literacy4_tt_grid_\d*$', col)]
for col in literacy4_tt_raw:
    print(f"Unique values in {col} = {up_literacy[col].unique()}")
    print(f"No. of NaN values in {col} = {up_literacy.loc[:, col].isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {up_literacy[up_literacy[col] == 'UNDEFINED'].shape[0]}")
    print(f"No. of SKIPPED values in {col} = {up_literacy[up_literacy[col] == 'SKIPPED'].shape[0]}\n")
    # Build the cleaned Series and assign it back explicitly: in-place methods called on a
    # `.loc` selection can silently act on a temporary copy, and `astype` returns a new Series.
    cleaned = up_literacy[col].fillna('999')
    for raw_value, code in (
        ('UNDEFINED', '999'),
        ('SKIPPED', '999'),
        ('.', '999'),
        (88, '88'),
        (1, '1'),
        (0, '0'),
    ):
        cleaned = cleaned.replace(raw_value, code)
    up_literacy[col] = cleaned.astype('str')

Unique values in literacy4_tt_grid_1 = [1 0 '0' '1']
No. of NaN values in literacy4_tt_grid_1 = 0
No. of UNDEFINED values in literacy4_tt_grid_1 = 0
No. of SKIPPED values in literacy4_tt_grid_1 = 0

Unique values in literacy4_tt_grid_2 = [1 0 '0' '1' '.']
No. of NaN values in literacy4_tt_grid_2 = 0
No. of UNDEFINED values in literacy4_tt_grid_2 = 0
No. of SKIPPED values in literacy4_tt_grid_2 = 0

Unique values in literacy4_tt_grid_3 = [1 0 '0' '1' '.']
No. of NaN values in literacy4_tt_grid_3 = 0
No. of UNDEFINED values in literacy4_tt_grid_3 = 0
No. of SKIPPED values in literacy4_tt_grid_3 = 0

Unique values in literacy4_tt_grid_4 = [1 0 '0' '1' '.']
No. of NaN values in literacy4_tt_grid_4 = 0
No. of UNDEFINED values in literacy4_tt_grid_4 = 0
No. of SKIPPED values in literacy4_tt_grid_4 = 0

Unique values in literacy4_tt_grid_5 = ['1' '0' '.']
No. of NaN values in literacy4_tt_grid_5 = 0
No. of UNDEFINED values in literacy4_tt_grid_5 = 0
No. of SKIPPED values in literacy4_tt_grid_

No. of UNDEFINED values in literacy4_tt_grid_56 = 0
No. of SKIPPED values in literacy4_tt_grid_56 = 0

Unique values in literacy4_tt_grid_57 = ['.' '1' '0']
No. of NaN values in literacy4_tt_grid_57 = 0
No. of UNDEFINED values in literacy4_tt_grid_57 = 0
No. of SKIPPED values in literacy4_tt_grid_57 = 0

Unique values in literacy4_tt_grid_58 = ['.' '1' '0']
No. of NaN values in literacy4_tt_grid_58 = 0
No. of UNDEFINED values in literacy4_tt_grid_58 = 0
No. of SKIPPED values in literacy4_tt_grid_58 = 0

Unique values in literacy4_tt_grid_59 = ['.' '0' '1']
No. of NaN values in literacy4_tt_grid_59 = 0
No. of UNDEFINED values in literacy4_tt_grid_59 = 0
No. of SKIPPED values in literacy4_tt_grid_59 = 0

Unique values in literacy4_tt_grid_60 = ['.' '1' '0']
No. of NaN values in literacy4_tt_grid_60 = 0
No. of UNDEFINED values in literacy4_tt_grid_60 = 0
No. of SKIPPED values in literacy4_tt_grid_60 = 0

Unique values in literacy4_tt_grid_61 = ['.' '1' '0']
No. of NaN values in literacy4_

In [40]:
# Apply fix_score to each row's timed letter-recognition items (items after the first
# run of four consecutive zeros become '999') and write the corrected values back.
literacy4_tt_fixed = up_literacy.apply(
    lambda row: fix_score([row[item] for item in literacy4_tt_raw]),
    axis=1,
).to_list()
up_literacy.loc[:, literacy4_tt_raw] = pd.DataFrame(
    literacy4_tt_fixed,
    index=up_literacy.index,
    columns=literacy4_tt_raw,
)

<h3>
    Literacy 5: Familiar Words Reading (Untimed)
</h3>

In [41]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then normalise each grid item to string codes.
# NaN, 'UNDEFINED', 'SKIPPED' and '.' are all recoded to '999'.
literacy5_ut_raw = [col for col in literacy5_ut if re.search(r'literacy5_ut_grid_\d*$', col)]
for col in literacy5_ut_raw:
    print(f"Unique values in {col} = {up_literacy[col].unique()}")
    print(f"No. of NaN values in {col} = {up_literacy.loc[:, col].isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {up_literacy[up_literacy[col] == 'UNDEFINED'].shape[0]}")
    print(f"No. of SKIPPED values in {col} = {up_literacy[up_literacy[col] == 'SKIPPED'].shape[0]}\n")
    # Build the cleaned Series and assign it back explicitly: in-place methods called on a
    # `.loc` selection can silently act on a temporary copy, and `astype` returns a new Series.
    cleaned = up_literacy[col].fillna('999')
    for raw_value, code in (
        ('UNDEFINED', '999'),
        ('SKIPPED', '999'),
        ('.', '999'),
        (88, '88'),
        (1, '1'),
        (0, '0'),
    ):
        cleaned = cleaned.replace(raw_value, code)
    up_literacy[col] = cleaned.astype('str')

Unique values in literacy5_ut_grid_1 = [0 1 '0' '1']
No. of NaN values in literacy5_ut_grid_1 = 0
No. of UNDEFINED values in literacy5_ut_grid_1 = 0
No. of SKIPPED values in literacy5_ut_grid_1 = 0

Unique values in literacy5_ut_grid_2 = [1 0 '0' '1']
No. of NaN values in literacy5_ut_grid_2 = 0
No. of UNDEFINED values in literacy5_ut_grid_2 = 0
No. of SKIPPED values in literacy5_ut_grid_2 = 0

Unique values in literacy5_ut_grid_3 = [0 1 '0' '1']
No. of NaN values in literacy5_ut_grid_3 = 0
No. of UNDEFINED values in literacy5_ut_grid_3 = 0
No. of SKIPPED values in literacy5_ut_grid_3 = 0

Unique values in literacy5_ut_grid_4 = ['0' '1' '.']
No. of NaN values in literacy5_ut_grid_4 = 0
No. of UNDEFINED values in literacy5_ut_grid_4 = 0
No. of SKIPPED values in literacy5_ut_grid_4 = 0

Unique values in literacy5_ut_grid_5 = ['0' '1' '.']
No. of NaN values in literacy5_ut_grid_5 = 0
No. of UNDEFINED values in literacy5_ut_grid_5 = 0
No. of SKIPPED values in literacy5_ut_grid_5 = 0

Uniqu

In [42]:
# Apply fix_score to each row's untimed familiar-word items (items after the first
# run of four consecutive zeros become '999') and write the corrected values back.
literacy5_ut_fixed = up_literacy.apply(
    lambda row: fix_score([row[item] for item in literacy5_ut_raw]),
    axis=1,
).to_list()
up_literacy.loc[:, literacy5_ut_raw] = pd.DataFrame(
    literacy5_ut_fixed,
    index=up_literacy.index,
    columns=literacy5_ut_raw,
)

<h3>
    Literacy 5: Familiar Words Reading (Timed)
</h3>

In [43]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then normalise each grid item to string codes.
# NaN, 'UNDEFINED', 'SKIPPED' and '.' are all recoded to '999'.
literacy5_tt_raw = [col for col in literacy5_tt if re.search(r'literacy5_tt_grid_\d*$', col)]
for col in literacy5_tt_raw:
    print(f"Unique values in {col} = {up_literacy[col].unique()}")
    print(f"No. of NaN values in {col} = {up_literacy.loc[:, col].isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {up_literacy[up_literacy[col] == 'UNDEFINED'].shape[0]}")
    print(f"No. of SKIPPED values in {col} = {up_literacy[up_literacy[col] == 'SKIPPED'].shape[0]}\n")
    # Build the cleaned Series and assign it back explicitly: in-place methods called on a
    # `.loc` selection can silently act on a temporary copy, and `astype` returns a new Series.
    cleaned = up_literacy[col].fillna('999')
    for raw_value, code in (
        ('UNDEFINED', '999'),
        ('SKIPPED', '999'),
        ('.', '999'),
        (88, '88'),
        (1, '1'),
        (0, '0'),
    ):
        cleaned = cleaned.replace(raw_value, code)
    up_literacy[col] = cleaned.astype('str')

Unique values in literacy5_tt_grid_1 = ['1' '0' 'SKIPPED']
No. of NaN values in literacy5_tt_grid_1 = 0
No. of UNDEFINED values in literacy5_tt_grid_1 = 0
No. of SKIPPED values in literacy5_tt_grid_1 = 7

Unique values in literacy5_tt_grid_2 = ['1' '0' 'SKIPPED' '.']
No. of NaN values in literacy5_tt_grid_2 = 0
No. of UNDEFINED values in literacy5_tt_grid_2 = 0
No. of SKIPPED values in literacy5_tt_grid_2 = 7

Unique values in literacy5_tt_grid_3 = ['1' '0' 'SKIPPED' '.']
No. of NaN values in literacy5_tt_grid_3 = 0
No. of UNDEFINED values in literacy5_tt_grid_3 = 0
No. of SKIPPED values in literacy5_tt_grid_3 = 7

Unique values in literacy5_tt_grid_4 = ['0' '1' 'SKIPPED' '.']
No. of NaN values in literacy5_tt_grid_4 = 0
No. of UNDEFINED values in literacy5_tt_grid_4 = 0
No. of SKIPPED values in literacy5_tt_grid_4 = 7

Unique values in literacy5_tt_grid_5 = ['1' '.' '0' 'SKIPPED']
No. of NaN values in literacy5_tt_grid_5 = 0
No. of UNDEFINED values in literacy5_tt_grid_5 = 0
No. of SK

No. of SKIPPED values in literacy5_tt_grid_49 = 7

Unique values in literacy5_tt_grid_50 = ['.' 'SKIPPED' '1' '0']
No. of NaN values in literacy5_tt_grid_50 = 0
No. of UNDEFINED values in literacy5_tt_grid_50 = 0
No. of SKIPPED values in literacy5_tt_grid_50 = 7



In [44]:
# Apply fix_score to each row's timed familiar-word items (items after the first
# run of four consecutive zeros become '999') and write the corrected values back.
literacy5_tt_fixed = up_literacy.apply(
    lambda row: fix_score([row[item] for item in literacy5_tt_raw]),
    axis=1,
).to_list()
up_literacy.loc[:, literacy5_tt_raw] = pd.DataFrame(
    literacy5_tt_fixed,
    index=up_literacy.index,
    columns=literacy5_tt_raw,
)

<h3>
    Literacy 6: Non-word Reading
</h3>

In [45]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then normalise each grid item to string codes.
# NaN, 'UNDEFINED', 'SKIPPED' and '.' are all recoded to '999'.
# NOTE(review): unlike the literacy4/5 cells, this pattern has no trailing `$` and scans all
# of `up_raw.columns`; confirm it cannot pick up non-item columns.
literacy6_raw = [col for col in up_raw.columns if re.search(r'literacy6_tt_grid_\d*', col)]
for col in literacy6_raw:
    print(f"Unique values in {col} = {up_literacy[col].unique()}")
    print(f"No. of NaN values in {col} = {up_literacy.loc[:, col].isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {up_literacy[up_literacy[col] == 'UNDEFINED'].shape[0]}")
    print(f"No. of SKIPPED values in {col} = {up_literacy[up_literacy[col] == 'SKIPPED'].shape[0]}\n")
    # Build the cleaned Series and assign it back explicitly: in-place methods called on a
    # `.loc` selection can silently act on a temporary copy, and `astype` returns a new Series.
    cleaned = up_literacy[col].fillna('999')
    for raw_value, code in (
        ('UNDEFINED', '999'),
        ('SKIPPED', '999'),
        ('.', '999'),
        (88, '88'),
        (1, '1'),
        (0, '0'),
    ):
        cleaned = cleaned.replace(raw_value, code)
    up_literacy[col] = cleaned.astype('str')

Unique values in literacy6_tt_grid_1 = ['0' '1' 'SKIPPED']
No. of NaN values in literacy6_tt_grid_1 = 0
No. of UNDEFINED values in literacy6_tt_grid_1 = 0
No. of SKIPPED values in literacy6_tt_grid_1 = 7

Unique values in literacy6_tt_grid_2 = ['0' '1' 'SKIPPED' '.']
No. of NaN values in literacy6_tt_grid_2 = 0
No. of UNDEFINED values in literacy6_tt_grid_2 = 0
No. of SKIPPED values in literacy6_tt_grid_2 = 7

Unique values in literacy6_tt_grid_3 = ['0' '1' 'SKIPPED' '.']
No. of NaN values in literacy6_tt_grid_3 = 0
No. of UNDEFINED values in literacy6_tt_grid_3 = 0
No. of SKIPPED values in literacy6_tt_grid_3 = 7

Unique values in literacy6_tt_grid_4 = ['1' '0' 'SKIPPED' '.']
No. of NaN values in literacy6_tt_grid_4 = 0
No. of UNDEFINED values in literacy6_tt_grid_4 = 0
No. of SKIPPED values in literacy6_tt_grid_4 = 7

Unique values in literacy6_tt_grid_5 = ['1' '.' '0' 'SKIPPED']
No. of NaN values in literacy6_tt_grid_5 = 0
No. of UNDEFINED values in literacy6_tt_grid_5 = 0
No. of SK

Unique values in literacy6_tt_grid_48 = ['.' 'SKIPPED' '1']
No. of NaN values in literacy6_tt_grid_48 = 0
No. of UNDEFINED values in literacy6_tt_grid_48 = 0
No. of SKIPPED values in literacy6_tt_grid_48 = 7

Unique values in literacy6_tt_grid_49 = ['.' 'SKIPPED' '0' '1']
No. of NaN values in literacy6_tt_grid_49 = 0
No. of UNDEFINED values in literacy6_tt_grid_49 = 0
No. of SKIPPED values in literacy6_tt_grid_49 = 7

Unique values in literacy6_tt_grid_50 = ['.' 'SKIPPED' '1']
No. of NaN values in literacy6_tt_grid_50 = 0
No. of UNDEFINED values in literacy6_tt_grid_50 = 0
No. of SKIPPED values in literacy6_tt_grid_50 = 7



In [46]:
# Apply fix_score to each row's non-word reading items (items after the first
# run of four consecutive zeros become '999') and write the corrected values back.
literacy6_fixed = up_literacy.apply(
    lambda row: fix_score([row[item] for item in literacy6_raw]),
    axis=1,
).to_list()
up_literacy.loc[:, literacy6_raw] = pd.DataFrame(
    literacy6_fixed,
    index=up_literacy.index,
    columns=literacy6_raw,
)

<h3>
    Literacy 7: Oral Reading Fluency (Timed)
</h3>

In [47]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then normalise each grid item to string codes.
# NaN, 'UNDEFINED', 'SKIPPED' and '.' are all recoded to '999'.
# NOTE(review): unlike the literacy4/5 cells, this pattern has no trailing `$` and scans all
# of `up_raw.columns`; confirm it cannot pick up non-item columns.
literacy7_raw = [col for col in up_raw.columns if re.search(r'literacy7_tt_grid_\d*', col)]
for col in literacy7_raw:
    print(f"Unique values in {col} = {up_literacy[col].unique()}")
    print(f"No. of NaN values in {col} = {up_literacy.loc[:, col].isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {up_literacy[up_literacy[col] == 'UNDEFINED'].shape[0]}")
    print(f"No. of SKIPPED values in {col} = {up_literacy[up_literacy[col] == 'SKIPPED'].shape[0]}\n")
    # Build the cleaned Series and assign it back explicitly: in-place methods called on a
    # `.loc` selection can silently act on a temporary copy, and `astype` returns a new Series.
    cleaned = up_literacy[col].fillna('999')
    for raw_value, code in (
        ('UNDEFINED', '999'),
        ('SKIPPED', '999'),
        ('.', '999'),
        (88, '88'),
        (1, '1'),
        (0, '0'),
    ):
        cleaned = cleaned.replace(raw_value, code)
    up_literacy[col] = cleaned.astype('str')

Unique values in literacy7_tt_grid_1 = [0 1 '0' '1']
No. of NaN values in literacy7_tt_grid_1 = 0
No. of UNDEFINED values in literacy7_tt_grid_1 = 0
No. of SKIPPED values in literacy7_tt_grid_1 = 0

Unique values in literacy7_tt_grid_2 = [0 1 '0' '1' '.']
No. of NaN values in literacy7_tt_grid_2 = 0
No. of UNDEFINED values in literacy7_tt_grid_2 = 0
No. of SKIPPED values in literacy7_tt_grid_2 = 0

Unique values in literacy7_tt_grid_3 = [0 1 '0' '1' '.']
No. of NaN values in literacy7_tt_grid_3 = 0
No. of UNDEFINED values in literacy7_tt_grid_3 = 0
No. of SKIPPED values in literacy7_tt_grid_3 = 0

Unique values in literacy7_tt_grid_4 = [0 1 '0' '1' '.']
No. of NaN values in literacy7_tt_grid_4 = 0
No. of UNDEFINED values in literacy7_tt_grid_4 = 0
No. of SKIPPED values in literacy7_tt_grid_4 = 0

Unique values in literacy7_tt_grid_5 = ['.' '0' '1']
No. of NaN values in literacy7_tt_grid_5 = 0
No. of UNDEFINED values in literacy7_tt_grid_5 = 0
No. of SKIPPED values in literacy7_tt_grid_

In [48]:
# Apply fix_score to each row's oral reading fluency items (items after the first
# run of four consecutive zeros become '999') and write the corrected values back.
literacy7_fixed = up_literacy.apply(
    lambda row: fix_score([row[item] for item in literacy7_raw]),
    axis=1,
).to_list()
up_literacy.loc[:, literacy7_raw] = pd.DataFrame(
    literacy7_fixed,
    index=up_literacy.index,
    columns=literacy7_raw,
)

<h3>
    Literacy 8: Reading Comprehension (Untimed)
</h3>

In [49]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then normalise each grid item to string codes.
# NaN, 'UNDEFINED', 'SKIPPED' and '.' are all recoded to '999'.
# NOTE(review): unlike the literacy4/5 cells, this pattern has no trailing `$` and scans all
# of `up_raw.columns`; confirm it cannot pick up non-item columns.
literacy8_raw = [col for col in up_raw.columns if re.search(r'literacy8_ut_grid_\d*', col)]
for col in literacy8_raw:
    print(f"Unique values in {col} = {up_literacy[col].unique()}")
    print(f"No. of NaN values in {col} = {up_literacy.loc[:, col].isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {up_literacy[up_literacy[col] == 'UNDEFINED'].shape[0]}")
    print(f"No. of SKIPPED values in {col} = {up_literacy[up_literacy[col] == 'SKIPPED'].shape[0]}\n")
    # Build the cleaned Series and assign it back explicitly: in-place methods called on a
    # `.loc` selection can silently act on a temporary copy, and `astype` returns a new Series.
    cleaned = up_literacy[col].fillna('999')
    for raw_value, code in (
        ('UNDEFINED', '999'),
        ('SKIPPED', '999'),
        ('.', '999'),
        (88, '88'),
        (1, '1'),
        (0, '0'),
    ):
        cleaned = cleaned.replace(raw_value, code)
    up_literacy[col] = cleaned.astype('str')

Unique values in literacy8_ut_grid_1 = ['0' '1' 'SKIPPED']
No. of NaN values in literacy8_ut_grid_1 = 0
No. of UNDEFINED values in literacy8_ut_grid_1 = 0
No. of SKIPPED values in literacy8_ut_grid_1 = 7

Unique values in literacy8_ut_grid_2 = ['0' '1' 'SKIPPED']
No. of NaN values in literacy8_ut_grid_2 = 0
No. of UNDEFINED values in literacy8_ut_grid_2 = 0
No. of SKIPPED values in literacy8_ut_grid_2 = 7

Unique values in literacy8_ut_grid_3 = ['0' '1' 'SKIPPED' '.']
No. of NaN values in literacy8_ut_grid_3 = 0
No. of UNDEFINED values in literacy8_ut_grid_3 = 0
No. of SKIPPED values in literacy8_ut_grid_3 = 7

Unique values in literacy8_ut_grid_4 = ['0' '1' 'SKIPPED' '.']
No. of NaN values in literacy8_ut_grid_4 = 0
No. of UNDEFINED values in literacy8_ut_grid_4 = 0
No. of SKIPPED values in literacy8_ut_grid_4 = 7

Unique values in literacy8_ut_grid_5 = ['.' '0' '1' 'SKIPPED']
No. of NaN values in literacy8_ut_grid_5 = 0
No. of UNDEFINED values in literacy8_ut_grid_5 = 0
No. of SKIPPE

In [50]:
# Apply fix_score to each row's reading comprehension items (items after the first
# run of four consecutive zeros become '999') and write the corrected values back.
literacy8_fixed = up_literacy.apply(
    lambda row: fix_score([row[item] for item in literacy8_raw]),
    axis=1,
).to_list()
up_literacy.loc[:, literacy8_raw] = pd.DataFrame(
    literacy8_fixed,
    index=up_literacy.index,
    columns=literacy8_raw,
)

<h3>
    Literacy 9a: Dictation (Letters)
</h3>

In [51]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then normalise each grid item to string codes.
# NaN, 'UNDEFINED', 'SKIPPED' and '.' are all recoded to '999'.
# NOTE(review): unlike the literacy4/5 cells, this pattern has no trailing `$` and scans all
# of `up_raw.columns`; confirm it cannot pick up non-item columns.
literacy9a_raw = [col for col in up_raw.columns if re.search(r'literacy9a_ut_grid_\d*', col)]
for col in literacy9a_raw:
    print(f"Unique values in {col} = {up_literacy[col].unique()}")
    print(f"No. of NaN values in {col} = {up_literacy.loc[:, col].isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {up_literacy[up_literacy[col] == 'UNDEFINED'].shape[0]}")
    print(f"No. of SKIPPED values in {col} = {up_literacy[up_literacy[col] == 'SKIPPED'].shape[0]}\n")
    # Build the cleaned Series and assign it back explicitly: in-place methods called on a
    # `.loc` selection can silently act on a temporary copy, and `astype` returns a new Series.
    cleaned = up_literacy[col].fillna('999')
    for raw_value, code in (
        ('UNDEFINED', '999'),
        ('SKIPPED', '999'),
        ('.', '999'),
        (88, '88'),
        (1, '1'),
        (0, '0'),
    ):
        cleaned = cleaned.replace(raw_value, code)
    up_literacy[col] = cleaned.astype('str')

Unique values in literacy9a_ut_grid_1 = [1 0 '0' '1']
No. of NaN values in literacy9a_ut_grid_1 = 0
No. of UNDEFINED values in literacy9a_ut_grid_1 = 0
No. of SKIPPED values in literacy9a_ut_grid_1 = 0

Unique values in literacy9a_ut_grid_2 = ['1' '0' '.']
No. of NaN values in literacy9a_ut_grid_2 = 0
No. of UNDEFINED values in literacy9a_ut_grid_2 = 0
No. of SKIPPED values in literacy9a_ut_grid_2 = 0

Unique values in literacy9a_ut_grid_3 = ['0' '1' '.']
No. of NaN values in literacy9a_ut_grid_3 = 0
No. of UNDEFINED values in literacy9a_ut_grid_3 = 0
No. of SKIPPED values in literacy9a_ut_grid_3 = 0

Unique values in literacy9a_ut_grid_4 = ['0' '1' '.']
No. of NaN values in literacy9a_ut_grid_4 = 0
No. of UNDEFINED values in literacy9a_ut_grid_4 = 0
No. of SKIPPED values in literacy9a_ut_grid_4 = 0

Unique values in literacy9a_ut_grid_5 = ['1' '0' '.']
No. of NaN values in literacy9a_ut_grid_5 = 0
No. of UNDEFINED values in literacy9a_ut_grid_5 = 0
No. of SKIPPED values in literacy9a_

In [52]:
# Pass each row's Literacy 9a grid values through fix_score and store whatever it
# returns back into the same columns.
def _literacy9a_fixed_row(row):
    """Call fix_score on this row's Literacy 9a grid values, in column order."""
    return fix_score([row[col] for col in literacy9a_raw])

literacy9a_fixed = pd.DataFrame(
    up_literacy.apply(_literacy9a_fixed_row, axis=1).to_list(),
    index=up_literacy.index,
    columns=literacy9a_raw,
)
up_literacy.loc[:, literacy9a_raw] = literacy9a_fixed

<h3>
    Literacy 9b: Dictation (Words)
</h3>

In [53]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then recode every grid cell as a
# string: NaN, 'UNDEFINED', 'SKIPPED' and '.' all collapse onto the '999' code.
literacy9b_raw = [col for col in up_raw.columns if re.search(r'literacy9b_ut_grid_\d*', col)]
for col in literacy9b_raw:
    raw_values = up_literacy[col]
    print(f"Unique values in {col} = {raw_values.unique()}")
    print(f"No. of NaN values in {col} = {raw_values.isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {(raw_values == 'UNDEFINED').sum()}")
    print(f"No. of SKIPPED values in {col} = {(raw_values == 'SKIPPED').sum()}\n")
    # Build the recoded column and assign it back explicitly. Calling
    # fillna/replace with inplace=True on a column selected from the frame is
    # chained assignment (ineffective under pandas Copy-on-Write), and a bare
    # .astype('str') returns a new Series that would otherwise be thrown away.
    recoded = raw_values.fillna('999')
    for old, new in (('UNDEFINED', '999'), ('SKIPPED', '999'), ('.', '999'), (88, '88'), (1, '1'), (0, '0')):
        recoded = recoded.replace(old, new)
    up_literacy[col] = recoded.astype('str')

Unique values in literacy9b_ut_grid_1 = ['0' '1' 'SKIPPED']
No. of NaN values in literacy9b_ut_grid_1 = 0
No. of UNDEFINED values in literacy9b_ut_grid_1 = 0
No. of SKIPPED values in literacy9b_ut_grid_1 = 2234

Unique values in literacy9b_ut_grid_2 = ['0' '1' 'SKIPPED']
No. of NaN values in literacy9b_ut_grid_2 = 0
No. of UNDEFINED values in literacy9b_ut_grid_2 = 0
No. of SKIPPED values in literacy9b_ut_grid_2 = 2234

Unique values in literacy9b_ut_grid_3 = ['0' '1' 'SKIPPED']
No. of NaN values in literacy9b_ut_grid_3 = 0
No. of UNDEFINED values in literacy9b_ut_grid_3 = 0
No. of SKIPPED values in literacy9b_ut_grid_3 = 2234

Unique values in literacy9b_ut_grid_4 = ['0' '1' 'SKIPPED' '.']
No. of NaN values in literacy9b_ut_grid_4 = 0
No. of UNDEFINED values in literacy9b_ut_grid_4 = 0
No. of SKIPPED values in literacy9b_ut_grid_4 = 2234

Unique values in literacy9b_ut_grid_5 = ['.' '1' '0' 'SKIPPED']
No. of NaN values in literacy9b_ut_grid_5 = 0
No. of UNDEFINED values in literacy9b_

In [54]:
# Pass each row's Literacy 9b grid values through fix_score and store whatever it
# returns back into the same columns.
def _literacy9b_fixed_row(row):
    """Call fix_score on this row's Literacy 9b grid values, in column order."""
    return fix_score([row[col] for col in literacy9b_raw])

literacy9b_fixed = pd.DataFrame(
    up_literacy.apply(_literacy9b_fixed_row, axis=1).to_list(),
    index=up_literacy.index,
    columns=literacy9b_raw,
)
up_literacy.loc[:, literacy9b_raw] = literacy9b_fixed

In [55]:
# Write the literacy frame, as it stands after the cleaning cells, to an Excel workbook.
literacy_cleaned_path = "up_raw_literacy_cleaned_full.xlsx"
up_literacy.to_excel(literacy_cleaned_path)

<h2>
    B. Numeracy Sub-tasks Data Cleaning
</h2>

In [56]:
# Drop data with 'Yes' for child wants to stop numeracy assessment subtask
end_list = [col for col in up_raw.columns if re.search(r'^num\w*end$', col)]
total_numeracy_stopped = 0
up_raw_numeracy = up_raw.copy()
for col in end_list:
    # The merged raw data holds codes as a mix of int and str, so 'Yes' can be
    # stored as either 1 or '1'; comparing against '1' alone misses the ints.
    stopped_mask = (up_raw_numeracy[col] == '1') | (up_raw_numeracy[col] == 1)
    total_numeracy_stopped += int(stopped_mask.sum())
    # Filter with the boolean mask rather than drop(index): dropping by label
    # would also remove unrelated rows whenever an index label is duplicated.
    up_raw_numeracy = up_raw_numeracy[~stopped_mask]
print(f"Total no. of numeracy assessments stopped: {total_numeracy_stopped}")

# Drop data where a particular numeracy sub-task was disabled
numeracy_disabled_list = [
    'counting_timed_disabled', 'number_recognition_untimed_disabled', 'number_recognition_timed_disabled',
    'number_comparison_untimed_disabled', 'counting_in_bundles_untimed_disabled', 'missing_number_untimed_disabled',
    'addition_untimed_disabled', 'subtraction_untimed_disabled', 'word_problems_untimed_disabled',
    'shape_recognition_a_untimed_disabled', 'shape_recognition_b_untimed_disabled',
]

total_numeracy_disabled = 0
for col in numeracy_disabled_list:
    # Element-wise `== True` is kept on purpose: the flag column is not
    # guaranteed to have a pure boolean dtype, so `~`/truthiness is not safe.
    disabled_mask = up_raw_numeracy[col] == True
    total_numeracy_disabled += int(disabled_mask.sum())
    up_raw_numeracy = up_raw_numeracy[~disabled_mask]
print(f"Total no. of numeracy assessments disabled: {total_numeracy_disabled}")

# Snapshot of the rows that survive both filters.
up_raw_numeracy.to_excel('20221004_up_raw_numeracy.xlsx')

Total no. of numeracy assessments stopped: 0
Total no. of numeracy assessments disabled: 0


In [57]:
# Create dataframe to store sub-task total scores and percentages
general_info = ['_id', 'tabletUserName', 'assessment_date', 'school_details.State_label', 'school_details.District_label',
                'school_details.Block_label', 'school_details.School_label', 'school_details.UDISE_cd_label']

student_info = ['SI_std_name', 'student_age', 'student_gender']


def _numeracy_columns(pattern):
    """Return the up_raw_numeracy column names for which re.search(pattern, name) matches."""
    return [col for col in up_raw_numeracy.columns if re.search(pattern, col)]


# Column groups, one per numeracy sub-task (patterns unchanged from the original cell).
numeracy1 = _numeracy_columns(r'^numeracy1_tt\w*')
numeracy2_ut = _numeracy_columns(r'^numeracy2_ut\w*')
numeracy2_tt = _numeracy_columns(r'^numeracy2_tt\w*')
numeracy3 = _numeracy_columns(r'^numeracy3_ut\w*')
numeracy4 = _numeracy_columns(r'^numeracy4_ut\S*')
numeracy5 = _numeracy_columns(r'numeracy5_ut\S*')
numeracy6 = _numeracy_columns(r'numeracy6_ut\S*')
numeracy7 = _numeracy_columns(r'numeracy7_ut\S*')
numeracy8 = _numeracy_columns(r'numeracy8_ut\S*')
numeracy9a = _numeracy_columns(r'numeracy9a_ut\S')
numeracy9b = _numeracy_columns(r'numeracy9b_ut\S')

numeracy_columns = (
    general_info + student_info + numeracy1 + numeracy2_ut + numeracy2_tt + numeracy3 + numeracy4
    + numeracy5 + numeracy6 + numeracy7 + numeracy8 + numeracy9a + numeracy9b
)
# .copy() so that later column assignments act on an independent frame.
up_numeracy = up_raw_numeracy[numeracy_columns].copy()


def _gender_label(code):
    """Map a raw gender code to a label: 0 / '0' -> 'Male', any other recorded code -> 'Female'.

    Missing codes are returned unchanged so that an unrecorded gender is not
    silently counted as 'Female'.
    """
    if pd.isna(code):
        return code
    return 'Male' if code in (0, '0') else 'Female'


up_numeracy['student_gender'] = up_numeracy['student_gender'].apply(_gender_label)

<h3>
    Numeracy 1: Counting
</h3>

In [58]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then recode every grid cell as a
# string: NaN, 'UNDEFINED', 'SKIPPED' and '.' all collapse onto the '999' code.
numeracy1_raw = [col for col in numeracy1 if re.search(r'numeracy1_tt_grid_', col)]
for col in numeracy1_raw:
    raw_values = up_numeracy[col]
    print(f"Unique values in {col} = {raw_values.unique()}")
    print(f"No. of NaN values in {col} = {raw_values.isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {(raw_values == 'UNDEFINED').sum()}")
    print(f"No. of SKIPPED values in {col} = {(raw_values == 'SKIPPED').sum()}\n")
    # Build the recoded column and assign it back explicitly. Calling
    # fillna/replace with inplace=True on a column selected from the frame is
    # chained assignment (ineffective under pandas Copy-on-Write), and a bare
    # .astype('str') returns a new Series that would otherwise be thrown away.
    recoded = raw_values.fillna('999')
    for old, new in (('UNDEFINED', '999'), ('SKIPPED', '999'), ('.', '999'), (88, '88'), (1, '1'), (0, '0')):
        recoded = recoded.replace(old, new)
    up_numeracy[col] = recoded.astype('str')

Unique values in numeracy1_tt_grid_1 = [1 0 '1' '0']
No. of NaN values in numeracy1_tt_grid_1 = 0
No. of UNDEFINED values in numeracy1_tt_grid_1 = 0
No. of SKIPPED values in numeracy1_tt_grid_1 = 0

Unique values in numeracy1_tt_grid_2 = ['1' '.' '0']
No. of NaN values in numeracy1_tt_grid_2 = 0
No. of UNDEFINED values in numeracy1_tt_grid_2 = 0
No. of SKIPPED values in numeracy1_tt_grid_2 = 0

Unique values in numeracy1_tt_grid_3 = ['1' '.' '0']
No. of NaN values in numeracy1_tt_grid_3 = 0
No. of UNDEFINED values in numeracy1_tt_grid_3 = 0
No. of SKIPPED values in numeracy1_tt_grid_3 = 0

Unique values in numeracy1_tt_grid_4 = ['1' '0' '.']
No. of NaN values in numeracy1_tt_grid_4 = 0
No. of UNDEFINED values in numeracy1_tt_grid_4 = 0
No. of SKIPPED values in numeracy1_tt_grid_4 = 0

Unique values in numeracy1_tt_grid_5 = ['1' '0' '.']
No. of NaN values in numeracy1_tt_grid_5 = 0
No. of UNDEFINED values in numeracy1_tt_grid_5 = 0
No. of SKIPPED values in numeracy1_tt_grid_5 = 0

Uniqu

In [59]:
# Pass each row's Numeracy 1 grid values through fix_counting_score and store
# whatever it returns back into the same columns.
def _numeracy1_fixed_row(row):
    """Call fix_counting_score on this row's Numeracy 1 grid values, in column order."""
    return fix_counting_score([row[col] for col in numeracy1_raw])

numeracy1_fixed = pd.DataFrame(
    up_numeracy.apply(_numeracy1_fixed_row, axis=1).to_list(),
    index=up_numeracy.index,
    columns=numeracy1_raw,
)
up_numeracy.loc[:, numeracy1_raw] = numeracy1_fixed

<h3>
    Numeracy 2: Number Recognition (Untimed)
</h3>

In [60]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then recode every grid cell as a
# string: NaN, 'UNDEFINED', 'SKIPPED' and '.' all collapse onto the '999' code.
numeracy2_ut_raw = [col for col in numeracy2_ut if re.search(r'numeracy2_ut_grid_', col)]
for col in numeracy2_ut_raw:
    raw_values = up_numeracy[col]
    print(f"Unique values in {col} = {raw_values.unique()}")
    print(f"No. of NaN values in {col} = {raw_values.isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {(raw_values == 'UNDEFINED').sum()}")
    print(f"No. of SKIPPED values in {col} = {(raw_values == 'SKIPPED').sum()}\n")
    # Build the recoded column and assign it back explicitly. Calling
    # fillna/replace with inplace=True on a column selected from the frame is
    # chained assignment (ineffective under pandas Copy-on-Write), and a bare
    # .astype('str') returns a new Series that would otherwise be thrown away.
    recoded = raw_values.fillna('999')
    for old, new in (('UNDEFINED', '999'), ('SKIPPED', '999'), ('.', '999'), (88, '88'), (1, '1'), (0, '0')):
        recoded = recoded.replace(old, new)
    up_numeracy[col] = recoded.astype('str')

Unique values in numeracy2_ut_grid_1 = [1 0 '0' '1']
No. of NaN values in numeracy2_ut_grid_1 = 0
No. of UNDEFINED values in numeracy2_ut_grid_1 = 0
No. of SKIPPED values in numeracy2_ut_grid_1 = 0

Unique values in numeracy2_ut_grid_2 = [0 1 '0' '1']
No. of NaN values in numeracy2_ut_grid_2 = 0
No. of UNDEFINED values in numeracy2_ut_grid_2 = 0
No. of SKIPPED values in numeracy2_ut_grid_2 = 0

Unique values in numeracy2_ut_grid_3 = [0 1 '0' '1']
No. of NaN values in numeracy2_ut_grid_3 = 0
No. of UNDEFINED values in numeracy2_ut_grid_3 = 0
No. of SKIPPED values in numeracy2_ut_grid_3 = 0

Unique values in numeracy2_ut_grid_4 = [1 0 '0' '1' '.']
No. of NaN values in numeracy2_ut_grid_4 = 0
No. of UNDEFINED values in numeracy2_ut_grid_4 = 0
No. of SKIPPED values in numeracy2_ut_grid_4 = 0

Unique values in numeracy2_ut_grid_5 = ['1' '0' '.']
No. of NaN values in numeracy2_ut_grid_5 = 0
No. of UNDEFINED values in numeracy2_ut_grid_5 = 0
No. of SKIPPED values in numeracy2_ut_grid_5 = 0

U

In [61]:
# Pass each row's untimed Numeracy 2 grid values through fix_score and store
# whatever it returns back into the same columns.
def _numeracy2_ut_fixed_row(row):
    """Call fix_score on this row's untimed Numeracy 2 grid values, in column order."""
    return fix_score([row[col] for col in numeracy2_ut_raw])

numeracy2_ut_fixed = pd.DataFrame(
    up_numeracy.apply(_numeracy2_ut_fixed_row, axis=1).to_list(),
    index=up_numeracy.index,
    columns=numeracy2_ut_raw,
)
up_numeracy.loc[:, numeracy2_ut_raw] = numeracy2_ut_fixed

<h3>
    Numeracy 2: Number Recognition (Timed)
</h3>

In [62]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then recode every grid cell as a
# string: NaN, 'UNDEFINED', 'SKIPPED' and '.' all collapse onto the '999' code.
numeracy2_tt_raw = [col for col in numeracy2_tt if re.search(r'numeracy2_tt_grid_', col)]
for col in numeracy2_tt_raw:
    raw_values = up_numeracy[col]
    print(f"Unique values in {col} = {raw_values.unique()}")
    print(f"No. of NaN values in {col} = {raw_values.isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {(raw_values == 'UNDEFINED').sum()}")
    print(f"No. of SKIPPED values in {col} = {(raw_values == 'SKIPPED').sum()}\n")
    # Build the recoded column and assign it back explicitly. Calling
    # fillna/replace with inplace=True on a column selected from the frame is
    # chained assignment (ineffective under pandas Copy-on-Write), and a bare
    # .astype('str') returns a new Series that would otherwise be thrown away.
    recoded = raw_values.fillna('999')
    for old, new in (('UNDEFINED', '999'), ('SKIPPED', '999'), ('.', '999'), (88, '88'), (1, '1'), (0, '0')):
        recoded = recoded.replace(old, new)
    up_numeracy[col] = recoded.astype('str')

Unique values in numeracy2_tt_grid_1 = [1 0 '0' '1']
No. of NaN values in numeracy2_tt_grid_1 = 0
No. of UNDEFINED values in numeracy2_tt_grid_1 = 0
No. of SKIPPED values in numeracy2_tt_grid_1 = 0

Unique values in numeracy2_tt_grid_2 = [1 0 '0' '1' '.']
No. of NaN values in numeracy2_tt_grid_2 = 0
No. of UNDEFINED values in numeracy2_tt_grid_2 = 0
No. of SKIPPED values in numeracy2_tt_grid_2 = 0

Unique values in numeracy2_tt_grid_3 = [1 0 '0' '1' '.']
No. of NaN values in numeracy2_tt_grid_3 = 0
No. of UNDEFINED values in numeracy2_tt_grid_3 = 0
No. of SKIPPED values in numeracy2_tt_grid_3 = 0

Unique values in numeracy2_tt_grid_4 = ['1' '0' '.']
No. of NaN values in numeracy2_tt_grid_4 = 0
No. of UNDEFINED values in numeracy2_tt_grid_4 = 0
No. of SKIPPED values in numeracy2_tt_grid_4 = 0

Unique values in numeracy2_tt_grid_5 = ['1' '.' '0']
No. of NaN values in numeracy2_tt_grid_5 = 0
No. of UNDEFINED values in numeracy2_tt_grid_5 = 0
No. of SKIPPED values in numeracy2_tt_grid_5 = 

In [63]:
# Pass each row's timed Numeracy 2 grid values through fix_score and store
# whatever it returns back into the same columns.
def _numeracy2_tt_fixed_row(row):
    """Call fix_score on this row's timed Numeracy 2 grid values, in column order."""
    return fix_score([row[col] for col in numeracy2_tt_raw])

numeracy2_tt_fixed = pd.DataFrame(
    up_numeracy.apply(_numeracy2_tt_fixed_row, axis=1).to_list(),
    index=up_numeracy.index,
    columns=numeracy2_tt_raw,
)
up_numeracy.loc[:, numeracy2_tt_raw] = numeracy2_tt_fixed

<h3>
    Numeracy 3: Number Comparison
</h3>

In [64]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then recode every answer as a
# string: NaN, 'UNDEFINED', 'SKIPPED' and '.' all collapse onto the '999' code.
for col in numeracy3:
    raw_values = up_numeracy[col]
    print(f"Unique values in {col} = {raw_values.unique()}")
    print(f"No. of NaN values in {col} = {raw_values.isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {(raw_values == 'UNDEFINED').sum()}")
    print(f"No. of SKIPPED values in {col} = {(raw_values == 'SKIPPED').sum()}\n")
    # Build the recoded column and assign it back explicitly. Calling
    # fillna/replace with inplace=True on a column selected from the frame is
    # chained assignment (ineffective under pandas Copy-on-Write), and a bare
    # .astype('str') returns a new Series that would otherwise be thrown away.
    recoded = raw_values.fillna('999')
    for old, new in (('UNDEFINED', '999'), ('SKIPPED', '999'), ('.', '999'), (88, '88'), (1, '1'), (0, '0')):
        recoded = recoded.replace(old, new)
    up_numeracy[col] = recoded.astype('str')

Unique values in numeracy3_ut_q1 = [1 88 0 '1' '88' '0']
No. of NaN values in numeracy3_ut_q1 = 0
No. of UNDEFINED values in numeracy3_ut_q1 = 0
No. of SKIPPED values in numeracy3_ut_q1 = 0

Unique values in numeracy3_ut_q2 = [0 1 88 '0' '1' '88']
No. of NaN values in numeracy3_ut_q2 = 0
No. of UNDEFINED values in numeracy3_ut_q2 = 0
No. of SKIPPED values in numeracy3_ut_q2 = 0

Unique values in numeracy3_ut_q3 = [1 0 88 '0' '1' '88']
No. of NaN values in numeracy3_ut_q3 = 0
No. of UNDEFINED values in numeracy3_ut_q3 = 0
No. of SKIPPED values in numeracy3_ut_q3 = 0

Unique values in numeracy3_ut_q4 = [1 0 88 '1' '0' '88']
No. of NaN values in numeracy3_ut_q4 = 0
No. of UNDEFINED values in numeracy3_ut_q4 = 0
No. of SKIPPED values in numeracy3_ut_q4 = 0

Unique values in numeracy3_ut_q5 = [1 0 nan 88 '0' '1' '88']
No. of NaN values in numeracy3_ut_q5 = 382
No. of UNDEFINED values in numeracy3_ut_q5 = 0
No. of SKIPPED values in numeracy3_ut_q5 = 0

Unique values in numeracy3_ut_q6 = [0 1

<h3>
    Numeracy 4: Counting in Bundles
</h3>

In [65]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then recode every answer as a
# string: NaN, 'UNDEFINED', 'SKIPPED' and '.' all collapse onto the '999' code.
numeracy4_raw = [col for col in numeracy4 if re.search(r'numeracy4.+\d$', col)]
for col in numeracy4_raw:
    raw_values = up_numeracy[col]
    print(f"Unique values in {col} = {raw_values.unique()}")
    print(f"No. of NaN values in {col} = {raw_values.isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {(raw_values == 'UNDEFINED').sum()}")
    print(f"No. of SKIPPED values in {col} = {(raw_values == 'SKIPPED').sum()}\n")
    # Build the recoded column and assign it back explicitly. Calling
    # fillna/replace with inplace=True on a column selected from the frame is
    # chained assignment (ineffective under pandas Copy-on-Write), and a bare
    # .astype('str') returns a new Series that would otherwise be thrown away.
    recoded = raw_values.fillna('999')
    for old, new in (('UNDEFINED', '999'), ('SKIPPED', '999'), ('.', '999'), (88, '88'), (1, '1'), (0, '0')):
        recoded = recoded.replace(old, new)
    up_numeracy[col] = recoded.astype('str')

Unique values in numeracy4_ut_q1 = [1 0 88 '88' '1' '0']
No. of NaN values in numeracy4_ut_q1 = 0
No. of UNDEFINED values in numeracy4_ut_q1 = 0
No. of SKIPPED values in numeracy4_ut_q1 = 0

Unique values in numeracy4_ut_q2 = [88 0 1 '88' '1' '0']
No. of NaN values in numeracy4_ut_q2 = 0
No. of UNDEFINED values in numeracy4_ut_q2 = 0
No. of SKIPPED values in numeracy4_ut_q2 = 0

Unique values in numeracy4_ut_q3 = [88 0 1 '0' '88' '1']
No. of NaN values in numeracy4_ut_q3 = 0
No. of UNDEFINED values in numeracy4_ut_q3 = 0
No. of SKIPPED values in numeracy4_ut_q3 = 0

Unique values in numeracy4_ut_q4 = [0 1 88 '0' '88' '1']
No. of NaN values in numeracy4_ut_q4 = 0
No. of UNDEFINED values in numeracy4_ut_q4 = 0
No. of SKIPPED values in numeracy4_ut_q4 = 0



<h3>
    Numeracy 5: Missing Numbers
</h3>

In [66]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then recode every answer as a
# string: NaN, 'UNDEFINED', 'SKIPPED' and '.' all collapse onto the '999' code.
numeracy5_raw = [col for col in numeracy5 if re.search(r'numeracy5.+\d$', col)]
for col in numeracy5_raw:
    raw_values = up_numeracy[col]
    print(f"Unique values in {col} = {raw_values.unique()}")
    print(f"No. of NaN values in {col} = {raw_values.isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {(raw_values == 'UNDEFINED').sum()}")
    print(f"No. of SKIPPED values in {col} = {(raw_values == 'SKIPPED').sum()}\n")
    # Build the recoded column and assign it back explicitly. Calling
    # fillna/replace with inplace=True on a column selected from the frame is
    # chained assignment (ineffective under pandas Copy-on-Write), and a bare
    # .astype('str') returns a new Series that would otherwise be thrown away.
    recoded = raw_values.fillna('999')
    for old, new in (('UNDEFINED', '999'), ('SKIPPED', '999'), ('.', '999'), (88, '88'), (1, '1'), (0, '0')):
        recoded = recoded.replace(old, new)
    up_numeracy[col] = recoded.astype('str')

Unique values in numeracy5_ut_q1 = [1 88 0 '0' '1' '88']
No. of NaN values in numeracy5_ut_q1 = 0
No. of UNDEFINED values in numeracy5_ut_q1 = 0
No. of SKIPPED values in numeracy5_ut_q1 = 0

Unique values in numeracy5_ut_q2 = [1 88 0 '0' '1' '88']
No. of NaN values in numeracy5_ut_q2 = 0
No. of UNDEFINED values in numeracy5_ut_q2 = 0
No. of SKIPPED values in numeracy5_ut_q2 = 0

Unique values in numeracy5_ut_q3 = [0 88 1 '0' '1' '88']
No. of NaN values in numeracy5_ut_q3 = 0
No. of UNDEFINED values in numeracy5_ut_q3 = 0
No. of SKIPPED values in numeracy5_ut_q3 = 0

Unique values in numeracy5_ut_q4 = [1 88 0 '0' '88' '1']
No. of NaN values in numeracy5_ut_q4 = 0
No. of UNDEFINED values in numeracy5_ut_q4 = 0
No. of SKIPPED values in numeracy5_ut_q4 = 0

Unique values in numeracy5_ut_q5 = [0 nan 1 88 '0' '1' '88']
No. of NaN values in numeracy5_ut_q5 = 1730
No. of UNDEFINED values in numeracy5_ut_q5 = 0
No. of SKIPPED values in numeracy5_ut_q5 = 0

Unique values in numeracy5_ut_q6 = [0 

<h3>
    Numeracy 6: Addition
</h3>

In [67]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then recode every answer as a
# string: NaN, 'UNDEFINED', 'SKIPPED' and '.' all collapse onto the '999' code.
numeracy6_raw = [col for col in numeracy6 if re.search(r'numeracy6.+\d$', col)]
for col in numeracy6_raw:
    raw_values = up_numeracy[col]
    print(f"Unique values in {col} = {raw_values.unique()}")
    print(f"No. of NaN values in {col} = {raw_values.isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {(raw_values == 'UNDEFINED').sum()}")
    print(f"No. of SKIPPED values in {col} = {(raw_values == 'SKIPPED').sum()}\n")
    # Build the recoded column and assign it back explicitly. Calling
    # fillna/replace with inplace=True on a column selected from the frame is
    # chained assignment (ineffective under pandas Copy-on-Write), and a bare
    # .astype('str') returns a new Series that would otherwise be thrown away.
    recoded = raw_values.fillna('999')
    for old, new in (('UNDEFINED', '999'), ('SKIPPED', '999'), ('.', '999'), (88, '88'), (1, '1'), (0, '0')):
        recoded = recoded.replace(old, new)
    up_numeracy[col] = recoded.astype('str')

Unique values in numeracy6_ut_q1 = [0 1 88 '0' '88' '1']
No. of NaN values in numeracy6_ut_q1 = 0
No. of UNDEFINED values in numeracy6_ut_q1 = 0
No. of SKIPPED values in numeracy6_ut_q1 = 0

Unique values in numeracy6_ut_q2 = [0 1 88 '0' '88' '1']
No. of NaN values in numeracy6_ut_q2 = 0
No. of UNDEFINED values in numeracy6_ut_q2 = 0
No. of SKIPPED values in numeracy6_ut_q2 = 0

Unique values in numeracy6_ut_q3 = [1 88 0 '0' '88' '1']
No. of NaN values in numeracy6_ut_q3 = 0
No. of UNDEFINED values in numeracy6_ut_q3 = 0
No. of SKIPPED values in numeracy6_ut_q3 = 0

Unique values in numeracy6_ut_q4 = [1 88 0 '1' '88' '0']
No. of NaN values in numeracy6_ut_q4 = 0
No. of UNDEFINED values in numeracy6_ut_q4 = 0
No. of SKIPPED values in numeracy6_ut_q4 = 0

Unique values in numeracy6_ut_q5 = [1 0 nan 88 '0' '1' '88']
No. of NaN values in numeracy6_ut_q5 = 1427
No. of UNDEFINED values in numeracy6_ut_q5 = 0
No. of SKIPPED values in numeracy6_ut_q5 = 0

Unique values in numeracy6_ut_q6 = [1 

<h3>
    Numeracy 7: Subtraction
</h3>

In [68]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then recode every answer as a
# string: NaN, 'UNDEFINED', 'SKIPPED' and '.' all collapse onto the '999' code.
numeracy7_raw = [col for col in numeracy7 if re.search(r'numeracy7.+\d$', col)]
for col in numeracy7_raw:
    raw_values = up_numeracy[col]
    print(f"Unique values in {col} = {raw_values.unique()}")
    print(f"No. of NaN values in {col} = {raw_values.isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {(raw_values == 'UNDEFINED').sum()}")
    print(f"No. of SKIPPED values in {col} = {(raw_values == 'SKIPPED').sum()}\n")
    # Build the recoded column and assign it back explicitly. Calling
    # fillna/replace with inplace=True on a column selected from the frame is
    # chained assignment (ineffective under pandas Copy-on-Write), and a bare
    # .astype('str') returns a new Series that would otherwise be thrown away.
    recoded = raw_values.fillna('999')
    for old, new in (('UNDEFINED', '999'), ('SKIPPED', '999'), ('.', '999'), (88, '88'), (1, '1'), (0, '0')):
        recoded = recoded.replace(old, new)
    up_numeracy[col] = recoded.astype('str')

Unique values in numeracy7_ut_q1 = [0 1 88 '0' '88' '1']
No. of NaN values in numeracy7_ut_q1 = 0
No. of UNDEFINED values in numeracy7_ut_q1 = 0
No. of SKIPPED values in numeracy7_ut_q1 = 0

Unique values in numeracy7_ut_q2 = [0 88 1 '0' '88' '1']
No. of NaN values in numeracy7_ut_q2 = 0
No. of UNDEFINED values in numeracy7_ut_q2 = 0
No. of SKIPPED values in numeracy7_ut_q2 = 0

Unique values in numeracy7_ut_q3 = [0 88 1 '0' '88' '1']
No. of NaN values in numeracy7_ut_q3 = 0
No. of UNDEFINED values in numeracy7_ut_q3 = 0
No. of SKIPPED values in numeracy7_ut_q3 = 0

Unique values in numeracy7_ut_q4 = [0 88 1 '0' '88' '1']
No. of NaN values in numeracy7_ut_q4 = 0
No. of UNDEFINED values in numeracy7_ut_q4 = 0
No. of SKIPPED values in numeracy7_ut_q4 = 0

Unique values in numeracy7_ut_q5 = [nan 1 0 88 '1' '0' '88']
No. of NaN values in numeracy7_ut_q5 = 2039
No. of UNDEFINED values in numeracy7_ut_q5 = 0
No. of SKIPPED values in numeracy7_ut_q5 = 0

Unique values in numeracy7_ut_q6 = [na

<h3>
    Numeracy 8: Word Problems
</h3>

In [69]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then recode every answer as a
# string: NaN, 'UNDEFINED', 'SKIPPED' and '.' all collapse onto the '999' code.
numeracy8_raw = [col for col in numeracy8 if re.search(r'numeracy8.+\d$', col)]
for col in numeracy8_raw:
    raw_values = up_numeracy[col]
    print(f"Unique values in {col} = {raw_values.unique()}")
    print(f"No. of NaN values in {col} = {raw_values.isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {(raw_values == 'UNDEFINED').sum()}")
    print(f"No. of SKIPPED values in {col} = {(raw_values == 'SKIPPED').sum()}\n")
    # Build the recoded column and assign it back explicitly. Calling
    # fillna/replace with inplace=True on a column selected from the frame is
    # chained assignment (ineffective under pandas Copy-on-Write), and a bare
    # .astype('str') returns a new Series that would otherwise be thrown away.
    recoded = raw_values.fillna('999')
    for old, new in (('UNDEFINED', '999'), ('SKIPPED', '999'), ('.', '999'), (88, '88'), (1, '1'), (0, '0')):
        recoded = recoded.replace(old, new)
    up_numeracy[col] = recoded.astype('str')

Unique values in numeracy8_ut_q1 = [1 0 88 '0' '1' '88']
No. of NaN values in numeracy8_ut_q1 = 0
No. of UNDEFINED values in numeracy8_ut_q1 = 0
No. of SKIPPED values in numeracy8_ut_q1 = 0

Unique values in numeracy8_ut_q2 = [0 1 88 '0' '1' '88']
No. of NaN values in numeracy8_ut_q2 = 0
No. of UNDEFINED values in numeracy8_ut_q2 = 0
No. of SKIPPED values in numeracy8_ut_q2 = 0

Unique values in numeracy8_ut_q3 = [0 1 88 '0' '1' '88']
No. of NaN values in numeracy8_ut_q3 = 0
No. of UNDEFINED values in numeracy8_ut_q3 = 0
No. of SKIPPED values in numeracy8_ut_q3 = 0

Unique values in numeracy8_ut_q4 = [0 1 88 '0' '1' '88']
No. of NaN values in numeracy8_ut_q4 = 0
No. of UNDEFINED values in numeracy8_ut_q4 = 0
No. of SKIPPED values in numeracy8_ut_q4 = 0

Unique values in numeracy8_ut_q5 = [0 nan 1 88 '1' '0' '88']
No. of NaN values in numeracy8_ut_q5 = 1103
No. of UNDEFINED values in numeracy8_ut_q5 = 0
No. of SKIPPED values in numeracy8_ut_q5 = 0

Unique values in numeracy8_ut_q6 = [na

<h3>
    Numeracy 9a: Shape Recognition (Circle)
</h3>

In [70]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then recode the grid cells as
# strings. In this sub-task 'UNDEFINED' is recoded to '0' (not '999').
numeracy9a_raw = [col for col in numeracy9a if re.search(r'numeracy9a_ut_grid_', col)]
for col in numeracy9a_raw:
    raw_values = up_numeracy[col]
    print(f"Unique values in {col} = {raw_values.unique()}")
    print(f"No. of NaN values in {col} = {raw_values.isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {(raw_values == 'UNDEFINED').sum()}")
    print(f"No. of SKIPPED values in {col} = {(raw_values == 'SKIPPED').sum()}\n")
    # Build the recoded column and assign it back explicitly. Calling replace
    # with inplace=True on a column selected from the frame is chained
    # assignment (ineffective under pandas Copy-on-Write), and a bare
    # .astype('str') returns a new Series that would otherwise be thrown away.
    recoded = raw_values
    for old, new in (('UNDEFINED', '0'), (88, '88'), (1, '1'), (0, '0')):
        recoded = recoded.replace(old, new)
    # This cell does no fillna, so keep missing cells missing instead of letting
    # astype turn them into the literal string 'nan'.
    up_numeracy[col] = recoded.where(recoded.isna(), recoded.astype('str'))

Unique values in numeracy9a_ut_grid_1 = [0 1 '0' '1']
No. of NaN values in numeracy9a_ut_grid_1 = 0
No. of UNDEFINED values in numeracy9a_ut_grid_1 = 0
No. of SKIPPED values in numeracy9a_ut_grid_1 = 0

Unique values in numeracy9a_ut_grid_2 = [0 1 '0' '1']
No. of NaN values in numeracy9a_ut_grid_2 = 0
No. of UNDEFINED values in numeracy9a_ut_grid_2 = 0
No. of SKIPPED values in numeracy9a_ut_grid_2 = 0

Unique values in numeracy9a_ut_grid_3 = [0 1 '0' '1']
No. of NaN values in numeracy9a_ut_grid_3 = 0
No. of UNDEFINED values in numeracy9a_ut_grid_3 = 0
No. of SKIPPED values in numeracy9a_ut_grid_3 = 0

Unique values in numeracy9a_ut_grid_4 = [0 1 '0' '1']
No. of NaN values in numeracy9a_ut_grid_4 = 0
No. of UNDEFINED values in numeracy9a_ut_grid_4 = 0
No. of SKIPPED values in numeracy9a_ut_grid_4 = 0

Unique values in numeracy9a_ut_grid_5 = [0 1 '0' '1']
No. of NaN values in numeracy9a_ut_grid_5 = 0
No. of UNDEFINED values in numeracy9a_ut_grid_5 = 0
No. of SKIPPED values in numeracy9a_

<h3>
    Numeracy 9b: Shape Recognition (Rectangle)
</h3>

In [71]:
# Check data for 'UNDEFINED' or 'SKIPPED' values, then recode the grid cells as
# strings. In this sub-task 'UNDEFINED' is recoded to '0' (not '999').
numeracy9b_raw = [col for col in numeracy9b if re.search(r'numeracy9b_ut_grid_', col)]
for col in numeracy9b_raw:
    raw_values = up_numeracy[col]
    print(f"Unique values in {col} = {raw_values.unique()}")
    print(f"No. of NaN values in {col} = {raw_values.isna().sum()}")
    print(f"No. of UNDEFINED values in {col} = {(raw_values == 'UNDEFINED').sum()}")
    print(f"No. of SKIPPED values in {col} = {(raw_values == 'SKIPPED').sum()}\n")
    # Build the recoded column and assign it back explicitly. Calling replace
    # with inplace=True on a column selected from the frame is chained
    # assignment (ineffective under pandas Copy-on-Write), and a bare
    # .astype('str') returns a new Series that would otherwise be thrown away.
    recoded = raw_values
    for old, new in (('UNDEFINED', '0'), (88, '88'), (1, '1'), (0, '0')):
        recoded = recoded.replace(old, new)
    # This cell does no fillna, so keep missing cells missing instead of letting
    # astype turn them into the literal string 'nan'.
    up_numeracy[col] = recoded.where(recoded.isna(), recoded.astype('str'))

Unique values in numeracy9b_ut_grid_1 = [1 0 '0' '1']
No. of NaN values in numeracy9b_ut_grid_1 = 0
No. of UNDEFINED values in numeracy9b_ut_grid_1 = 0
No. of SKIPPED values in numeracy9b_ut_grid_1 = 0

Unique values in numeracy9b_ut_grid_2 = [0 1 '0' '1']
No. of NaN values in numeracy9b_ut_grid_2 = 0
No. of UNDEFINED values in numeracy9b_ut_grid_2 = 0
No. of SKIPPED values in numeracy9b_ut_grid_2 = 0

Unique values in numeracy9b_ut_grid_3 = [1 0 '0' '1']
No. of NaN values in numeracy9b_ut_grid_3 = 0
No. of UNDEFINED values in numeracy9b_ut_grid_3 = 0
No. of SKIPPED values in numeracy9b_ut_grid_3 = 0

Unique values in numeracy9b_ut_grid_4 = [1 0 '0' '1']
No. of NaN values in numeracy9b_ut_grid_4 = 0
No. of UNDEFINED values in numeracy9b_ut_grid_4 = 0
No. of SKIPPED values in numeracy9b_ut_grid_4 = 0

Unique values in numeracy9b_ut_grid_5 = [1 0 '0' '1']
No. of NaN values in numeracy9b_ut_grid_5 = 0
No. of UNDEFINED values in numeracy9b_ut_grid_5 = 0
No. of SKIPPED values in numeracy9b_

In [72]:
# Write the numeracy frame, as it stands after the cleaning cells, to an Excel workbook.
numeracy_cleaned_path = "up_raw_numeracy_cleaned_full.xlsx"
up_numeracy.to_excel(numeracy_cleaned_path)