In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import regex as re
import os
import seaborn as sns
from openpyxl import load_workbook
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


<h1>
    Import raw data files
</h1>

In [6]:
# NOTE(review): absolute, user-specific path. The relative file names used by the
# read/write calls in later cells resolve against this directory, so the notebook
# only runs on a machine where this folder exists.
os.chdir('/home/rohitdaniel/Documents/Ei_CSF_FLN_Evaluation/SLO_Baseline/cleaned_data')

In [10]:
# Load the "MP literacy_cleaned.xlsx" workbook, using its '_id' column as the row index.
# NOTE(review): the visible cells below read `literacy_raw` / `numeracy_raw` and write
# to `mp_lit_scores` / `mp_num_scores`, none of which is assigned in any visible cell,
# while `mp_lit` is not referenced again -- presumably leftover kernel state; confirm.
mp_lit = pd.read_excel("./MP literacy_cleaned.xlsx", index_col='_id')
# numeracy_raw = pd.read_excel("./mp_raw_numeracy_full.xlsx", index_col='_id')

In [11]:
def total_score(scores):
    """Count the correct marks in ``scores``.

    An entry counts as correct when it equals the integer ``1`` or the
    string ``'1'``; every other value is ignored.

    Parameters
    ----------
    scores : iterable
        Item-level marks for one respondent.

    Returns
    -------
    int
        Number of entries marked correct.
    """
    return sum(1 for mark in scores if mark == 1 or mark == '1')

In [15]:
# def fluency(scores):
#     duration = (60 - )
#     score = [col for col in list if re.search(r'literacy4_tt_grid_\d*$', col)]
#     fluency_score = 60* total_score(score)/duration    
#     return fluency_score

<h1 style="color:blue;">
    Literacy Sub-task Score Calculations
</h1>

<h3>
    Literacy 1: Listening Comprehension
</h3>

In [15]:
# Calculate total and percent-correct score on the listening comprehension sub-task.
# Item columns are those named literacy1_q<digit>.
literacy1 = [col for col in literacy_raw.columns if re.search(r'literacy1_q\d$', col)]
# The list used to be bound to `lit1` while the scoring line read `literacy1`
# (a NameError on a fresh kernel); the old name is kept as an alias.
lit1 = literacy1

mp_lit_scores.loc[:, 'literacy1_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy1].tolist()), axis=1
)
# Divide by the number of matched item columns instead of a hard-coded 4.
mp_lit_scores.loc[:, 'literacy1_%_correct'] = (
    100 * mp_lit_scores['literacy1_total'] / len(literacy1)
)

In [16]:
# Export a frequency table of the responses recorded in each literacy1 column
# whose name ends in "or" (listening comprehension), one worksheet per column.
literacy1_or = [col for col in literacy_raw.columns if re.search(r'literacy1\S*or$', col)]
with pd.ExcelWriter('mp_literacy1_other_responses_vf.xlsx') as writer:
    for col in literacy1_or:
        # Label both output columns explicitly. Renaming 'index' after a bare
        # reset_index() only works on pandas < 2.0; from 2.0 on, value_counts()
        # names the index after the column and the counts 'count', so the old
        # rename put 'Frequency' on the response column.
        (
            literacy_raw[col]
            .value_counts()
            .rename_axis("Child's response")
            .reset_index(name='Frequency')
            .to_excel(writer, sheet_name=col)
        )

<h3>
    Literacy 2: Oral Vocabulary
</h3>

In [17]:
# Total and percent-correct score on the oral vocabulary sub-task.
literacy2 = [
    name for name in literacy_raw.columns
    if re.search(r'literacy2_q\d+$', name) is not None
]

mp_lit_scores.loc[:, 'literacy2_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy2].tolist()), axis=1
)
# Percentage of the matched question columns that were marked correct.
mp_lit_scores.loc[:, 'literacy2_%_correct'] = (
    100 * mp_lit_scores['literacy2_total'] / len(literacy2)
)

In [18]:
# Export a frequency table of the responses recorded in each literacy2 column
# whose name ends in "or" (oral vocabulary), one worksheet per column.
literacy2_or = [col for col in literacy_raw.columns if re.search(r'literacy2\S*or$', col)]
with pd.ExcelWriter('mp_literacy2_other_responses_vf.xlsx') as writer:
    for col in literacy2_or:
        # Label both output columns explicitly. Renaming 'index' after a bare
        # reset_index() only works on pandas < 2.0; from 2.0 on, value_counts()
        # names the index after the column and the counts 'count', so the old
        # rename put 'Frequency' on the response column.
        (
            literacy_raw[col]
            .value_counts()
            .rename_axis("Child's response")
            .reset_index(name='Frequency')
            .to_excel(writer, sheet_name=col)
        )

<h3>
    Literacy 3: Initial Sound Identification
</h3>

In [19]:
# Calculate total and percent-correct score on the initial sound identification sub-task.
# The pattern is anchored on the question number: the previous `literacy3_q+`
# quantified the letter "q" and had no end anchor, so it would also pick up any
# column that merely starts with "literacy3_q" (e.g. a "_or" companion column).
literacy3 = [col for col in literacy_raw.columns if re.search(r'literacy3_q\d+$', col)]

mp_lit_scores.loc[:, 'literacy3_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy3].tolist()), axis=1
)
# Divide by the number of matched item columns instead of a hard-coded 10.
# NOTE(review): assumes the sub-task has exactly the matched question columns -- confirm it is 10.
mp_lit_scores.loc[:, 'literacy3_%_correct'] = (
    100 * mp_lit_scores['literacy3_total'] / len(literacy3)
)

<h3>
    Literacy 4: Letter Recognition (Untimed)
</h3>

In [20]:
# Item columns: names ending in "literacy4_ut_grid_" plus optional digits.
literacy4_ut = [
    name for name in literacy_raw.columns
    if re.search(r'literacy4_ut_grid_\d*$', name) is not None
]

# Total score on the letter naming (untimed) sub-task.
mp_lit_scores.loc[:, 'literacy4_ut_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy4_ut].tolist()), axis=1
)
# Percent correct out of a fixed 15 items.
mp_lit_scores.loc[:, 'literacy4_%_correct'] = (
    100 * mp_lit_scores['literacy4_ut_total'] / 15
)

In [21]:
# Cronbach's alpha across every score column whose name contains '%'.
# NOTE(review): `pg` is not imported in any visible cell -- presumably
# `import pingouin as pg`; confirm and add it to the import cell.
pg.cronbach_alpha(
    data=mp_lit_scores.loc[
        :, [name for name in mp_lit_scores.columns if re.search(r'\w*%\w*', name)]
    ]
)

(0.5522179923488686, array([0.513, 0.589]))

<h3>
    Literacy 4: Letter Recognition (Timed)
</h3>

In [22]:
# Show every column name of the raw literacy frame.
list(literacy_raw.columns)


['tabletUserName',
 'assessment_date',
 'school_details.State_label',
 'school_details.District_label',
 'school_details.Block_label',
 'school_details.School_label',
 'school_details.UDISE_cd_label',
 'SI_std_name',
 'student_age',
 'student_gender',
 'literacy1_q1',
 'literacy1_q1_or',
 'literacy1_q2',
 'literacy1_q2_or',
 'literacy1_q3',
 'literacy1_q3_or',
 'literacy1_q4',
 'literacy1_q4_or',
 'literacy1_end',
 'literacy2_p_q',
 'literacy2_q1',
 'literacy2_q2',
 'literacy2_q3',
 'literacy2_q4',
 'literacy2_q5',
 'literacy2_q6',
 'literacy2_q7',
 'literacy2_q8',
 'literacy2_q10',
 'literacy2_end',
 'literacy2_q1_or',
 'literacy2_q2_or',
 'literacy2_q3_or',
 'literacy2_q4_or',
 'literacy2_q5_or',
 'literacy2_q6_or',
 'literacy2_q7_or',
 'literacy2_q8_or',
 'literacy2_q9',
 'literacy2_q9_or',
 'literacy2_q10_or',
 'literacy3_p_q1',
 'literacy3_p_q2',
 'literacy3_q1',
 'literacy3_q2',
 'literacy3_q3',
 'literacy3_q4',
 'literacy3_q5',
 'literacy3_q6',
 'literacy3_q7',
 'literacy3_q8',


In [23]:
# Item columns: names ending in "literacy4_tt_grid_" plus optional digits.
literacy4_tt = [
    name for name in literacy_raw.columns
    if re.search(r'literacy4_tt_grid_\d*$', name) is not None
]
# Duration = grid duration minus time remaining; stored on literacy_raw itself.
literacy_raw.loc[:, 'literacy4_tt_duration'] = literacy_raw['literacy4_tt_grid.duration'].sub(
    literacy_raw['literacy4_tt_grid.time_remaining']
)

# Total score on the letter naming (timed) sub-task.
mp_lit_scores.loc[:, 'literacy4_tt_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy4_tt].tolist()), axis=1
)
# mp_lit_scores.loc[:, 'literacy4_tt_time_remaining'] = literacy_raw['literacy4_tt_grid.time_remaining']

In [25]:
# Cronbach's alpha over the item columns collected in `literacy4_tt`.
pg.cronbach_alpha(data=literacy_raw.loc[:, literacy4_tt])

(0.9749330457282289, array([0.973, 0.977]))

<h3>
    Literacy 5: Familiar Words Reading (Untimed)
</h3>

In [26]:
# Item columns: names ending in "literacy5_ut_grid_" plus optional digits.
literacy5_ut = [
    name for name in literacy_raw.columns
    if re.search(r'literacy5_ut_grid_\d*$', name) is not None
]

# Total score on the familiar words reading (untimed) sub-task.
mp_lit_scores.loc[:, 'literacy5_ut_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy5_ut].tolist()), axis=1
)

In [27]:
# Cronbach's alpha over the item columns collected in `literacy5_ut`.
pg.cronbach_alpha(data=literacy_raw.loc[:, literacy5_ut])

(0.932063076705482, array([0.927, 0.937]))

<h3>
    Literacy 5: Familiar Words Reading (Timed)
</h3>

In [28]:
# Item columns: names ending in "literacy5_tt_grid_" plus optional digits.
literacy5_tt = [
    name for name in literacy_raw.columns
    if re.search(r'literacy5_tt_grid_\d*$', name) is not None
]

# Total score on the familiar words reading (timed) sub-task.
mp_lit_scores.loc[:, 'literacy5_tt_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy5_tt].tolist()), axis=1
)

In [29]:
# Cronbach's alpha over the item columns collected in `literacy5_tt`.
pg.cronbach_alpha(data=literacy_raw.loc[:, literacy5_tt])

(0.9421205229086459, array([0.938, 0.946]))

<h3>
    Literacy 6: Non-word Reading
</h3>

In [30]:
# Item columns: names ending in "literacy6_tt_grid_" plus optional digits.
literacy6 = [
    name for name in literacy_raw.columns
    if re.search(r'literacy6_tt_grid_\d*$', name) is not None
]

# Total score on the non-word reading sub-task.
mp_lit_scores.loc[:, 'literacy6_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy6].tolist()), axis=1
)

In [31]:
# Cronbach's alpha over the item columns collected in `literacy6`.
pg.cronbach_alpha(data=literacy_raw.loc[:, literacy6])

(0.934648268558154, array([0.93 , 0.939]))

<h3>
    Literacy 7: Oral Reading Fluency (Timed)
</h3>

In [32]:
# Item columns: names ending in "literacy7_tt_grid_" plus optional digits.
literacy7 = [
    name for name in literacy_raw.columns
    if re.search(r'literacy7_tt_grid_\d*$', name) is not None
]

# Total score on the oral reading fluency (timed) sub-task.
mp_lit_scores.loc[:, 'literacy7_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy7].tolist()), axis=1
)

In [33]:
# Cronbach's alpha over the item columns collected in `literacy7`.
pg.cronbach_alpha(data=literacy_raw.loc[:, literacy7])

(0.9519612153125053, array([0.948, 0.955]))

<h3>
    Literacy 8: Reading Comprehension (Untimed)
</h3>

In [34]:
# Reading items: names ending in "literacy8_ut_grid_" plus optional digits.
literacy8_reading = [
    name for name in literacy_raw.columns
    if re.search(r'literacy8_ut_grid_\d*$', name) is not None
]

# Comprehension items: names ending in "literacy8_ut_q" plus optional digits.
literacy8_comprehension = [
    name for name in literacy_raw.columns
    if re.search(r'literacy8_ut_q\d*$', name) is not None
]

# One total per item group.
mp_lit_scores.loc[:, 'literacy8_reading_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy8_reading].tolist()), axis=1
)

mp_lit_scores.loc[:, 'literacy8_comprehension_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy8_comprehension].tolist()), axis=1
)

In [35]:
# Cronbach's alpha over the item columns collected in `literacy8_reading`.
pg.cronbach_alpha(data=literacy_raw.loc[:, literacy8_reading])

(0.9711815886090991, array([0.969, 0.973]))

In [36]:
# Cronbach's alpha over the item columns collected in `literacy8_comprehension`.
pg.cronbach_alpha(data=literacy_raw.loc[:, literacy8_comprehension])

(0.9901580111416294, array([0.989, 0.991]))

<h3>
    Literacy 9a: Dictation (Letters)
</h3>

In [37]:
# Item columns: names ending in "literacy9a_ut_grid_" plus optional digits.
literacy9a = [
    name for name in literacy_raw.columns
    if re.search(r'literacy9a_ut_grid_\d*$', name) is not None
]

# Total score on the letter dictation sub-task.
mp_lit_scores.loc[:, 'literacy9a_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy9a].tolist()), axis=1
)

In [38]:
# Cronbach's alpha over the item columns collected in `literacy9a`.
pg.cronbach_alpha(data=literacy_raw.loc[:, literacy9a])

(0.9116573414888413, array([0.905, 0.918]))

<h3>
    Literacy 9b: Dictation (Words)
</h3>

In [39]:
# Item columns: names ending in "literacy9b_ut_grid_" plus optional digits.
literacy9b = [
    name for name in literacy_raw.columns
    if re.search(r'literacy9b_ut_grid_\d*$', name) is not None
]

# Total score on the word dictation sub-task.
mp_lit_scores.loc[:, 'literacy9b_total'] = literacy_raw.apply(
    lambda row: total_score(row[literacy9b].tolist()), axis=1
)

In [40]:
# Cronbach's alpha over the item columns collected in `literacy9b`.
pg.cronbach_alpha(data=literacy_raw.loc[:, literacy9b])

(0.9753077409810995, array([0.973, 0.977]))

In [41]:
# Preview only the first rows. With display.max_rows set to None, a bare
# `mp_lit_scores` renders every row (student names included) into the saved notebook.
mp_lit_scores.head(10)

Unnamed: 0_level_0,tabletUserName,assessment_date,school_details.State_label,school_details.District_label,school_details.Block_label,school_details.School_label,school_details.UDISE_cd_label,SI_std_name,student_age,student_gender,literacy1_total,literacy1_%_correct,literacy2_total,literacy2_%_correct,literacy3_total,literacy3_%_correct,literacy4_ut_total,literacy4_%_correct,literacy4_tt_total,literacy5_ut_total,literacy5_tt_total,literacy6_total,literacy7_total,literacy8_reading_total,literacy8_comprehension_total,literacy9a_total,literacy9b_total
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
0050c62f-02ba-4cc6-bb95-bf76e33e5069,Jeevan,2022-09-17,Madhya Pradesh,Shajapur,M. Barodiya,GIRLS P.S. PALSAWAD,23220413303,Bhumika,5.0,Female,3,75.0,9,90.0,0,0.0,0,0.0,3,0,0,0,0,0,0,0,0
00d4fdf3-5016-4037-a386-148bf838ff90,Kajal,2022-09-23,Madhya Pradesh,Vidisha,Gyaraspur,PS NIPANIYA,23310211501,pavan kushwah,6.0,Male,4,100.0,8,80.0,7,70.0,11,73.333333,28,4,7,2,2,1,0,4,0
010fada9-996a-49c2-a8af-bdb6c5c688a3,Rahnuma,2022-09-16,Madhya Pradesh,Shajapur,Shajapur,P.S. RATANPURA,23220603401,lakshmi gurjar,5.0,Female,1,25.0,7,70.0,0,0.0,0,0.0,0,0,0,0,0,0,0,10,10
015e4b93-016a-457c-8987-65a7a048b051,Vaibhav,2022-09-23,Madhya Pradesh,Shajapur,Shajapur,BOYS M.S. MAHUPURA SHAJAPUR,23220605849,radhika rodu,6.0,Female,0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0,0,10,10
0170015d-21f2-4342-a66e-bd9ad6f49162,Nanda,2022-09-16,Madhya Pradesh,Shajapur,Shajapur,NEW M.S. DILLOD,23220619202,Subham Ramesh,6.0,Male,0,0.0,7,70.0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0
0180327f-16d3-49d9-9ea5-971a8ae20f57,Seema,2022-09-14,Madhya Pradesh,Sehore,Ashta,GMS GWALI,23330115001,nagesh,5.0,Male,3,75.0,8,80.0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0
0184f234-c120-4640-8e53-0071d2b5ec11,Kajal,2022-09-21,Madhya Pradesh,Shajapur,Shujalpur,P.S. GANGLA KHEDI,23220714101,Shivanya hukum Singh,5.0,Female,3,75.0,6,60.0,0,0.0,0,0.0,1,0,0,0,0,0,0,0,0
01c7e032-f3f8-4987-85cb-a2e54fbbbae9,Kirti,2022-09-21,Madhya Pradesh,Shajapur,Kalapipal,UEGS KHEDA BEHRAWAL,23220505512,Jaya purviya,5.0,Female,0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0
01d68d6f-12e2-47de-99d2-d5bad8643286,Sadhna,2022-09-20,Madhya Pradesh,Shajapur,Shujalpur,P. S. TITWAS,23220709901,Gagan,4.0,Male,2,50.0,9,90.0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0
022ba980-6fa8-4e72-aef8-2ccf8ed34cb7,Radhika,2022-09-22,Madhya Pradesh,Shajapur,Shajapur,UEGS BHERU KHEDA PATOLI,23220609502,Ayush,5.0,Male,1,25.0,9,90.0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0


In [56]:
# Write the literacy score table to an Excel workbook in the working directory.
mp_lit_scores.to_excel(excel_writer="mp_literacy_total_scores_v3.xlsx")

<h2>
    B. Numeracy Sub-tasks Data Cleaning
</h2>

<h3>
    Numeracy 1: Counting
</h3>

In [43]:
# Total score on the counting sub-task.
# Item columns: names ending in "numeracy1_tt_grid_" plus optional digits.
numeracy1 = [
    name for name in numeracy_raw.columns
    if re.search(r'numeracy1_tt_grid_\d*$', name) is not None
]

mp_num_scores.loc[:, 'numeracy1_total'] = numeracy_raw.apply(
    lambda row: total_score(row[numeracy1].tolist()), axis=1
)

<h3>
    Numeracy 2: Number Recognition (Untimed)
</h3>

In [44]:
# Item columns: names ending in "numeracy2_ut_grid_" plus optional digits.
numeracy2_ut = [
    name for name in numeracy_raw.columns
    if re.search(r'numeracy2_ut_grid_\d*$', name) is not None
]

# Total score on the number recognition (untimed) sub-task.
mp_num_scores.loc[:, 'numeracy2_ut_total'] = numeracy_raw.apply(
    lambda row: total_score(row[numeracy2_ut].tolist()), axis=1
)

<h3>
    Numeracy 2: Number Recognition (Timed)
</h3>

In [45]:
# Item columns: names ending in "numeracy2_tt_grid_" plus optional digits.
numeracy2_tt = [
    name for name in numeracy_raw.columns
    if re.search(r'numeracy2_tt_grid_\d*$', name) is not None
]

# Total score on the number recognition (timed) sub-task.
mp_num_scores.loc[:, 'numeracy2_tt_total'] = numeracy_raw.apply(
    lambda row: total_score(row[numeracy2_tt].tolist()), axis=1
)

<h3>
    Numeracy 3: Number Comparison
</h3>

In [46]:
# NOTE(review): the pattern is unanchored, so every column whose name contains
# "numeracy3" is passed to total_score -- confirm no non-item columns are included.
numeracy3 = [
    name for name in numeracy_raw.columns
    if re.search(r'numeracy3\w*', name) is not None
]

# Total score on the number comparison sub-task.
mp_num_scores.loc[:, 'numeracy3_total'] = numeracy_raw.apply(
    lambda row: total_score(row[numeracy3].tolist()), axis=1
)

<h3>
    Numeracy 4: Counting in Bundles
</h3>

In [47]:
# Item columns: names ending in "numeracy4_ut_q" plus optional digits.
numeracy4 = [
    name for name in numeracy_raw.columns
    if re.search(r'numeracy4_ut_q\d*$', name) is not None
]

# Total score on the counting in bundles sub-task.
mp_num_scores.loc[:, 'numeracy4_total'] = numeracy_raw.apply(
    lambda row: total_score(row[numeracy4].tolist()), axis=1
)

<h3>
    Numeracy 5: Missing Numbers
</h3>

In [48]:
# Item columns: names ending in "numeracy5_ut_q" plus optional digits.
numeracy5 = [
    name for name in numeracy_raw.columns
    if re.search(r'numeracy5_ut_q\d*$', name) is not None
]

# Total score on the missing numbers sub-task.
mp_num_scores.loc[:, 'numeracy5_total'] = numeracy_raw.apply(
    lambda row: total_score(row[numeracy5].tolist()), axis=1
)

<h3>
    Numeracy 6: Addition
</h3>

In [49]:
# Item columns: names ending in "numeracy6_ut_q" plus optional digits.
numeracy6 = [
    name for name in numeracy_raw.columns
    if re.search(r'numeracy6_ut_q\d*$', name) is not None
]

# Total score on the addition sub-task.
mp_num_scores.loc[:, 'numeracy6_total'] = numeracy_raw.apply(
    lambda row: total_score(row[numeracy6].tolist()), axis=1
)

<h3>
    Numeracy 7: Subtraction
</h3>

In [50]:
# Item columns: names ending in "numeracy7_ut_q" plus optional digits.
numeracy7 = [
    name for name in numeracy_raw.columns
    if re.search(r'numeracy7_ut_q\d*$', name) is not None
]

# Total score on the subtraction sub-task.
mp_num_scores.loc[:, 'numeracy7_total'] = numeracy_raw.apply(
    lambda row: total_score(row[numeracy7].tolist()), axis=1
)

<h3>
    Numeracy 8: Word Problems
</h3>

In [51]:
# Item columns: names ending in "numeracy8_ut_q" plus optional digits.
numeracy8 = [
    name for name in numeracy_raw.columns
    if re.search(r'numeracy8_ut_q\d*$', name) is not None
]

# Total score on the word problems sub-task.
mp_num_scores.loc[:, 'numeracy8_total'] = numeracy_raw.apply(
    lambda row: total_score(row[numeracy8].tolist()), axis=1
)

<h3>
    Numeracy 9a: Shape Recognition (Circle)
</h3>

In [52]:
# Grid columns for the circle task: names ending in "numeracy9a_ut_grid_" plus one digit.
numeracy9a = [
    name for name in numeracy_raw.columns
    if re.search(r'numeracy9a_ut_grid_\d$', name) is not None
]

def total_score_9a(scores):
    """Score the circle shape-recognition grid as all-or-nothing.

    Returns 1 only when the first eight entries of ``scores`` match the key
    1, 1, 0, 1, 1, 1, 1, 0 position by position; otherwise returns 0.

    Each entry may be given as an integer or as its string form (``1``/``'1'``,
    ``0``/``'0'``), mirroring what ``total_score`` accepts. Previously only the
    string form matched, so integer-coded data always scored 0.

    Parameters
    ----------
    scores : sequence
        Marks for the grid cells, in column order.

    Returns
    -------
    int
        1 if the whole pattern matches, else 0. A sequence with fewer than
        eight entries scores 0 instead of raising IndexError.
    """
    answer_key = (1, 1, 0, 1, 1, 1, 1, 0)
    if len(scores) < len(answer_key):
        return 0
    # zip stops at the key length, so extra trailing entries are ignored as before.
    return int(all(
        mark == expected or mark == str(expected)
        for mark, expected in zip(scores, answer_key)
    ))
    
# Score each row's circle grid (0 or 1) from the `numeracy9a` columns, in list order.
mp_num_scores.loc[:, 'numeracy9a_total'] = numeracy_raw.apply(
    lambda row: total_score_9a(row[numeracy9a].tolist()), axis=1
)

<h3>
    Numeracy 9b: Shape Recognition (Rectangle)
</h3>

In [53]:
# Grid columns for the rectangle task: names ending in "numeracy9b_ut_grid_" plus one digit.
numeracy9b = [
    name for name in numeracy_raw.columns
    if re.search(r'numeracy9b_ut_grid_\d$', name) is not None
]
    
def total_score_9b(scores):
    """Score the rectangle shape-recognition grid as all-or-nothing.

    Returns 1 only when the first eight entries of ``scores`` match the key
    1, 0, 1, 1, 0, 1, 1, 0 position by position; otherwise returns 0.

    Each entry may be given as an integer or as its string form (``1``/``'1'``,
    ``0``/``'0'``), mirroring what ``total_score`` accepts. Previously only the
    string form matched, so integer-coded data always scored 0.

    Parameters
    ----------
    scores : sequence
        Marks for the grid cells, in column order.

    Returns
    -------
    int
        1 if the whole pattern matches, else 0. A sequence with fewer than
        eight entries scores 0 instead of raising IndexError.
    """
    answer_key = (1, 0, 1, 1, 0, 1, 1, 0)
    if len(scores) < len(answer_key):
        return 0
    # zip stops at the key length, so extra trailing entries are ignored as before.
    return int(all(
        mark == expected or mark == str(expected)
        for mark, expected in zip(scores, answer_key)
    ))
    
# Score each row's rectangle grid (0 or 1) from the `numeracy9b` columns, in list order.
mp_num_scores.loc[:, 'numeracy9b_total'] = numeracy_raw.apply(
    lambda row: total_score_9b(row[numeracy9b].tolist()), axis=1
)

In [55]:
# Write the numeracy score table to an Excel workbook in the working directory.
mp_num_scores.to_excel(excel_writer="mp_numeracy_total_scores_v3.xlsx")