In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Convert the yearly chronic-absentee TXT exports (tab-delimited) to CSV and
# concatenate them into one file.
# Chronic Absentee data: https://www.cde.ca.gov/ds/ad/filesabd.asp
# Chronic Absentee file description: https://www.cde.ca.gov/ds/ad/fsabd.asp

# The 2017-2019 files use unspaced headers and the 2017-2021 files carry a
# truncated enrollment header. Mapping everything onto the newest spelling up
# front lets a single pipeline handle every year. rename() ignores keys that
# are absent, so this is a no-op for files that already use the new headers.
_LEGACY_COLUMN_NAMES = {
    "AcademicYear": "Academic Year",
    "AggregateLevel": "Aggregate Level",
    "CountyCode": "County Code",
    "DistrictCode": "District Code",
    "SchoolCode": "School Code",
    "CountyName": "County Name",
    "DistrictName": "District Name",
    "SchoolName": "School Name",
    "CharterYN": "Charter School",
    "ReportingCategory": "Reporting Category",
    "ChronicAbsenteeismEligibleCumula": "ChronicAbsenteeismEligibleCumulativeEnrollment",
}

# Columns dropped after the school-level filter to keep the data scrollable.
_UNUSED_COLUMNS = ['Aggregate Level', 'County Code', 'District Code',
                   'School Code', 'County Name', 'Charter School']


def _convert_absentee_file(txt_path, csv_path, star_suppressed):
    """Clean one chronic-absentee TXT export, write it as CSV, and return it.

    Parameters
    ----------
    txt_path : str
        Tab-delimited source file.
    csv_path : str
        Destination CSV (written without the index).
    star_suppressed : bool
        True  -> drop rows whose 'ChronicAbsenteeismCount' is the string '*'
                 (student-privacy suppression).
        False -> drop rows whose 'ChronicAbsenteeismRate' is null.
        NOTE(review): which files need which filter is carried over from the
        per-file code this helper replaces -- confirm against the raw files.

    Returns
    -------
    pandas.DataFrame
        School-level rows with normalised headers and unused columns removed.
    """
    frame = pd.read_csv(txt_path, delimiter='\t', low_memory=False,
                        on_bad_lines='skip', encoding='unicode_escape')
    frame = frame.rename(columns=_LEGACY_COLUMN_NAMES)
    # The DASS column was added from 2022 on; drop it when present so every
    # year ends up with the same columns.
    frame = frame.drop(columns="DASS", errors="ignore")
    # Only want school based data.
    frame = frame[frame['Aggregate Level'] == "S"]
    frame = frame.drop(columns=_UNUSED_COLUMNS)
    if star_suppressed:
        frame = frame[frame['ChronicAbsenteeismCount'] != '*']
    else:
        frame = frame[frame['ChronicAbsenteeismRate'].notnull()]
    frame.to_csv(csv_path, index=False)
    return frame


df1 = _convert_absentee_file("chronicabsenteeism23.txt", 'chronicabsenteeism23.csv', star_suppressed=True)
df2 = _convert_absentee_file("chronicabsenteeism22-v3.txt", 'chronicabsenteeism22.csv', star_suppressed=True)
df3 = _convert_absentee_file("chronicabsenteeism21.txt", 'chronicabsenteeism21.csv', star_suppressed=False)
df4 = _convert_absentee_file("chronicabsenteeism19.txt", 'chronicabsenteeism19.csv', star_suppressed=False)
df5 = _convert_absentee_file("chronicabsenteeism18.txt", 'chronicabsenteeism18.csv', star_suppressed=False)
df6 = _convert_absentee_file("chronicabsenteeism17.txt", 'chronicabsenteeism17.csv', star_suppressed=False)

# Concatenate all Chronic Absentee data
dfAbsentee = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True, sort=False)
dfAbsentee.to_csv('allChronicAbsentee.csv', index=False)

In [4]:
# All absentee data, split by academic year.
# allChronicAbsentee.csv is already cleaned, so it is only trimmed and sorted
# here: drop the two unused columns, order by year then school, and discard
# any row that still has a missing value.
dfAbsRAW = (
    pd.read_csv("allChronicAbsentee.csv")
    .drop(['District Name', 'ChronicAbsenteeismRate'], axis = 1)
    .sort_values(by=['Academic Year', 'School Name'])
    .dropna()
)


def _write_year_sample(academic_year, csv_name):
    """Write the dfAbsRAW rows of one academic year to csv_name and return them."""
    year_rows = dfAbsRAW[dfAbsRAW['Academic Year'] == academic_year]
    year_rows.to_csv(csv_name, index = None)
    return year_rows


# One sample file per academic year, kept separate for parallel computing.
dfAbs16 = _write_year_sample("2016-17", 'Sample1.csv')
dfAbs17 = _write_year_sample("2017-18", 'Sample2.csv')
dfAbs18 = _write_year_sample("2018-19", 'Sample3.csv')
dfAbs20 = _write_year_sample("2020-21", 'Sample4.csv')
dfAbs21 = _write_year_sample("2021-22", 'Sample5.csv')
dfAbs22 = _write_year_sample("2022-23", 'Sample6.csv')

In [2]:
# 'Reporting Category' codes to compute absentee rates for.
# Order matters: the plotting code slices this list by position
# ([0:9] for the ethnicity pie chart, [18:24] for the grade-group bar chart).
# NOTE(review): the group labels on the middle rows are inferred from the code
# prefixes -- confirm against the CDE file description.
demographicList = (
    ["RB", "RI", "RA", "RF", "RH", "RD", "RP", "RT", "RW"]    # 0-8: ethnicity
    + ["GM", "GF", "GX"]                                      # 9-11: presumably gender
    + ["SE", "SD", "SS", "SM", "SF", "SH"]                    # 12-17: presumably student groups
    + ["GRKN", "GR13", "GR46", "GR78", "GRK8", "GR912"]       # 18-23: grade groups
    + ["TA"]                                                  # 24: presumably the overall total
)

In [None]:
# For each academic year: compute the chronic-absentee rate of every reporting
# category, print it, and save a pie chart (ethnicities) and a bar chart
# (grade groups) as PNG files.

# `$SLURM_ARRAY_TASK_ID` is shell syntax and does not parse as Python, so the
# task id is read from the environment instead. The "- 1" treats the task id
# as 1-based. Outside a SLURM array job the variable is unset and every year
# is processed.
# NOTE(review): using yearnum to pick the single year handled by this array
# task is inferred from the "parallel computing" split into per-year samples --
# confirm that this is the intended job layout.
_slurm_task_id = os.environ.get("SLURM_ARRAY_TASK_ID")
yearnum = int(_slurm_task_id) - 1 if _slurm_task_id else None

#getting name of year from df
years = dfAbsRAW['Academic Year'].unique().tolist()
if yearnum is not None:
    if not 0 <= yearnum < len(years):
        raise IndexError(
            f"SLURM_ARRAY_TASK_ID={_slurm_task_id} does not match any of the "
            f"{len(years)} academic years"
        )
    years = [years[yearnum]]

for year in years:
    # Start a fresh list for every year so the positional slices used for the
    # charts below always refer to the current year's rates.
    absRateDemoList = []

    # Refine the set to the current year once, instead of once per demographic.
    dfAbsYear = dfAbsRAW[dfAbsRAW['Academic Year'] == year].dropna()

    #using demographic symbol in demographic list
    for demo in demographicList:
        #refine that to the current demographic
        dfAbsDemoYear = dfAbsYear[dfAbsYear['Reporting Category'] == demo]
        #sum the eligible students of the set comprised of the demo and the year
        eligibleDemo = dfAbsDemoYear['ChronicAbsenteeismEligibleCumulativeEnrollment'].astype(float).sum()
        #sum the chronically absent students of the set comprised of the demo and the year
        chronicAbsDemo = dfAbsDemoYear['ChronicAbsenteeismCount'].astype(float).sum()
        #create the absentee rate for the demographic for the year;
        #a category with no eligible students has no defined rate
        absRateDemo = chronicAbsDemo / eligibleDemo if eligibleDemo else float("nan")

        #Add it to the list of all the demos for the year
        absRateDemoList.append(absRateDemo)

        print("In", year, "with", chronicAbsDemo, "absent of a possible", eligibleDemo, "students, the chronic absentee rate of", demo, "is: ", absRateDemo)

    #pie chart for ethnicities for that year
    # Non-finite rates are left out: a NaN value would make the sum of the
    # slices NaN, and the pie is drawn from each slice's share of that sum.
    ethnicityRates = [
        (label, rate)
        for label, rate in zip(demographicList[0:9], absRateDemoList[0:9])
        if np.isfinite(rate)
    ]
    if sum(rate for _, rate in ethnicityRates) > 0:
        # Each chart gets its own figure so the bar chart is not drawn on top
        # of the pie, and it is closed after saving so figures do not pile up.
        fig, ax = plt.subplots()
        ax.pie(
            [rate for _, rate in ethnicityRates],
            labels=[label for label, _ in ethnicityRates],
            autopct='%1.1f%%',
            startangle=90,
        )
        titlep = "Chronic Absentee Rates by Ethnicity for " + str(year)
        ax.set_title(titlep)
        fig.savefig(f'ChronAbsEthni{year}.png')
        plt.close(fig)
    else:
        print("No ethnicity rates available for", year, "- pie chart skipped")

    #bar graph for grade groups for that year
    categories = demographicList[18:24]
    values = absRateDemoList[18:24]

    fig, ax = plt.subplots()
    ax.bar(categories, values)
    ax.set_xlabel('Grade Groups')
    ax.set_ylabel('Absentee Rate')
    titleb = "Absentee Rates by Grade Group for " + str(year)
    ax.set_title(titleb)
    fig.savefig(f'ChronAbsGrade{year}.png')
    plt.close(fig)
