In [1]:
import tabula
import pandas as pd
import os
import datetime
import numpy as np

# Let pandas render up to 2000 rows before truncating displayed frames.
pd.set_option('display.max_rows', 2000)


In [2]:
# change campus names
def change_school_name(df):
    """Normalize the campus names in ``df['School']`` in place.

    Every entry of the replacement table below is applied to each name as a
    case-sensitive substring replacement, in table order, and the name is
    stripped of surrounding whitespace after each replacement.  Order matters:
    longer or more specific markers (e.g. 'ESL', 'Wm. Lott') are listed before
    shorter ones that they contain or produce ('ES', 'Lott').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'School' column.  The column is overwritten.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    Non-string cells (NaN, None, numbers) are passed through unchanged
    instead of raising ``AttributeError``.
    """

    # school name dictionary: substring -> replacement ('' removes the substring)
    school_dict = {
        # program / group labels that are appended to the campus name
        'Bilingual': '',
        'NON-LEP DL': '',
        'Non‐LEP DL': '',
        'LEP DL BIL': '',
        'ESL': '',
        'ESOL': '',
        'Regular': '',
        'Self Contained': '',
        'S C': '',

        # abbreviations that are spelled out
        'Mt.': 'Mountain',
        'Tr.': 'Trail',

        # school-level suffixes
        'MIS': '',
        'ES': '',
        'MS': '',
        'HS': '',
        'High School': '',
        'Elementary School': '',
        'Middle School': '',
        'Elem School': '',
        'Elementary': '',
        'Elem: ': '',
        'EC': 'Early College',

        # alternative spellings mapped to one canonical name
        'Wm S Lott Juvenile Ctr': 'LOTT',
        'Wm. Lott': 'LOTT',
        'Lott': 'LOTT',
        'RROC': 'Round Rock Opportunity Center',
        'Daep': 'DAEP',
        'Deepwood': 'Deep Wood',
        'Liveoak': 'Live Oak',

        # first names / initials dropped so only the surname remains
        'Joe Lee': '',
        'Xenia': '',
        'Patsy': '',
        'Neysa': '',
        'Noel': '',
        'C.D.': 'CD',
        'C. D.': 'CD',
        'Claude': '',
        'Elsa': '',
        'James': '',
        'Kathy': '',
        'Linda': '',
        'RRISD': '',
    }

    def _clean(name):
        # leave missing / non-text cells alone rather than crashing on .replace
        if not isinstance(name, str):
            return name
        # change keys(k) to values(v), stripping after every step
        for k, v in school_dict.items():
            name = name.replace(k, v).strip()
        return name

    # one pass over the column instead of one pass per table entry
    df['School'] = df['School'].map(_clean)

    return None

In [3]:
# Maps a cleaned campus name to its school type.
# Campuses that are not listed here are treated as elementary schools ('ES')
# by the code that consumes this table.
school_type = {
    # list of middle schools
    'CD Fulkes': 'MS',
    'Canyon Vista': 'MS',
    'Cedar Valley': 'MS',
    'Chisholm Trail': 'MS',
    'Deerpark': 'MS',
    'Grisham': 'MS',
    'Hernandez': 'MS',
    'Hopewell': 'MS',
    'Pearson Ranch': 'MS',
    'Ridgeview': 'MS',
    'Walsh': 'MS',
    
    # list of high schools
    'Cedar Ridge': 'HS',
    'McNeil': 'HS',
    'Round Rock': 'HS',
    'Stony Point': 'HS',
    'Westwood': 'HS',
    'Success': 'HS',
    'Early College': 'HS',
    
    # other schools
    'Round Rock Opportunity Center': 'Other',
    'DAEP': 'Other',
    'JJAEP': 'Other',
    'LOTT': 'Other',
    # the combined JJAEP / Lott row is cleaned to this single name; without
    # this entry it would silently fall back to the 'ES' default
    'JJAEP LOTT': 'Other',
}

In [8]:
# Build one long-format enrollment table (one row per date, school and grade)
# from every PDF report found in the data directory.

# path for files
path = os.getcwd() + '/data/pdf/1213/'

# file names, processed in lexicographic order
filenames = sorted(os.listdir(path))

pdf = pd.DataFrame()
df_pdf = []
grades = ['ECE', 'PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

# Ordered (lower-case marker, group label) rules; the first marker found in the
# raw school name wins.  Both hyphen spellings of "non-LEP DL" and the
# spelled-out "self contained" are recognised, matching the spellings that
# change_school_name() strips from the names.
group_markers = [
    ('bilingual', 'Bilingual'),
    ('non‐lep dl', 'Non-LEP DL'),   # U+2010 hyphen
    ('non-lep dl', 'Non-LEP DL'),   # ASCII hyphen
    ('lep dl bil', 'LEP DL BIL'),
    ('esol', 'ESL'),
    ('esl', 'ESL'),
    ('regular', 'Regular'),
    ('s/c', 'Self Contained'),
    ('self contained', 'Self Contained'),
]


def _enrollment_group(raw_name):
    """Return the group label for a raw (not yet cleaned) school-name cell."""
    name = str(raw_name).strip().lower()
    for marker, label in group_markers:
        if marker in name:
            return label
    # rows without any marker are regular enrollment
    return 'Regular'


for filename in filenames:
    if not filename.endswith('pdf'):
        continue
    print(filename)

    # read all pages from a pdf
    # read_pdf returns a list of dataframes, one per table / page
    pages = tabula.read_pdf(path + filename, pages='all', multiple_tables=True, spreadsheet=True)

    # merge all dataframes into one, keeping only the columns they all share
    pdf = pd.concat(pages, join='inner', axis=0, ignore_index=True)

    # get columns we want (first 16), rename them to School + the 15 grades
    pdf = pdf[list(range(16))]
    pdf = pdf.rename(dict(enumerate(['School'] + grades)), axis='columns')

    # remove unnecessary rows (total, campus - header rows) and rows without
    # a school name; remaining empty cells count as 0
    school_names = pdf['School']
    is_summary = school_names.str.contains('tot|campus', case=False, regex=True, na=False)
    pdf = pdf[school_names.notna() & ~is_summary].fillna(0).reset_index(drop=True)

    # get the report date from the filename; names that match neither date
    # layout are kept as plain text
    stem = filename.replace('.pdf', '').strip()
    stem = stem.replace(' -  last day of school', '').strip()
    if len(stem) == 10:
        report_date = datetime.datetime.strptime(stem, '%m-%d-%Y')
    elif len(stem) == 8:
        report_date = datetime.datetime.strptime(stem, '%m-%d-%y')
    else:
        report_date = stem

    # one frame per grade column, stacked: every PDF row yields 15 rows.
    # Taking the values straight from the PDF row (instead of looking them up
    # again by name) means names with stray whitespace keep their numbers.
    per_grade = [
        pd.DataFrame(
            {'School': pdf['School'], 'Date': report_date, 'Grade': grade, 'Enrolled': pdf[grade]},
            columns=['School', 'Date', 'Grade', 'Enrolled'],
        )
        for grade in grades
    ]
    df_one = pd.concat(per_grade, ignore_index=True)

    # sort by school name; the stable sort keeps the grade order within a school
    df_one = df_one.sort_values(by=['School'], kind='mergesort').reset_index(drop=True)

    # group is derived from the raw name, before the markers are stripped
    df_one['Group'] = df_one['School'].map(_enrollment_group)

    # change school names
    df_one['School'] = df_one['School'].str.strip().str.replace('/', ' ', regex=False)
    change_school_name(df_one)

    # every school is 'ES' unless school_type says otherwise
    df_one['Type'] = df_one['School'].map(school_type).fillna('ES')

    df_pdf.append(df_one)

# combine all files in one step instead of growing a frame file by file
output_columns = ['Date', 'School', 'Grade', 'Enrolled', 'Group', 'Type']
if df_pdf:
    df_one_complete = pd.concat(df_pdf, sort=False, ignore_index=True)[output_columns]
else:
    # no PDF found: keep an empty table with the expected columns
    df_one_complete = pd.DataFrame(columns=output_columns)

01-08-13.pdf.pdf
01-15-13.pdf.pdf
01-22-13.pdf.pdf
01-29-13.pdf.pdf
02-12-13.pdf.pdf
02-19-13.pdf.pdf
02-26-13.pdf.pdf
04-09-13.pdf.pdf
04-23-13.pdf.pdf
09-11-12.pdf.pdf
10-02-12.pdf.pdf
10-16-12.pdf.pdf
10-23-12.pdf.pdf
10-30-12.pdf.pdf
11-06-12.pdf.pdf
11-13-12.pdf.pdf
11-27-12.pdf.pdf
12-04-12.pdf.pdf
12-11-12.pdf.pdf
12-18-12.pdf.pdf


In [9]:
# Display the combined long-format table (one row per date, school and grade).
df_one_complete

Unnamed: 0,Date,School,Grade,Enrolled,Group,Type
0,2013-01-08,Anderson Mill,ECE,0,Regular,ES
1,2013-01-08,Anderson Mill,PK,28,Regular,ES
2,2013-01-08,Anderson Mill,K,40,Regular,ES
3,2013-01-08,Anderson Mill,1,45,Regular,ES
4,2013-01-08,Anderson Mill,2,44,Regular,ES
5,2013-01-08,Anderson Mill,3,59,Regular,ES
6,2013-01-08,Anderson Mill,4,41,Regular,ES
7,2013-01-08,Anderson Mill,5,52,Regular,ES
8,2013-01-08,Anderson Mill,6,0,Regular,ES
9,2013-01-08,Anderson Mill,7,0,Regular,ES


In [None]:
# Inspect the per-file frame left over from the last PDF the loop processed
# (this cell has no execution count, i.e. it was not run in the saved session).
df_one

In [12]:
# Distinct cleaned school names; useful for spotting names the cleaning step left as variants.
set(df_one_complete['School'])

{'Anderson Mill',
 'Berkman',
 'Blackland Prairie',
 'Bluebonnet',
 'Brushy Creek',
 'CD Fulkes',
 'Cactus Ranch',
 'Caldwell Heights',
 'Callison',
 'Canyon Creek',
 'Canyon Vista',
 'Caraway',
 'Cedar Ridge',
 'Cedar Valley',
 'Chandler Oaks',
 'Chisholm Trail',
 'DAEP',
 'Deep Wood',
 'Deerpark',
 'Double File Trail',
 'England',
 'Fern Bluff',
 'Forest Creek',
 'Forest North',
 'Gattis',
 'Great Oaks',
 'Grisham',
 'Hernandez',
 'Herrington',
 'Hopewell',
 'JJAEP LOTT',
 'Jollyville',
 'Laurel Mountain',
 'Live Oak',
 'McNeil',
 'Old Town',
 'Pond Springs',
 'Purple Sage',
 'Ridgeview',
 'Robertson',
 'Round Rock',
 'Round Rock Opportunity Center',
 'Sommer',
 'Spicewood',
 'Stony Point',
 'Success',
 'Teravista',
 'Union Hill',
 'Voigt',
 'Walsh',
 'Wells Branch',
 'Westwood'}

In [11]:
# Distinct enrollment groups that were assigned.
set(df_one_complete['Group'])

{'Bilingual', 'ESL', 'Non-LEP DL', 'Regular', 'Self Contained'}

In [10]:
# Distinct school types that were assigned.
set(df_one_complete['Type'])

{'ES', 'HS', 'MS', 'Other'}

In [13]:
# Write the combined table to a CSV file in the current working directory.
df_one_complete.to_csv('./pdf_1213.csv')