In [1]:
import tabula
import pandas as pd
import os
import datetime
import numpy as np

pd.options.display.max_rows = 2000
pd.options.display.max_columns= 2000

In [2]:
# change campus names
def change_school_name(df):
    
    # school name dictionary
    school_dict = {
        '\t\r': '',
        '-\xad': '',
        'Bilingual': '',
        'NON-LEP DL': '',
        'Non‐LEP DL': '',
        'LEP DL BIL': '',
        'ESL': '',
        'ESOL': '',
        'Regular': '',
        'Self Contained': '',
        'S C': '',
        'Chisholm Trail': 'Chisholm Trail',
        'Mt.': 'Mountain',
        'Tr.': 'Trail',
        
        'MIS': '',
        'ES': '',
        'MS': '',
        'HS': '',
        'High School': '',
        'Elementary School': '',
        'Middle School': '',
        'Elem School': '',
        'Elementary': '',
        'Elem: ': '',
        'EC': 'Early College',
        'Wm S Lott Juvenile Ctr': 'LOTT',
        'Wm. Lott': 'LOTT',
        'Lott': 'LOTT',
        'RROC': 'Round Rock Opportunity Center',
        'Daep': 'DAEP',
        'Deepwood': 'Deep Wood',
        'Liveoak': 'Live Oak',
        'Joe Lee': '',
        'Xenia': '',
        'Patsy': '',
        'Neysa': '',
        'Noel': '',
        'C.D.': 'CD',
        'C. D.': 'CD',
        'Claude': '',
        'Elsa': '',
        'James': '',
        'Kathy': '',
        'Linda': '',
        'Patsy': '',
        'RRISD': '',
        'Non-LEP DL': '',
        'Trailil': 'Trail',
    }
    
    # change keys(k) to values(v)
    for k, v in school_dict.items():
        df['School'] = df['School'].str.replace(k, v).str.strip()
    
    return None

In [3]:
school_type = {
    # list of middle schools
    'CD Fulkes': 'MS',
    'Canyon Vista': 'MS',
    'Cedar Valley': 'MS',
    'Chisholm Trail': 'MS',
    'Deerpark': 'MS',
    'Grisham': 'MS',
    'Hernandez': 'MS',
    'Hopewell': 'MS',
    'Pearson Ranch': 'MS',
    'Ridgeview': 'MS',
    'Walsh': 'MS',
    
    # list of high schools
    'Cedar Ridge': 'HS',
    'McNeil': 'HS',
    'Round Rock': 'HS',
    'Stony Point': 'HS',
    'Westwood': 'HS',
    'Success': 'HS',
    'Early College': 'HS',
    
    # other schools
    'Round Rock Opportunity Center': 'Other',
    'DAEP': 'Other',
    'JJAEP': 'Other',
    'LOTT': 'Other',
}

In [4]:
# path for files
path = os.getcwd() + '/data/pdf/1213/'

# file names
filenames = os.listdir(path)
filenames.sort()

pdf = pd.DataFrame()
df_pdf = []
grades = ['ECE', 'PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

for filename in filenames:
    if filename.endswith('pdf'):
        df_one = pd.DataFrame()
        print(filename)

        # read all pages from a pdf
        # pdf returns a list of dataframes
        # one dataframe for each page (pdf[0], pdf[1]...)
        # does not merge files don't have projected numbers
        pdf = tabula.read_pdf(path + filename, pages='all', multiple_tables=True, spreadsheet=True)

        # merge all dataframes into one    
        pdf = pd.concat(pdf, join='inner', axis=0)

        # get columns we want, rename columns
        pdf = pdf[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
        pdf = pdf.rename({0: 'School', 1: 'ECE', 2: 'PK', 3: 'K', 4: '1', 5: '2', 6: '3', 
                          7: '4', 8: '5', 9: '6', 10: '7', 11: '8', 12: '9', 13: '10',
                          14: '11', 15: '12'}, axis='columns')

        # remove unnecessary rows (total, campus - header rows)
        pdf = pdf[pdf['School'].str.contains('tot|campus', case=False, regex=True) != True]
        pdf = pdf[pdf['School'].isna() != True]
        pdf.fillna(0, inplace=True)


        # get dates from a filename
        filename = filename.replace('.pdf', '').strip()
        filename = filename.replace(' -  last day of school', '').strip()
        if len(filename) == 10:
            filename = datetime.datetime.strptime(filename, '%m-%d-%Y')
        elif len(filename) == 8:
            filename = datetime.datetime.strptime(filename, '%m-%d-%y')

        pdf.reset_index(drop=True, inplace=True)

        df_one['School'] = pdf['School']
        df_one['Date'] = filename
        # create 14 more rows for each school (so we can have 15 grades)
        df_one = df_one.append([df_one]*14, ignore_index=True)

        # sort by school name and reset index
        df_one = df_one.sort_values(by=['School'])
        df_one.reset_index(inplace=True, drop=True)

        for i in range(len(df_one)):

            # grades
            idx = i % 15
            df_one.loc[i, 'Grade'] = grades[idx]


        df_one['Enrolled'] = 0
        for i in range(len(df_one)):

            # enrollment
            school = df_one.loc[i, 'School']
            grade = df_one.loc[i, 'Grade']

            if not pdf[pdf['School'].str.strip() == school][grade].values.size == 0:
                df_one.loc[i, 'Enrolled'] = pdf[pdf['School'].str.strip() == school][grade].values[0]

        #  change types accordingly
        df_one['Group'] = 'Regular'

        # modify school names to the right format
        df_one['School'] = df_one['School'].str.replace('\t\r', '')
        df_one['School'] = df_one['School'].str.replace('  ', ' ')
        df_one['School'] = df_one['School'].str.replace('-­‐', '-')
        
        for i in range(len(df_one)):

            # group
            school = str(df_one.loc[i, 'School']).strip().lower()

            if 'bilingual' in school:
                df_one.loc[i, 'Group'] = 'Bilingual'
            elif 'non-lep dl' in school or 'non‐lep dl' in school:
                df_one.loc[i, 'Group'] = 'Non-LEP DL'
            elif 'lep dl bil' in school:
                df_one.loc[i, 'Group'] = 'LEP DL BIL'
            elif 'esol' in school:
                df_one.loc[i, 'Group'] = 'ESL'
            elif 'esl' in school:
                df_one.loc[i, 'Group'] = 'ESL'
            elif 'regular' in school:
                df_one.loc[i, 'Group'] = 'Regular'
            elif 's/c' in school:
                df_one.loc[i, 'Group'] = 'Self Contained'

        # change school names
        df_one['School'] = df_one['School'].str.strip().str.replace('/', ' ')
        change_school_name(df_one)

        # reset index for future use
        df_one.reset_index(inplace=True, drop=True)

        # set all schools to 'ES' (default)
        # then change types accordingly
        df_one['Type'] = 'ES'
        for i in range(len(df_one)):
            if df_one.loc[i, 'School'] in school_type.keys():
                df_one.loc[i, 'Type'] = school_type[df_one.loc[i, 'School']]    

        df_pdf.append(df_one)

df_one_complete = pd.DataFrame(columns=['Date', 'School', 'Grade', 'Enrolled', 'Group', 'Type'])
    
for i in range(len(df_pdf)):
    df_one_complete = df_one_complete.append(df_pdf[i], sort=False, ignore_index=True)

01-08-13.pdf.pdf
01-15-13.pdf.pdf
01-22-13.pdf.pdf
01-29-13.pdf.pdf
02-12-13.pdf.pdf
02-19-13.pdf.pdf
02-26-13.pdf.pdf
04-09-13.pdf.pdf
04-23-13.pdf.pdf
09-11-12.pdf.pdf
10-02-12.pdf.pdf
10-16-12.pdf.pdf
10-23-12.pdf.pdf
10-30-12.pdf.pdf
11-06-12.pdf.pdf
11-13-12.pdf.pdf
11-27-12.pdf.pdf
12-04-12.pdf.pdf
12-11-12.pdf.pdf
12-18-12.pdf.pdf


In [5]:
p1_school = set(df_one_complete['School'])

p1_group = set(df_one_complete['Group'])

p1_type = set(df_one_complete['Type'])

df_one_complete.to_csv('./pdf_1213.csv')

In [6]:
# path for files
path = os.getcwd() + '/data/pdf/1314/'

# file names
filenames = os.listdir(path)
filenames.sort()

pdf = pd.DataFrame()
df_pdf = []
grades = ['ECE', 'PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

for filename in filenames:
    if filename.endswith('pdf'):
        df_one = pd.DataFrame()
        print(filename)

        # read all pages from a pdf
        # pdf returns a list of dataframes
        # one dataframe for each page (pdf[0], pdf[1]...)
        # does not merge files don't have projected numbers
        pdf = tabula.read_pdf(path + filename, pages='all', multiple_tables=True, spreadsheet=True)

        # merge all dataframes into one    
        pdf = pd.concat(pdf, join='inner', axis=0)

        # get columns we want, rename columns
        pdf = pdf[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
        pdf = pdf.rename({0: 'School', 1: 'ECE', 2: 'PK', 3: 'K', 4: '1', 5: '2', 6: '3', 
                          7: '4', 8: '5', 9: '6', 10: '7', 11: '8', 12: '9', 13: '10',
                          14: '11', 15: '12'}, axis='columns')

        # remove unnecessary rows (total, campus - header rows)
        pdf = pdf[pdf['School'].str.contains('tot|campus', case=False, regex=True) != True]
        pdf = pdf[pdf['School'].isna() != True]
        pdf.fillna(0, inplace=True)


        # get dates from a filename
        filename = filename.replace('.pdf', '').strip()
        filename = filename.replace(' -  last day of school', '').strip()
        if len(filename) == 10:
            filename = datetime.datetime.strptime(filename, '%m-%d-%Y')
        elif len(filename) == 8:
            filename = datetime.datetime.strptime(filename, '%m-%d-%y')

        pdf.reset_index(drop=True, inplace=True)

        df_one['School'] = pdf['School']
        df_one['Date'] = filename
        # create 14 more rows for each school (so we can have 15 grades)
        df_one = df_one.append([df_one]*14, ignore_index=True)

        # sort by school name and reset index
        df_one = df_one.sort_values(by=['School'])
        df_one.reset_index(inplace=True, drop=True)

        for i in range(len(df_one)):

            # grades
            idx = i % 15
            df_one.loc[i, 'Grade'] = grades[idx]


        df_one['Enrolled'] = 0
        for i in range(len(df_one)):

            # enrollment
            school = df_one.loc[i, 'School']
            grade = df_one.loc[i, 'Grade']

            if not pdf[pdf['School'].str.strip() == school][grade].values.size == 0:
                df_one.loc[i, 'Enrolled'] = pdf[pdf['School'].str.strip() == school][grade].values[0]

        #  change types accordingly
        df_one['Group'] = 'Regular'
        
        # modify school names to the right format
        df_one['School'] = df_one['School'].str.replace('\t\r', '')
        df_one['School'] = df_one['School'].str.replace('  ', ' ')
        df_one['School'] = df_one['School'].str.replace('-­‐', '-')
        
        for i in range(len(df_one)):

            # group
            school = str(df_one.loc[i, 'School']).strip().lower()

            if 'bilingual' in school:
                df_one.loc[i, 'Group'] = 'Bilingual'
            elif 'non-lep dl' in school or 'non‐lep dl' in school:
                df_one.loc[i, 'Group'] = 'Non-LEP DL'
            elif 'lep dl bil' in school:
                df_one.loc[i, 'Group'] = 'LEP DL BIL'
            elif 'esol' in school:
                df_one.loc[i, 'Group'] = 'ESL'
            elif 'esl' in school:
                df_one.loc[i, 'Group'] = 'ESL'
            elif 'regular' in school:
                df_one.loc[i, 'Group'] = 'Regular'
            elif 's/c' in school:
                df_one.loc[i, 'Group'] = 'Self Contained'

        # change school names
        df_one['School'] = df_one['School'].str.strip().str.replace('/', ' ')
        change_school_name(df_one)

        # reset index for future use
        df_one.reset_index(inplace=True, drop=True)

        # set all schools to 'ES' (default)
        # then change types accordingly
        df_one['Type'] = 'ES'
        for i in range(len(df_one)):
            if df_one.loc[i, 'School'] in school_type.keys():
                df_one.loc[i, 'Type'] = school_type[df_one.loc[i, 'School']]    

        df_pdf.append(df_one)

df_two_complete = pd.DataFrame(columns=['Date', 'School', 'Grade', 'Enrolled', 'Group', 'Type'])
    
for i in range(len(df_pdf)):
    df_two_complete = df_two_complete.append(df_pdf[i], sort=False, ignore_index=True)

01-07-2014.pdf
01-14-2014.pdf
01-21-2014.pdf
01-29-2014 .pdf
02-04-2014 .pdf
02-11-2014.pdf
02-18-2014.pdf
02-25-2014.pdf
03-04-2014.pdf
03-18-2014.pdf
03-25-2014.pdf
04-01-2014.pdf
04-08-2014.pdf
04-15-2014 .pdf
04-22-2014.pdf
04-29-2014 .pdf
05-06-2014.pdf
05-13-2014.pdf
05-20-2014.pdf
05-27-2014.pdf


In [7]:
p2_school = set(df_two_complete['School'])

p2_group = set(df_two_complete['Group'])

p2_type = set(df_two_complete['Type'])

df_two_complete.to_csv('./pdf_1314.csv')

In [8]:
# path for files
path = os.getcwd() + '/data/pdf/1415/'

# file names
filenames = os.listdir(path)
filenames.sort()

pdf = pd.DataFrame()
df_pdf = []
grades = ['ECE', 'PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

for filename in filenames:
    if filename.endswith('pdf'):
        df_one = pd.DataFrame()
        print(filename)

        # read all pages from a pdf
        # pdf returns a list of dataframes
        # one dataframe for each page (pdf[0], pdf[1]...)
        # does not merge files don't have projected numbers
        pdf = tabula.read_pdf(path + filename, pages='all', multiple_tables=True, spreadsheet=True)

        # merge all dataframes into one    
        pdf = pd.concat(pdf, join='inner', axis=0)

        # get columns we want, rename columns
        pdf = pdf[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
        pdf = pdf.rename({0: 'School', 1: 'ECE', 2: 'PK', 3: 'K', 4: '1', 5: '2', 6: '3', 
                          7: '4', 8: '5', 9: '6', 10: '7', 11: '8', 12: '9', 13: '10',
                          14: '11', 15: '12'}, axis='columns')

        # remove unnecessary rows (total, campus - header rows)
        pdf = pdf[pdf['School'].str.contains('tot|campus', case=False, regex=True) != True]
        pdf = pdf[pdf['School'].isna() != True]
        pdf.fillna(0, inplace=True)


        # get dates from a filename
        filename = filename.replace('.pdf', '').strip()
        filename = filename.replace(' -  last day of school', '').strip()
        if len(filename) == 10:
            filename = datetime.datetime.strptime(filename, '%m-%d-%Y')
        elif len(filename) == 8:
            filename = datetime.datetime.strptime(filename, '%m-%d-%y')

        pdf.reset_index(drop=True, inplace=True)
        
        df_one['School'] = pdf['School']
        df_one['Date'] = filename

        # create 14 more rows for each school (so we can have 15 grades)
        df_one = df_one.append([df_one]*14, ignore_index=True)

        # sort by school name and reset index
        df_one = df_one.sort_values(by=['School'])
        df_one.reset_index(inplace=True, drop=True)

        for i in range(len(df_one)):

            # grades
            idx = i % 15
            df_one.loc[i, 'Grade'] = grades[idx]


        df_one['Enrolled'] = 0
        for i in range(len(df_one)):

            # enrollment
            school = df_one.loc[i, 'School']
            grade = df_one.loc[i, 'Grade']

            if not pdf[pdf['School'].str.strip() == school][grade].values.size == 0:
                df_one.loc[i, 'Enrolled'] = pdf[pdf['School'].str.strip() == school][grade].values[0]

        # change types accordingly
        df_one['Group'] = 'Regular'
        
        # modify school names to the right format
        df_one['School'] = df_one['School'].str.replace('\t\r', '')
        df_one['School'] = df_one['School'].str.replace('  ', ' ')
        df_one['School'] = df_one['School'].str.replace('-­‐', '-')


        for i in range(len(df_one)):

            # group
            school = str(df_one.loc[i, 'School']).strip().lower()

            if 'bilingual' in school:
                df_one.loc[i, 'Group'] = 'Bilingual'
            elif 'non-lep dl' in school or 'non‐lep dl' in school:
                df_one.loc[i, 'Group'] = 'Non-LEP DL'
            elif 'lep dl bil' in school:
                df_one.loc[i, 'Group'] = 'LEP DL BIL'
            elif 'esol' in school:
                df_one.loc[i, 'Group'] = 'ESL'
            elif 'esl' in school:
                df_one.loc[i, 'Group'] = 'ESL'
            elif 'regular' in school:
                df_one.loc[i, 'Group'] = 'Regular'
            elif 's/c' in school:
                df_one.loc[i, 'Group'] = 'Self Contained'
            elif 'self contained' in school:
                df_one.loc[i, 'Group'] = 'Self Contained'

        # change school names
        df_one['School'] = df_one['School'].str.strip().str.replace('/', ' ')
        df_one['School'] = df_one['School'].str.strip().str.replace('  ', ' ')
        change_school_name(df_one)

        # reset index for future use
        df_one.reset_index(inplace=True, drop=True)

        # set all schools to 'ES' (default)
        # then change types accordingly
        df_one['Type'] = 'ES'
        for i in range(len(df_one)):
            if df_one.loc[i, 'School'] in school_type.keys():
                df_one.loc[i, 'Type'] = school_type[df_one.loc[i, 'School']]    

        df_pdf.append(df_one)

df_three_complete = pd.DataFrame(columns=['Date', 'School', 'Grade', 'Enrolled', 'Group', 'Type'])
    
for i in range(len(df_pdf)):
    df_three_complete = df_three_complete.append(df_pdf[i], sort=False, ignore_index=True)

01-06-2015.pdf
01-13-2015.pdf
01-20-2015.pdf
01-27-2015.pdf
02-03-2015.pdf
02-10-2015.pdf
02-17-15.pdf
03-03-2015.pdf
03-10-2015.pdf
03-24-2015.pdf
03-31-2015.pdf
04-07-2015.pdf
04-14-2015.pdf
04-21-2015.pdf
04-28-2015.pdf
05-05-2015.pdf
05-12-2015.pdf
05-19-2015.pdf
05-26-2015.pdf
08-26-14.pdf
09-02-14.pdf
09-09-2014.pdf
09-16-2014.pdf
09-23-2014.pdf
09-30-2014.pdf
10-08-14.pdf
10-15-14.pdf
10-22-14.pdf
10-28-14.pdf
11-04-14.pdf
11-11-14.pdf
11-18-14.pdf
11-25-14.pdf
12-02-14.pdf
12-09-14.pdf
12-16-14.pdf


In [None]:
df_three_complete[df_three_complete['Group'] == 'Non-LEP DL']

In [None]:
df_three_complete[df_three_complete['Date']=='2014-09-16']

In [9]:
p3_school = set(df_three_complete['School'])

p3_group = set(df_three_complete['Group'])

p3_type = set(df_three_complete['Type'])

df_three_complete.to_csv('./pdf_1415.csv')

In [10]:
df_three_complete['Group'].value_counts()

Regular           28590
ESL               17325
Self Contained    14520
Non-LEP DL         6300
Bilingual          6300
Name: Group, dtype: int64

In [11]:
# path for files
path = os.getcwd() + '/data/pdf/1516/'

# file names
filenames = os.listdir(path)
filenames.sort()

pdf = pd.DataFrame()
df_pdf = []
grades = ['ECE', 'PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

for filename in filenames:
    if filename.endswith('pdf'):
        df_one = pd.DataFrame()
        print(filename)

        # read all pages from a pdf
        # pdf returns a list of dataframes
        # one dataframe for each page (pdf[0], pdf[1]...)
        # does not merge files don't have projected numbers
        pdf = tabula.read_pdf(path + filename, pages='all', multiple_tables=True, spreadsheet=True)

        # merge all dataframes into one    
        pdf = pd.concat(pdf, join='inner', axis=0)

        # get columns we want, rename columns
        pdf = pdf[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
        pdf = pdf.rename({0: 'School', 1: 'ECE', 2: 'PK', 3: 'K', 4: '1', 5: '2', 6: '3', 
                          7: '4', 8: '5', 9: '6', 10: '7', 11: '8', 12: '9', 13: '10',
                          14: '11', 15: '12'}, axis='columns')

        # remove unnecessary rows (total, campus - header rows)
        pdf = pdf[pdf['School'].str.contains('tot|campus', case=False, regex=True) != True]
        pdf = pdf[pdf['School'].isna() != True]
        pdf.fillna(0, inplace=True)


        # get dates from a filename
        filename = filename.replace('.pdf', '').strip()
        filename = filename.replace(' -  last day of school', '').strip()
        if len(filename) == 10:
            filename = datetime.datetime.strptime(filename, '%m-%d-%Y')
        elif len(filename) == 8:
            filename = datetime.datetime.strptime(filename, '%m-%d-%y')

        pdf.reset_index(drop=True, inplace=True)

        df_one['School'] = pdf['School']
        df_one['Date'] = filename
        # create 14 more rows for each school (so we can have 15 grades)
        df_one = df_one.append([df_one]*14, ignore_index=True)

        # sort by school name and reset index
        df_one = df_one.sort_values(by=['School'])
        df_one.reset_index(inplace=True, drop=True)

        for i in range(len(df_one)):

            # grades
            idx = i % 15
            df_one.loc[i, 'Grade'] = grades[idx]


        df_one['Enrolled'] = 0
        for i in range(len(df_one)):

            # enrollment
            school = df_one.loc[i, 'School']
            grade = df_one.loc[i, 'Grade']

            if not pdf[pdf['School'].str.strip() == school][grade].values.size == 0:
                df_one.loc[i, 'Enrolled'] = pdf[pdf['School'].str.strip() == school][grade].values[0]

        #  change types accordingly
        df_one['Group'] = 'Regular'
        
        # modify school names to the right format
        df_one['School'] = df_one['School'].str.replace('\t\r', '')
        df_one['School'] = df_one['School'].str.replace('  ', ' ')
        df_one['School'] = df_one['School'].str.replace('-­‐', '-')
        
        for i in range(len(df_one)):

            # group
            school = str(df_one.loc[i, 'School']).strip().lower()
            if 'bilingual' in school:
                df_one.loc[i, 'Group'] = 'Bilingual'
            elif 'non-lep dl' in school or 'non‐lep dl' in school:
                df_one.loc[i, 'Group'] = 'Non-LEP DL'
            elif 'lep dl bil' in school:
                df_one.loc[i, 'Group'] = 'LEP DL BIL'
            elif 'esol' in school:
                df_one.loc[i, 'Group'] = 'ESL'
            elif 'esl' in school:
                df_one.loc[i, 'Group'] = 'ESL'
            elif 'regular' in school:
                df_one.loc[i, 'Group'] = 'Regular'
            elif 's/c' in school:
                df_one.loc[i, 'Group'] = 'Self Contained'
            elif 'self contained' in school:
                df_one.loc[i, 'Group'] = 'Self Contained'

        # change school names
        df_one['School'] = df_one['School'].str.strip().str.replace('/', ' ')
        df_one['School'] = df_one['School'].str.strip().str.replace('  ', ' ')
        change_school_name(df_one)

        # reset index for future use
        df_one.reset_index(inplace=True, drop=True)

        # set all schools to 'ES' (default)
        # then change types accordingly
        df_one['Type'] = 'ES'
        for i in range(len(df_one)):
            if df_one.loc[i, 'School'] in school_type.keys():
                df_one.loc[i, 'Type'] = school_type[df_one.loc[i, 'School']]    

        df_pdf.append(df_one)

df_four_complete = pd.DataFrame(columns=['Date', 'School', 'Grade', 'Enrolled', 'Group', 'Type'])
    
for i in range(len(df_pdf)):
    df_four_complete = df_four_complete.append(df_pdf[i], sort=False, ignore_index=True)

01-05-2016.pdf
01-12-2016.pdf
01-19-2016.pdf
01-26-2016.pdf
02-02-2016.pdf
02-09-2016.pdf
02-16-2016.pdf
02-23-2016.pdf
03-01-2016.pdf
03-08-2016.pdf
03-22-2016.pdf
03-29-2016.pdf
04-06-2016.pdf
04-12-2016.pdf
04-19-2016.pdf
04-26-2016.pdf
05-03-2016.pdf
05-10-2016.pdf
05-17-2016.pdf
05-24-2016.pdf
06-02-2016 -  last day of school.pdf
09-22-2015.pdf
10-27-2015.pdf
11-03-2015.pdf
11-10-2015.pdf
11-17-2015.pdf
12-01-2015.pdf
12-08-2015.pdf
12-15-2015.pdf


In [12]:
p4_school = set(df_four_complete['School'])

p4_group = set(df_four_complete['Group'])

p4_type = set(df_four_complete['Type'])

df_four_complete.to_csv('./pdf_1516.csv')

In [15]:
df = df_one_complete.append([df_two_complete, df_three_complete, df_four_complete, df_five_complete], sort=False)
df.reset_index(drop=True, inplace=True)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267060 entries, 0 to 267059
Data columns (total 6 columns):
Date        267060 non-null datetime64[ns]
School      267060 non-null object
Grade       267060 non-null object
Enrolled    267060 non-null object
Group       267060 non-null object
Type        267060 non-null object
dtypes: datetime64[ns](1), object(5)
memory usage: 12.2+ MB


In [17]:
df['Enrolled'] = df['Enrolled'].astype(int)

In [18]:
df.to_csv('./pdf.csv')

In [19]:
set(df['Group'])

{'Bilingual', 'ESL', 'Non-LEP DL', 'Regular', 'Self Contained'}

In [20]:
df['Group'].value_counts()

Regular           106950
ESL                65160
Self Contained     45645
Non-LEP DL         25545
Bilingual          23760
Name: Group, dtype: int64

In [13]:
# path for files
path = os.getcwd() + '/data/pdf/etc/'

# file names
filenames = os.listdir(path)
filenames.sort()

pdf = pd.DataFrame()
df_pdf = []
grades = ['ECE', 'PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

for filename in filenames:
    if filename.endswith('pdf'):
        df_one = pd.DataFrame()
        print(filename)

        # read all pages from a pdf
        # pdf returns a list of dataframes
        # one dataframe for each page (pdf[0], pdf[1]...)
        # does not merge files don't have projected numbers
        pdf = tabula.read_pdf(path + filename, pages='all', multiple_tables=True, spreadsheet=True)

        # merge all dataframes into one    
        pdf = pd.concat(pdf, join='inner', axis=0)

        # get columns we want, rename columns
        pdf = pdf[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
        pdf = pdf.rename({0: 'School', 1: 'ECE', 2: 'PK', 3: 'K', 4: '1', 5: '2', 6: '3', 
                          7: '4', 8: '5', 9: '6', 10: '7', 11: '8', 12: '9', 13: '10',
                          14: '11', 15: '12'}, axis='columns')

        # remove unnecessary rows (total, campus - header rows)
        pdf = pdf[pdf['School'].str.contains('tot|campus', case=False, regex=True) != True]
        pdf = pdf[pdf['School'].isna() != True]
        pdf.fillna(0, inplace=True)


        # get dates from a filename
        filename = filename.replace('.pdf', '').strip()
        filename = filename.replace(' -  last day of school', '').strip()
        if len(filename) == 10:
            filename = datetime.datetime.strptime(filename, '%m-%d-%Y')
        elif len(filename) == 8:
            filename = datetime.datetime.strptime(filename, '%m-%d-%y')

        pdf.reset_index(drop=True, inplace=True)

        df_one['School'] = pdf['School'].str.replace('\t\r','').str.replace('  ',' ').str.replace('-­‐', '-')
        df_one['Date'] = filename
        # create 14 more rows for each school (so we can have 15 grades)
        df_one = df_one.append([df_one]*14, ignore_index=True)

        # sort by school name and reset index
        df_one = df_one.sort_values(by=['School'])
        df_one.reset_index(inplace=True, drop=True)

        for i in range(len(df_one)):

            # grades
            idx = i % 15
            df_one.loc[i, 'Grade'] = grades[idx]


        df_one['Enrolled'] = 0
        for i in range(len(df_one)):

            # enrollment
            school = df_one.loc[i, 'School']
            grade = df_one.loc[i, 'Grade']

            if not pdf[pdf['School'].str.strip() == school][grade].values.size == 0:
                df_one.loc[i, 'Enrolled'] = pdf[pdf['School'].str.strip() == school][grade].values[0]

        #  change types accordingly
        df_one['Group'] = 'Regular'

        # modify school names to the right format
        df_one['School'] = df_one['School'].str.replace('\t\r', '')
        df_one['School'] = df_one['School'].str.replace('  ', ' ')
        df_one['School'] = df_one['School'].str.replace('-­‐', '-')
        
        for i in range(len(df_one)):

            # group
            school = str(df_one.loc[i, 'School']).strip().lower()
            if 'bilingual' in school:
                df_one.loc[i, 'Group'] = 'Bilingual'
            elif 'non-lep dl' in school or 'non‐lep dl' in school:
                df_one.loc[i, 'Group'] = 'Non-LEP DL'
            elif 'lep dl bil' in school:
                df_one.loc[i, 'Group'] = 'LEP DL BIL'
            elif 'esol' in school:
                df_one.loc[i, 'Group'] = 'ESL'
            elif 'esl' in school:
                df_one.loc[i, 'Group'] = 'ESL'
            elif 'regular' in school:
                df_one.loc[i, 'Group'] = 'Regular'
            elif 's/c' in school:
                df_one.loc[i, 'Group'] = 'Self Contained'
            elif 'self contained' in school:
                df_one.loc[i, 'Group'] = 'Self Contained'

        # change school names
        df_one['School'] = df_one['School'].str.strip().str.replace('/', ' ')
        df_one['School'] = df_one['School'].str.strip().str.replace('  ', ' ')
        change_school_name(df_one)

        # reset index for future use
        df_one.reset_index(inplace=True, drop=True)

        # set all schools to 'ES' (default)
        # then change types accordingly
        df_one['Type'] = 'ES'
        for i in range(len(df_one)):
            if df_one.loc[i, 'School'] in school_type.keys():
                df_one.loc[i, 'Type'] = school_type[df_one.loc[i, 'School']]    

        df_pdf.append(df_one)

df_five_complete = pd.DataFrame(columns=['Date', 'School', 'Grade', 'Enrolled', 'Group', 'Type'])
    
for i in range(len(df_pdf)):
    df_five_complete = df_four_complete.append(df_pdf[i], sort=False, ignore_index=True)

08-25-2015.pdf
09-01-2015.pdf
09-08-2015.pdf
09-15-2015.pdf
09-29-2015.pdf
10-06-2015.pdf
10-13-2015.pdf
10-20-2015.pdf


In [14]:
p5_school = set(df_five_complete['School'])

p5_group = set(df_five_complete['Group'])

p5_type = set(df_five_complete['Type'])

df_five_complete.to_csv('./pdf_etc.csv')

In [None]:
p5_group

In [None]:
df_five_complete