In [1]:
import pandas as pd
import os

pd.options.display.max_rows = 200
pd.options.display.max_columns = 10000

In [2]:
# change campus names
def change_school_name(df):
    """Normalize the campus names stored in ``df['School']`` in place.

    Every (substring -> replacement) rule of ``school_dict`` is applied to
    each cell in the order listed, and the cell is stripped of surrounding
    whitespace after every rule. Matching is a case-sensitive plain substring
    match, so the rule order matters (e.g. 'ESL' is removed before 'ES').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'School' column; the column is overwritten.
        Cells that are not strings (e.g. NaN) are left unchanged.

    Returns
    -------
    None
        The frame is modified in place.
    """

    # (substring -> replacement) rules, applied top to bottom
    school_dict = {
        # program / group labels embedded in the campus name
        'Bilingual': '',
        'Non-LEP DL': '',
        'NON-LEP DL': '',
        'LEP DL BIL': '',
        'ESL': '',
        'ESOL': '',
        'Regular': '',
        'Self Contained': '',

        # abbreviations
        'Mt.': 'Mountain',
        'Tr.': 'Trail',

        # school level suffixes
        'MIS': '',
        'ES': '',
        'MS': '',
        'HS': '',
        'High School': '',
        'Elementary School': '',
        'Middle School': '',
        'Elem School': '',
        'Elementary': '',
        'Elem: ': '',

        # campus specific spellings
        'EC': 'Early College',
        'Wm S Lott Juvenile Ctr': 'LOTT',
        'Wm. Lott': 'LOTT',
        'Lott': 'LOTT',
        'RROC': 'Round Rock Opportunity Center',
        'Daep': 'DAEP',
        'Deepwood': 'Deep Wood',
        'Liveoak': 'Live Oak',
        # NOTE(review): a plain substring rule 'Johnson' -> 'Joe Lee Johnson'
        # would also rewrite names that already read 'Joe Lee Johnson', so it
        # stays disabled; merging the two spellings needs an exact-match rule.
        #'Johnson': 'Joe Lee Johnson',
        'Xenia': '',
        'Patsy': '',
        'Neysa': '',
        'Noel': '',
        'C.D.': 'CD',
        'C. D.': 'CD',
        'Claude': '',
        'Elsa': '',
        'James': '',
        'Kathy': '',
        'Linda': '',
        'RRISD': '',
    }

    def _normalize(cell):
        # leave NaN / numeric cells alone instead of raising AttributeError
        if not isinstance(cell, str):
            return cell
        for old, new in school_dict.items():
            cell = cell.replace(old, new).strip()
        return cell

    # one pass over the column, applying every rule to each cell in order
    df['School'] = df['School'].map(_normalize)

    return None

In [3]:
# Campus type by normalized school name (the names produced by
# change_school_name). Campuses that are not listed are treated as
# elementary schools ('ES') by the loading loops.
school_type = {
    # list of middle schools
    # change_school_name rewrites 'C.D.' / 'C. D.' to 'CD', so the normalized
    # spelling must be listed; the dotted key is kept for backward compatibility.
    'CD Fulkes': 'MS',
    'C.D. Fulkes': 'MS',
    'Canyon Vista': 'MS',
    'Cedar Valley': 'MS',
    'Chisholm Trail': 'MS',
    'Deerpark': 'MS',
    'Grisham': 'MS',
    'Hernandez': 'MS',
    'Hopewell': 'MS',
    'Pearson Ranch': 'MS',
    'Ridgeview': 'MS',
    'Walsh': 'MS',

    # list of high schools
    'Cedar Ridge': 'HS',
    'McNeil': 'HS',
    'Round Rock': 'HS',
    'Stony Point': 'HS',
    'Westwood': 'HS',
    'Success': 'HS',
    'Early College': 'HS',

    # other schools
    'Round Rock Opportunity Center': 'Other',
    'DAEP': 'Other',
    'JJAEP': 'Other',
    'LOTT': 'Other',
}

In [4]:
# Input folder of the first batch of workbooks
# (excel files from 2016-08-23 to 2018-05-29)
path_one = '{}/data/excel/one/'.format(os.getcwd())

# collects one cleaned dataframe per excel file
df_excel = list()

# grade column labels used by this batch
grades = ['ECE', 'PK', 'K'] + [str(level) for level in range(1, 13)]

# workbook names, sorted by name
filenames_one = sorted(os.listdir(path_one))

# (marker, label) pairs checked in this order against the lower-cased campus
# name; the first marker found decides the group, 'Regular' is the default.
group_markers = [
    ('bilingual', 'Bilingual'),
    ('non-lep dl', 'Non-LEP DL'),
    ('esol', 'ESOL'),
    ('esl', 'ESL'),
    ('regular', 'Regular'),
    ('self contained', 'Self Contained'),
]

# Build one long-format frame (School, Date, Grade, Enrolled, Group, Type)
# per workbook and collect the frames in df_excel.
for filename in filenames_one:

    # xlsx and xls files only.
    # `'xlsx' or 'xls'` evaluates to just 'xlsx', which silently skipped
    # every .xls file; str.endswith needs a tuple to test several suffixes.
    if not filename.endswith(('xlsx', 'xls')):
        continue
    print(filename)

    # get the first sheet in an excel file (column headers are on row 3)
    temp = pd.read_excel(path_one + filename, header=2)
    temp = temp.fillna(0)

    # change column names to strings since grades from 1-12 are int
    temp.columns = [str(x) for x in temp.columns]

    # stripped campus names; cells that are not text become NaN
    campus = temp['CAMPUS'].str.strip()

    # sheet position of the first row of every campus. The lookup is keyed on
    # the stripped name, so names padded with whitespace in the sheet still
    # find their enrollment (the raw column was compared before).
    first_row = {}
    for pos, name in enumerate(campus):
        if isinstance(name, str):
            first_row.setdefault(name, pos)

    # a missing grade column raises KeyError here, as it did before
    grade_values = {grade: temp[grade].tolist() for grade in grades}

    # one row per (campus, grade), campuses in sorted order; rows without a
    # text campus name are skipped (they used to be removed by dropna later)
    schools = sorted(name for name in campus if isinstance(name, str))
    rows = [(school, grade) for school in schools for grade in grades]
    df_one = pd.DataFrame(rows, columns=['School', 'Grade'])

    # date column: the file name starts with the report date,
    # some file names use '.' instead of '-'
    df_one.insert(1, 'Date', filename[:10].replace('.', '-'))
    df_one['Date'] = pd.to_datetime(df_one['Date'])

    # enrollment, taken from the first sheet row of the campus
    df_one['Enrolled'] = [
        grade_values[grade][first_row[school]] for school, grade in rows
    ]

    # group, derived from the program label embedded in the campus name
    df_one['Group'] = [
        next((label for marker, label in group_markers
              if marker in school.lower()), 'Regular')
        for school in df_one['School']
    ]

    # remove rows that aren't schools (names containing 'tota' or 'campus');
    # .copy() so that the later column assignments act on an independent frame
    is_summary = df_one['School'].str.contains('tota|campus', case=False, regex=True)
    df_one = df_one[~is_summary].copy()

    # change school names
    df_one['School'] = df_one['School'].str.strip().str.replace('/', ' ', regex=False)
    change_school_name(df_one)

    # reset index for future use
    df_one = df_one.reset_index(drop=True)

    # campus type from school_type, 'ES' (default) when the campus is not listed
    df_one['Type'] = df_one['School'].map(school_type).fillna('ES')

    # append each cleaned dataframe (sheet) to the list
    df_excel.append(df_one)

# Stack the per-workbook frames in one call instead of growing a frame inside
# a loop: DataFrame.append copied the whole frame on every call and was
# removed in pandas 2.0.
complete_columns = ['Date', 'School', 'Grade', 'Enrolled', 'Group', 'Type']

if df_excel:
    stacked = pd.concat(df_excel, sort=False, ignore_index=True)
    # listed columns first (in this order), any other column after them
    extra_columns = [col for col in stacked.columns if col not in complete_columns]
    df_one_complete = stacked.reindex(columns=complete_columns + extra_columns)
else:
    # nothing was loaded: keep an empty frame with the expected columns
    df_one_complete = pd.DataFrame(columns=complete_columns)

2016-08-23 enrollment.xls.xlsx
2016-08.30 enrollment.xls.xlsx
2016-09-06 enrollment.xls.xlsx
2016-09-13 enrollment.xls.xlsx
2016-09-20 enrollment.xls.xlsx
2016-09-27 enrollment.xls.xlsx
2016-10-04 enrollment.xls.xlsx
2016-10-11 enrollment.xls.xlsx
2016-10-18 enrollment.xls.xlsx
2016-10-25 enrollment.xls.xlsx
2016-11-01 enrollment.xls.xlsx
2016-11-08 enrollment.xls.xlsx
2016-11-15 enrollment.xls.xlsx
2016-11-29 enrollment.xls.xlsx
2016-12-06 enrollment.xls.xlsx
2016-12-13 enrollment.xls.xlsx
2017-01-10 enrollment.xls.xlsx
2017-01-17 enrollment.xls.xlsx
2017-01-24 enrollment.xls.xlsx
2017-01-31 enrollment.xls.xlsx
2017-02-07 enrollment.xls.xlsx
2017-02-14 enrollment.xls.xlsx
2017-02-21 enrollment.xls.xlsx
2017-02-28 enrollment.xls.xlsx
2017-03-07 enrollment.xls.xlsx
2017-03-21 enrollment.xls.xlsx
2017-03-28 enrollment.xls.xlsx
2017-04-04 enrollment.xls.xlsx
2017-04-11 enrollment.xls.xlsx
2017-04-18 enrollment.xls.xlsx
2017-04-25 enrollment.xls.xlsx
2017-05-02 enrollment.xls.xlsx
2017-05-

In [5]:
# distinct school names of the first batch after renaming (sanity check)
# NOTE(review): the saved output lists names such as 'C. D. Fulkes' and
# 'Deepwood' that the current school_dict would rewrite -- it looks stale,
# re-run the notebook to confirm.
set(df_one_complete['School'])

{'Anderson Mill',
 'Berkman',
 'Blackland Prairie',
 'Bluebonnet',
 'Brushy Creek',
 'C. D. Fulkes',
 'Cactus Ranch',
 'Caldwell Heights',
 'Callison',
 'Canyon Creek',
 'Canyon Vista',
 'Caraway',
 'Cedar Ridge',
 'Cedar Valley',
 'Chandler Oaks',
 'Chisholm Trail',
 'DAEP',
 'Deepwood',
 'Deerpark',
 'Double File Trail',
 'Early College',
 'England',
 'Fern Bluff',
 'Forest Creek',
 'Forest North',
 'Gattis',
 'Great Oaks',
 'Grisham',
 'Hernandez',
 'Herrington',
 'Hopewell',
 'JJAEP',
 'Joe Lee Johnson',
 'Johnson',
 'Jollyville',
 'LOTT',
 'Laurel Mountain',
 'Live Oak',
 'McNeil',
 'Old Town',
 'Pearson Ranch',
 'Pond Springs',
 'Purple Sage',
 'Ridgeview',
 'Robertson',
 'Round Rock',
 'Round Rock Opportunity Center',
 'Sommer',
 'Spicewood',
 'Stony Point',
 'Success',
 'Teravista',
 'Union Hill',
 'Voigt',
 'Walsh',
 'Wells Branch',
 'Westwood'}

In [6]:
# write the first batch to disk; with the default settings the row index is
# written too, as an unnamed first column
df_one_complete.to_csv('./part1.csv')

In [7]:
# column dtypes and non-null counts of the first batch
df_one_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161685 entries, 0 to 161684
Data columns (total 6 columns):
Date        161685 non-null datetime64[ns]
School      161685 non-null object
Grade       161685 non-null object
Enrolled    161685 non-null float64
Group       161685 non-null object
Type        161685 non-null object
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 7.4+ MB


In [8]:
# short group codes -> full group names; applied to the 'Group' column of the
# second batch (presumably for workbooks that use the abbreviated labels)
group_dict = dict(
    Bil='Bilingual',
    DL='Non-LEP DL',
    Reg='Regular',
)

In [9]:
# Input folder of the second batch of workbooks
# (excel files after 2018-05-29 ~ 11/16)
path_two = '{}/data/excel/two/'.format(os.getcwd())

# collects one cleaned dataframe per excel file
df_excel = list()

# grade column labels used by this batch (zero-padded)
grades = ['EE', 'PK', 'KG'] + ['{:02d}'.format(level) for level in range(1, 13)]

# workbook names, sorted by name
filenames_two = sorted(os.listdir(path_two))

# Build one long-format frame (School, Date, Group, Grade, Enrolled, Type)
# per workbook and collect the frames in df_excel.
for filename in filenames_two:

    # xlsx and xls files only.
    # `'xlsx' or 'xls'` evaluates to just 'xlsx', which silently skipped
    # every .xls file; str.endswith needs a tuple to test several suffixes.
    if not filename.endswith(('xlsx', 'xls')):
        continue
    print(filename)

    # get the first sheet in an excel file (column headers are on row 3)
    temp = pd.read_excel(path_two + filename, header=2)

    # Fill by assignment: fillna(..., inplace=True) on an iloc slice works on
    # a temporary object and is not guaranteed to write back into `temp`.
    # First column: carry the campus name down over the blank cells below it.
    school_label = temp.columns[0]
    temp[school_label] = temp[school_label].ffill()
    # Columns 1..15: blanks become 0.
    # NOTE(review): if the sheet is laid out as school, group, then 15 grade
    # columns, the last grade column (position 16) is not covered by this
    # slice -- confirm against a workbook.
    filled_labels = temp.columns[1:16]
    temp[filled_labels] = temp[filled_labels].fillna(0)

    # stripped campus / group labels; cells that are not text become NaN
    school_col = temp.iloc[:, 0].str.strip()
    group_col = temp.iloc[:, 1].str.strip()

    # (school, group) of every sheet row that has both labels
    labelled = [
        (pos, school, group)
        for pos, (school, group) in enumerate(zip(school_col, group_col))
        if isinstance(school, str) and isinstance(group, str)
    ]

    # sheet position of the first row of every (school, group) pair
    first_row = {}
    for pos, school, group in labelled:
        first_row.setdefault((school, group), pos)

    # a missing grade column raises KeyError here, as it did before
    grade_values = {grade: temp[grade].tolist() for grade in grades}

    # one row per (school, group, grade), sorted by school then group; sheet
    # rows without a school or group label are skipped (they used to be
    # removed by dropna later)
    pairs = sorted((school, group) for _, school, group in labelled)
    rows = [(school, group, grade) for school, group in pairs for grade in grades]
    df_two = pd.DataFrame(rows, columns=['School', 'Group', 'Grade'])

    # date column: the file name starts with the report date
    # ('.' separators are accepted like in the first batch)
    df_two.insert(1, 'Date', filename[:10].replace('.', '-'))
    df_two['Date'] = pd.to_datetime(df_two['Date'])

    # enrollment, taken from the first sheet row of the (school, group) pair
    df_two['Enrolled'] = [
        grade_values[grade][first_row[(school, group)]]
        for school, group, grade in rows
    ]

    # rows whose enrollment is still missing are dropped
    df_two = df_two.dropna()

    # remove rows that aren't schools (names containing 'tota' or 'campus');
    # .copy() so that the later column assignments act on an independent frame
    is_summary = df_two['School'].str.contains('tota|campus', case=False, regex=True)
    df_two = df_two[~is_summary].copy()
    df_two = df_two.reset_index(drop=True)

    # change school names
    change_school_name(df_two)

    # campus type from school_type, 'ES' (default) when the campus is not listed
    df_two['Type'] = df_two['School'].map(school_type).fillna('ES')

    # 2018-09-11, different format: expand the short group codes
    df_two['Group'] = df_two['Group'].map(lambda code: group_dict.get(code, code))

    # append each cleaned dataframe (sheet) to the list
    df_excel.append(df_two)

# Stack the per-workbook frames in one call instead of growing a frame inside
# a loop: DataFrame.append copied the whole frame on every call and was
# removed in pandas 2.0.
complete_columns = ['Date', 'School', 'Grade', 'Enrolled', 'Group', 'Type']

if df_excel:
    stacked = pd.concat(df_excel, sort=False, ignore_index=True)
    # listed columns first (in this order), any other column after them
    extra_columns = [col for col in stacked.columns if col not in complete_columns]
    df_two_complete = stacked.reindex(columns=complete_columns + extra_columns)
else:
    # nothing was loaded: keep an empty frame with the expected columns
    df_two_complete = pd.DataFrame(columns=complete_columns)

2018-08-28 Enrollment Report.xls.xlsx
2018-09-04 Enrollment Report.xls.xlsx
2018-09-11 Enrollment Report.xls.xlsx
2018-09-18 Enrollment Report.xls.xlsx
2018-09-25 Enrollment Report.xls.xlsx
2018-10-02 Enrollment Report.xls.xlsx
2018-10-09 Enrollment Report.xls.xlsx
2018-10-16 Enrollment Report.xls.xlsx
2018-10-23 Enrollment Report.xls.xlsx
2018-10-30 Enrollment Report.xls.xlsx
2018-11-06 Enrollment Report.xls.xlsx
2018-11-16 Enrollment Report.xls.xlsx


In [10]:
# preview only: show the first rows instead of rendering the whole frame
df_two_complete.head(10)

Unnamed: 0,Date,School,Grade,Enrolled,Group,Type
0,2018-08-28,Anderson Mill,EE,0.0,Bilingual,ES
1,2018-08-28,Anderson Mill,PK,5.0,Bilingual,ES
2,2018-08-28,Anderson Mill,KG,10.0,Bilingual,ES
3,2018-08-28,Anderson Mill,01,14.0,Bilingual,ES
4,2018-08-28,Anderson Mill,02,10.0,Bilingual,ES
5,2018-08-28,Anderson Mill,03,13.0,Bilingual,ES
6,2018-08-28,Anderson Mill,04,11.0,Bilingual,ES
7,2018-08-28,Anderson Mill,05,9.0,Bilingual,ES
8,2018-08-28,Anderson Mill,06,0.0,Bilingual,ES
9,2018-08-28,Anderson Mill,07,0.0,Bilingual,ES


In [11]:
# distinct group labels of the second batch (checks the group_dict expansion)
set(df_two_complete['Group'])

{'Bilingual', 'ESL', 'Non-LEP DL', 'Regular'}

In [12]:
# s2: distinct school names of the second batch
s2 = set(df_two_complete['School'])

In [13]:
# s1: distinct school names of the first batch
s1 = set(df_one_complete['School'])

In [14]:
# school names that occur only in the first batch
s1 - s2

{'Johnson'}

In [17]:
# NOTE(review): this cell was executed out of order (In [17] sits between
# In [14] and In [15]); it shows the same first-batch name set again.
s1

{'Anderson Mill',
 'Berkman',
 'Blackland Prairie',
 'Bluebonnet',
 'Brushy Creek',
 'C. D. Fulkes',
 'Cactus Ranch',
 'Caldwell Heights',
 'Callison',
 'Canyon Creek',
 'Canyon Vista',
 'Caraway',
 'Cedar Ridge',
 'Cedar Valley',
 'Chandler Oaks',
 'Chisholm Trail',
 'DAEP',
 'Deepwood',
 'Deerpark',
 'Double File Trail',
 'Early College',
 'England',
 'Fern Bluff',
 'Forest Creek',
 'Forest North',
 'Gattis',
 'Great Oaks',
 'Grisham',
 'Hernandez',
 'Herrington',
 'Hopewell',
 'JJAEP',
 'Joe Lee Johnson',
 'Johnson',
 'Jollyville',
 'LOTT',
 'Laurel Mountain',
 'Live Oak',
 'McNeil',
 'Old Town',
 'Pearson Ranch',
 'Pond Springs',
 'Purple Sage',
 'Ridgeview',
 'Robertson',
 'Round Rock',
 'Round Rock Opportunity Center',
 'Sommer',
 'Spicewood',
 'Stony Point',
 'Success',
 'Teravista',
 'Union Hill',
 'Voigt',
 'Walsh',
 'Wells Branch',
 'Westwood'}

In [15]:
# school names that occur only in the second batch
s2 - s1

set()

In [16]:
# school names present in both batches
s1.intersection(s2)

{'Anderson Mill',
 'Berkman',
 'Blackland Prairie',
 'Bluebonnet',
 'Brushy Creek',
 'C. D. Fulkes',
 'Cactus Ranch',
 'Caldwell Heights',
 'Callison',
 'Canyon Creek',
 'Canyon Vista',
 'Caraway',
 'Cedar Ridge',
 'Cedar Valley',
 'Chandler Oaks',
 'Chisholm Trail',
 'DAEP',
 'Deepwood',
 'Deerpark',
 'Double File Trail',
 'Early College',
 'England',
 'Fern Bluff',
 'Forest Creek',
 'Forest North',
 'Gattis',
 'Great Oaks',
 'Grisham',
 'Hernandez',
 'Herrington',
 'Hopewell',
 'JJAEP',
 'Joe Lee Johnson',
 'Jollyville',
 'LOTT',
 'Laurel Mountain',
 'Live Oak',
 'McNeil',
 'Old Town',
 'Pearson Ranch',
 'Pond Springs',
 'Purple Sage',
 'Ridgeview',
 'Robertson',
 'Round Rock',
 'Round Rock Opportunity Center',
 'Sommer',
 'Spicewood',
 'Stony Point',
 'Success',
 'Teravista',
 'Union Hill',
 'Voigt',
 'Walsh',
 'Wells Branch',
 'Westwood'}