In [1]:
import pandas as pd
import os

#pd.options.display.max_rows = 20000
#pd.options.display.max_columns = 10000

In [2]:
# change campus names
def change_school_name(df):
    """Clean the 'School' column of ``df`` in place.

    Every entry of the replacement table below is applied to each school
    name as a case-sensitive substring replacement, in table order, and the
    name is stripped of surrounding whitespace after every replacement.
    Program/campus-type labels map to '' (i.e. they are removed), a few
    abbreviations are expanded and a few campuses are renamed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'School' column whose values are strings.

    Returns
    -------
    None
        The frame is modified in place.
    """
    # old substring -> new substring (order matters: applied top to bottom)
    replacements = {
        # program labels
        'Bilingual': '',
        'Non-LEP DL': '',
        'NON-LEP DL': '',
        'LEP DL BIL': '',
        'ESL': '',
        'ESOL': '',
        'Regular': '',
        'Self Contained': '',

        # abbreviations
        'Mt.': 'Mountain',
        'Tr.': 'Trail',

        # campus-type suffixes
        'MIS': '',
        'ES': '',
        'MS': '',
        'HS': '',
        'High School': '',

        # prefixes and special campuses
        'Elem: ': '',
        'EC': 'Early College',
        'Wm. Lott': 'LOTT',
        'RROC': 'Round Rock Opportunity Center',
    }

    def _clean(name):
        # run the whole replacement table over a single school name
        for old, new in replacements.items():
            name = name.replace(old, new).strip()
        return name

    df['School'] = df['School'].map(_clean)

    return None

In [3]:
# Maps a cleaned school name to its campus type ('MS', 'HS' or 'Other').
# Built from one tuple of names per type; insertion order is MS, HS, Other.
school_type = {
    campus: campus_type
    for campus_type, campuses in (
        # list of middle schools
        ('MS', (
            'C.D. Fulkes',
            'Canyon Vista',
            'Cedar Valley',
            'Chisholm Trail',
            'Deerpark',
            'Grisham',
            'Hernandez',
            'Hopewell',
            'Pearson Ranch',
            'Ridgeview',
            'Walsh',
        )),
        # list of high schools
        ('HS', (
            'Cedar Ridge',
            'McNeil',
            'Round Rock',
            'Stony Point',
            'Westwood',
            'Success',
            'Early College',
        )),
        # other schools
        ('Other', (
            'Round Rock Opportunity Center',
            'DAEP',
            'JJAEP',
            'LOTT',
        )),
    )
    for campus in campuses
}

In [6]:
# file directories
# excel files from 2016-08-23 to 2018-05-29
path_one = os.getcwd() + '/data/excel/one/'

# grade columns read from every sheet, in output order
grades = ['ECE', 'PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

# column order of the combined frame
output_columns = ['Date', 'School', 'Grade', 'Enrolled', 'Group', 'Type']

# (keyword, group) pairs tested in this order against the lower-cased raw
# campus name; the first keyword found decides the group.
# NOTE(review): change_school_name also strips 'LEP DL BIL', but no group is
# assigned for it here, so such rows end up as 'Regular' -- confirm intended.
group_keywords = [
    ('bilingual', 'Bilingual'),
    ('non-lep dl', 'Non-LEP DL'),
    ('esol', 'ESOL'),
    ('esl', 'ESL'),
    ('regular', 'Regular'),
    ('self contained', 'Self Contained'),
]


def _classify_group(school):
    """Return the program group named in a raw campus name ('Regular' if none)."""
    name = str(school).lower()
    for keyword, group in group_keywords:
        if keyword in name:
            return group
    return 'Regular'


def _enrollment_by_grade(sheet, campus_names, school):
    """Return one enrollment value per entry of ``grades`` for ``school``.

    Values come from the first row of ``sheet`` whose stripped campus name
    equals ``school``. A grade column that is absent from the sheet, or a
    school with no matching row, yields 0. A blank cell is returned as NaN
    (it does not raise) and is removed later by ``dropna``.
    """
    matches = sheet[campus_names == school]
    counts = []
    for grade in grades:
        try:
            counts.append(matches[grade].values[0])
        except (KeyError, IndexError):
            counts.append(0)
    return counts


# file names
filenames_one = sorted(os.listdir(path_one))

# list of dataframes
# each dataframe is one excel file
df_excel = []

for filename in filenames_one:

    # xlsx and xls file only
    # (str.endswith needs a tuple: `'xlsx' or 'xls'` is just 'xlsx')
    if not filename.endswith(('.xlsx', '.xls')):
        continue
    print(filename)

    # get the first sheet in an excel file
    temp = pd.read_excel(os.path.join(path_one, filename), header=2)

    # change column names to strings since grades from 1-12 are int
    temp.columns = [str(x) for x in temp.columns]

    # school names; the same stripped values are used for the lookup below so
    # that campuses with stray whitespace are still matched
    campus_names = temp['CAMPUS'].str.strip()

    # date is the first 10 characters of the file name; one file is named
    # '2016-08.30 ...', so normalise '.' to '-'
    date = filename[:10].replace('.', '-')

    # one row per (school, grade), ordered by school name then by grade
    records = [
        (school, date, grade, enrolled)
        for school in campus_names.sort_values()
        for grade, enrolled in zip(grades, _enrollment_by_grade(temp, campus_names, school))
    ]
    df_one = pd.DataFrame(records, columns=['School', 'Date', 'Grade', 'Enrolled'])

    # keep Enrolled as float64; cells that are not numbers become NaN and are
    # dropped together with the blank cells below
    df_one['Enrolled'] = pd.to_numeric(df_one['Enrolled'], errors='coerce').astype('float64')

    # program group, 'Regular' by default
    df_one['Group'] = df_one['School'].map(_classify_group)

    # drop rows without a school name or without an enrollment value
    df_one = df_one.dropna()

    # remove rows that aren't school (Total, )
    is_summary = df_one['School'].str.contains('tota|campus', case=False, regex=True, na=False)
    # copy so the column assignments below act on an independent frame
    df_one = df_one[~is_summary].copy()

    # change school names
    df_one['School'] = df_one['School'].str.strip().str.replace('/', ' ', regex=False)
    change_school_name(df_one)

    # reset index for future use
    df_one = df_one.reset_index(drop=True)

    # set all schools to 'ES' (default)
    # then change types accordingly
    df_one['Type'] = df_one['School'].map(school_type).fillna('ES')

    # append each temp dataframes(sheets) to a list
    df_excel.append(df_one)

# combine all files in one step (DataFrame.append no longer exists in pandas 2.x)
if df_excel:
    df_one_complete = pd.concat(df_excel, sort=False, ignore_index=True).reindex(columns=output_columns)
else:
    df_one_complete = pd.DataFrame(columns=output_columns)

2016-08-23 enrollment.xls.xlsx
2016-08.30 enrollment.xls.xlsx
2016-09-06 enrollment.xls.xlsx
2016-09-13 enrollment.xls.xlsx
2016-09-20 enrollment.xls.xlsx
2016-09-27 enrollment.xls.xlsx
2016-10-04 enrollment.xls.xlsx
2016-10-11 enrollment.xls.xlsx
2016-10-18 enrollment.xls.xlsx
2016-10-25 enrollment.xls.xlsx
2016-11-01 enrollment.xls.xlsx
2016-11-08 enrollment.xls.xlsx
2016-11-15 enrollment.xls.xlsx
2016-11-29 enrollment.xls.xlsx
2016-12-06 enrollment.xls.xlsx
2016-12-13 enrollment.xls.xlsx
2017-01-10 enrollment.xls.xlsx
2017-01-17 enrollment.xls.xlsx
2017-01-24 enrollment.xls.xlsx
2017-01-31 enrollment.xls.xlsx
2017-02-07 enrollment.xls.xlsx
2017-02-14 enrollment.xls.xlsx
2017-02-21 enrollment.xls.xlsx
2017-02-28 enrollment.xls.xlsx
2017-03-07 enrollment.xls.xlsx
2017-03-21 enrollment.xls.xlsx
2017-03-28 enrollment.xls.xlsx
2017-04-04 enrollment.xls.xlsx
2017-04-11 enrollment.xls.xlsx
2017-04-18 enrollment.xls.xlsx
2017-04-25 enrollment.xls.xlsx
2017-05-02 enrollment.xls.xlsx
2017-05-

In [7]:
# show the distinct school names left after cleaning
{*df_one_complete['School']}

{'Anderson Mill',
 'Berkman',
 'Blackland Prairie',
 'Bluebonnet',
 'Brushy Creek',
 'C.D. Fulkes',
 'Cactus Ranch',
 'Caldwell Heights',
 'Callison',
 'Canyon Creek',
 'Canyon Vista',
 'Caraway',
 'Cedar Ridge',
 'Cedar Valley',
 'Chandler Oaks',
 'Chisholm Trail',
 'DAEP',
 'Deepwood',
 'Deerpark',
 'Double File Trail',
 'Early College',
 'England',
 'Fern Bluff',
 'Forest Creek',
 'Forest North',
 'Gattis',
 'Great Oaks',
 'Grisham',
 'Hernandez',
 'Herrington',
 'Hopewell',
 'JJAEP',
 'Joe Lee Johnson',
 'Johnson',
 'Jollyville',
 'LOTT',
 'Laurel Mountain',
 'Liveoak',
 'McNeil',
 'Old Town',
 'Pearson Ranch',
 'Pond Springs',
 'Purple Sage',
 'Ridgeview',
 'Robertson',
 'Round Rock',
 'Round Rock Opportunity Center',
 'Sommer',
 'Spicewood',
 'Stony Point',
 'Success',
 'Teravista',
 'Union Hill',
 'Voigt',
 'Walsh',
 'Wells Branch',
 'Westwood'}

In [8]:
# write the combined frame (including its index) to part1.csv in the working directory
df_one_complete.to_csv(path_or_buf='./part1.csv')

In [9]:
# print row count, per-column non-null counts and dtypes of the combined frame
df_one_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81804 entries, 0 to 81803
Data columns (total 6 columns):
Date        81804 non-null object
School      81804 non-null object
Grade       81804 non-null object
Enrolled    81804 non-null float64
Group       81804 non-null object
Type        81804 non-null object
dtypes: float64(1), object(5)
memory usage: 3.7+ MB
