In [41]:
# Import Library to parse
import pandas as pd

In [42]:
# Open the source workbook once; the individual sheets are parsed from this handle.
workbook_path = "Senior Secondary Achievement and Completion Information  2012-2017.xlsx"
xl = pd.ExcelFile(workbook_path)

In [43]:
# List the names of the sheets contained in the workbook so we can see
# which ones are available to parse.
xl.sheet_names

['Read Me', 'Definitions', '2012', '2013', '2014', '2015', '2016', '2017']

In [44]:
# Read the per-year sheets and fill every empty cell with 0.
# Only 2016 and 2017 are loaded, as these years match the other datasets
# used in the analysis best.
df2 = xl.parse(sheet_name="2017").fillna(value=0)
df3 = xl.parse(sheet_name="2016").fillna(value=0)

In [45]:
# Tag each frame with its year (as the first column) before merging, so the
# rows stay distinguishable once both years live in one table.
# DataFrame.insert raises ValueError when the column already exists, so the
# insert is guarded to keep this cell safe to re-run on the same kernel.
for frame, year in ((df2, '2017'), (df3, '2016')):
    if "Year" not in frame.columns:
        frame.insert(loc=0, column="Year", value=year)

In [46]:
# Rename columns for easier use
def _column_aliases(year):
    """Return the {original header: short name} mapping for one year's sheet.

    Most source headers embed the reporting year, so the year is substituted
    into those headers here; the short names are the same for every year.
    The short names are reproduced exactly as they were (spelling and leading
    whitespace included) because they become the column headers of the
    merged table.
    """
    return {
        'APS School/Provider Name': 'School',
        f'Number of VCE studies at unit 3-4 level taken up by students in {year}': 'Number of Unit 3/4 VCE Stubjects Offered',
        f'Number of VET certificates with {year} enrolments': ' Number of VET Subjects Offered',
        'Availability of International Baccalaureate (Diploma)': 'IB Offered',
        f'Number of students enrolled in at least one VCE unit at level 3-4 in {year}': 'Number of VCE Enrollments',
        f'Number of students enrolled in a VET certificate in {year}': 'Number of VET Enrollments',
        f'Number of students enrolled in VCAL in {year}': 'Number of VCAL Enrollments',
        f'Percentage of satisfactory VCE completions in {year}': 'VCE Percentage of Sastisfactory Completion',
        f'Percentage of VET units of competency completed in {year}': 'VET Percentage of Sastisfactory Completion',
        f'Percentage of VCAL units completed in {year}': 'VCAL Percentage of Sastisfactory Completion',
    }


# Apply the year-specific mapping to each frame so both end up with the same
# column names and can be stacked.
df2.rename(columns=_column_aliases(2017), inplace=True)
df3.rename(columns=_column_aliases(2016), inplace=True)

In [47]:
# Stack the 2017 and 2016 frames into a single table, keeping the column
# order as-is. Row labels are carried over from each frame unchanged, so the
# same label can appear once per year in the result.
yearly_frames = [df2, df3]
df1 = pd.concat(yearly_frames, sort=False)

In [48]:
### Clean up data entries
# Placeholder tokens used in the sheets and the number each one becomes.
# The substitutions are applied one after another, in this order, and each
# one is applied to every column of the frame.
placeholder_values = (
    ('*', 1),    # Small School column: marker -> 1 for bool
    ('-', 0),    # Results columns: '-' -> 0
    ('I/D', 0),  # Results columns: schools that have not enrolled into VCE -> 0
    ('A', 1),    # Adult column: marker -> 1 for bool
    ('Y', 1),    # IB column: marker -> 1 for bool
)
for token, number in placeholder_values:
    df1.replace(token, number, inplace=True)

In [49]:
# Drop the adult sector as we are only interested in high schools.
is_adult = df1['Adult School'] == 1

# Furthermore, drop small schools as per warning from the dataset:
# A small school is one with 20 or fewer students enrolled in at least one VCE unit at level 3-4 in 2017.
# Much care is required in interpreting their results as the overall results might be significantly affected
# by those of one or two students.
is_small = df1['Small School'] == 1

# Filter with a boolean mask instead of DataFrame.drop(<labels>): df1 is a
# concatenation that keeps each year's own row labels, and drop() removes
# *every* row carrying a given label. Dropping by label would therefore also
# delete the unrelated row from the other year that happens to share the
# label of a flagged row. The mask removes exactly the flagged rows.
df1 = df1.loc[~(is_adult | is_small)]

In [50]:
# The school code and the two flag columns have served their purpose for
# filtering, so remove them from the table.
unneeded_columns = ['VCAAcd', 'Adult School', 'Small School']
df1 = df1.drop(columns=unneeded_columns)

In [51]:
# Preview the first rows of the cleaned table as a sanity check.
df1.head()

Unnamed: 0,Year,Sector,School,Locality,Number of Unit 3/4 VCE Stubjects Offered,Number of VET Subjects Offered,IB Offered,Number of VCE Enrollments,Number of VET Enrollments,Number of VCAL Enrollments,Percentage of VCE students applying for tertiary places,VCE Percentage of Sastisfactory Completion,Number of students awarded the VCE (Baccalaureate),VET Percentage of Sastisfactory Completion,VCAL Percentage of Sastisfactory Completion,Median VCE study score,Percentage of study scores of 40 and over
0,2017,CATHOLIC,ACADEMY OF MARY IMMACULATE,FITZROY,41,6,0,149,17,0,93.8,96,9,99,0,31,12.4
1,2017,INDEPENDENT,ADASS ISRAEL SCHOOL,ELSTERNWICK,12,10,0,102,164,63,0.0,0,0,43,70,0,0.0
3,2017,INDEPENDENT,AITKEN COLLEGE,GREENVALE,52,23,0,165,135,22,94.8,100,6,91,100,29,6.1
4,2017,INDEPENDENT,AL SIRAAT COLLEGE,EPPING,12,2,0,34,48,0,100.0,100,0,99,0,25,2.2
5,2017,INDEPENDENT,AL-TAQWA COLLEGE,TRUGANINA,27,11,0,177,46,38,92.7,100,2,93,94,27,3.0


In [52]:
# Write the cleaned table to a CSV file in the working directory
# (the row index is written as the first column).
output_csv = "clean_data.csv"
df1.to_csv(output_csv)