In [1]:
# Import Library to parse
import pandas as pd

In [2]:
# Open the Excel workbook once; the individual sheets are parsed from this handle
WORKBOOK_PATH = "Senior Secondary Achievement and Completion Information  2012-2017.xlsx"
xl = pd.ExcelFile(WORKBOOK_PATH)

In [3]:
# List the worksheet names in the workbook to see which sheets can be parsed
xl.sheet_names

['Read Me', 'Definitions', '2012', '2013', '2014', '2015', '2016', '2017']

In [4]:
# Load the 2017 and 2016 sheets and fill every missing cell with 0.
# Only 2016-2017 are analysed, for a better match with the other datasets.
df2, df3 = (xl.parse(sheet).fillna(0) for sheet in ("2017", "2016"))

In [5]:
# Tag each frame with its year (a string, inserted as the first column) so the
# rows stay distinguishable once the two years are merged
for frame, year_label in ((df2, '2017'), (df3, '2016')):
    frame.insert(loc=0, column="Year", value=year_label)

In [6]:
# Rename columns for easier use.
# The sheet headers differ between years only by the year they mention, so one
# template (with a {year} placeholder) is expanded for each frame.
# NOTE(review): the short names keep their existing spellings ('Stubjects',
# 'Sastisfactory') and the leading space in ' Number of VET Subjects Offered'
# exactly as before, because they end up as headers in the exported CSV.
_RENAME_TEMPLATE = {
    'APS School/Provider Name': 'School',
    'Number of VCE studies at unit 3-4 level taken up by students in {year}': 'Number of Unit 3/4 VCE Stubjects Offered',
    'Number of VET certificates with {year} enrolments': ' Number of VET Subjects Offered',
    'Availability of International Baccalaureate (Diploma)': 'IB Offered',
    'Number of students enrolled in at least one VCE unit at level 3-4 in {year}': 'Number of VCE Enrollments',
    'Number of students enrolled in a VET certificate in {year}': 'Number of VET Enrollments',
    'Number of students enrolled in VCAL in {year}': 'Number of VCAL Enrollments',
    'Percentage of satisfactory VCE completions in {year}': 'VCE Percentage of Sastisfactory Completion',
    'Percentage of VET units of competency completed in {year}': 'VET Percentage of Sastisfactory Completion',
    'Percentage of VCAL units completed in {year}': 'VCAL Percentage of Sastisfactory Completion',
}

for frame, sheet_year in ((df2, 2017), (df3, 2016)):
    # Expand the placeholder to get this year's actual sheet headers
    year_mapping = {
        header.format(year=sheet_year): short_name
        for header, short_name in _RENAME_TEMPLATE.items()
    }
    frame.rename(columns=year_mapping, inplace=True)

In [7]:
# Stack the 2017 rows on top of the 2016 rows.
# ignore_index=True gives every row a unique label. Without it each year keeps
# its own 0..n labels, so labels repeat and any label-based operation
# (e.g. DataFrame.drop by index) would act on rows from both years at once.
df1 = pd.concat([df2, df3], sort=False, ignore_index=True)

In [8]:
### Clean up data entries
# Each marker below is swapped for a number wherever it appears as a whole cell
# value. The replacement is applied across the entire frame, in this order.
PLACEHOLDER_VALUES = [
    ('*', 1),    # Small School column: '*' -> 1 for bool
    ('-', 0),    # Results columns: '-' -> 0
    ('I/D', 0),  # Results columns: I/D -> 0 (schools that have not enrolled into VCE)
    ('A', 1),    # Adult School column: 'A' -> 1 for bool
    ('Y', 1),    # IB column: 'Y' -> 1 for bool
    ('< 4', 0),  # Enrollment columns: '< 4' -> 0
]

for marker, number in PLACEHOLDER_VALUES:
    df1.replace(marker, number, inplace=True)

In [9]:
# Rows are filtered with boolean masks instead of DataFrame.drop(<index labels>).
# A mask selects exactly the matching rows whether or not row labels are unique,
# whereas dropping by label removes every row sharing that label - which deletes
# unrelated rows if labels repeat across the two concatenated years.

# Exclude the adult sector as we are only interested in high schools
is_adult = df1['Adult School'] == 1

# Furthermore, exclude small schools as per warning from the dataset:
# A small school is one with 20 or fewer students enrolled in at least one VCE unit at level 3-4 in 2017.
# Much care is required in interpreting their results as the overall results might be significantly affected
# by those of one or two students.
is_small = df1['Small School'] == 1

# Ignore Median VCE study scores that are 0 (schools that did not offer VCE / students failed VCE)
has_median_score = df1['Median VCE study score'] != 0

# .copy() makes df1 an independent frame, so later assignments into it do not
# operate on a view of the unfiltered data
df1 = df1.loc[~is_adult & ~is_small & has_median_score].copy()

In [10]:
# Remove the columns that are not needed for the analysis
df1 = df1.drop(columns=['VCAAcd', 'Adult School', 'Small School'])

In [11]:
# Re-label the selective schools so they can be told apart via the 'Sector' column
selective_schools = [
    'MELBOURNE HIGH SCHOOL',
    'MACROBERTSON GIRLS HIGH SCHOOL',
    'NOSSAL HIGH SCHOOL',
    'SUZANNE CORY HIGH SCHOOL',
    'JOHN MONASH SCIENCE SCHOOL',
]
is_selective = df1['School'].isin(selective_schools)
df1.loc[is_selective, 'Sector'] = 'SELECTIVE'

In [12]:
# Inspect the column dtypes before casting
df1.dtypes

Year                                                        object
Sector                                                      object
School                                                      object
Locality                                                    object
Number of Unit 3/4 VCE Stubjects Offered                    object
 Number of VET Subjects Offered                             object
IB Offered                                                   int64
Number of VCE Enrollments                                   object
Number of VET Enrollments                                   object
Number of VCAL Enrollments                                  object
Percentage of VCE students applying for tertiary places    float64
VCE Percentage of Sastisfactory Completion                  object
Number of students awarded the VCE (Baccalaureate)          object
VET Percentage of Sastisfactory Completion                  object
VCAL Percentage of Sastisfactory Completion                 ob

In [13]:
# Cast columns by position: everything from the 5th column up to (not including)
# the last four becomes int, and the last four columns become float.
# The selection is positional, so it relies on the current column order.
# NOTE(review): astype(int) would truncate fractional values - confirm that the
# percentage columns inside the int range only hold whole numbers.
int_cols = df1.columns[4:-4]
float_cols = df1.columns[-4:]

df1[int_cols] = df1[int_cols].astype(int)
df1[float_cols] = df1[float_cols].astype(float)

In [14]:
# Check the column dtypes again after the casts
df1.dtypes

Year                                                        object
Sector                                                      object
School                                                      object
Locality                                                    object
Number of Unit 3/4 VCE Stubjects Offered                     int32
 Number of VET Subjects Offered                              int32
IB Offered                                                   int32
Number of VCE Enrollments                                    int32
Number of VET Enrollments                                    int32
Number of VCAL Enrollments                                   int32
Percentage of VCE students applying for tertiary places      int32
VCE Percentage of Sastisfactory Completion                   int32
Number of students awarded the VCE (Baccalaureate)           int32
VET Percentage of Sastisfactory Completion                 float64
VCAL Percentage of Sastisfactory Completion                flo

In [15]:
# Preview the first rows of the cleaned frame
df1.head()

Unnamed: 0,Year,Sector,School,Locality,Number of Unit 3/4 VCE Stubjects Offered,Number of VET Subjects Offered,IB Offered,Number of VCE Enrollments,Number of VET Enrollments,Number of VCAL Enrollments,Percentage of VCE students applying for tertiary places,VCE Percentage of Sastisfactory Completion,Number of students awarded the VCE (Baccalaureate),VET Percentage of Sastisfactory Completion,VCAL Percentage of Sastisfactory Completion,Median VCE study score,Percentage of study scores of 40 and over
0,2017,CATHOLIC,ACADEMY OF MARY IMMACULATE,FITZROY,41,6,0,149,17,0,93,96,9,99.0,0.0,31.0,12.4
3,2017,INDEPENDENT,AITKEN COLLEGE,GREENVALE,52,23,0,165,135,22,94,100,6,91.0,100.0,29.0,6.1
4,2017,INDEPENDENT,AL SIRAAT COLLEGE,EPPING,12,2,0,34,48,0,100,100,0,99.0,0.0,25.0,2.2
5,2017,INDEPENDENT,AL-TAQWA COLLEGE,TRUGANINA,27,11,0,177,46,38,92,100,2,93.0,94.0,27.0,3.0
6,2017,GOVERNMENT,ALBERT PARK COLLEGE,ALBERT PARK,50,19,1,248,39,0,91,99,10,84.0,0.0,32.0,12.5


In [16]:
# Write the cleaned frame to CSV; the row index is written as the first column
CLEAN_CSV_PATH = "clean_VCAA_data.csv"
df1.to_csv(CLEAN_CSV_PATH)