# In [7]:
# create a function 'contain' to do partial string search (case insensitive) which is used in filtering data fields
def contain(string, target):
    """Tell whether ``string`` occurs anywhere inside ``target``, ignoring case.

    Both arguments are lower-cased before the comparison, so the match is a
    case-insensitive partial (substring) match.
    """
    needle = string.lower()
    haystack = target.lower()
    # str.find returns -1 only when the needle is absent
    return haystack.find(needle) >= 0

# create the function to extract campus performance
def _rows_with_label(df, text):
    """Return the rows of ``df`` whose 'Category' label contains ``text`` (case-insensitive)."""
    # regex=False keeps the search a literal substring match; na=False treats a
    # missing label as "no match" instead of breaking the boolean mask
    mask = df['Category'].str.contains(text, case=False, regex=False, na=False)
    return df[mask]


def extract_performance(yr):
    """Build a per-campus performance table for one school year.

    Reads ``CAMPSTAAR2_year<yr>.dat`` and the three header mapping files
    ``header_mapping_{1,2,3}_<yr>.csv`` from the current working directory.
    The data file must have a ``CAMPUS`` column; each mapping file must have
    ``NAME`` (raw column header) and ``LABEL`` (descriptive header) columns.

    Parameters
    ----------
    yr : value convertible with ``str()``
        The school year; used both in the file names and to filter the
        descriptive column labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``campus_id``, ``student_count``, ``pass_count``, ``pct_pass``
        and ``year`` (the year as a string). A campus is returned only when it
        has an unmasked pass rate and at least one of the two counts; a count
        that is missing for such a campus is NaN.

    Notes
    -----
    Values are passed through as parsed by ``read_csv`` and are not converted
    to a numeric dtype here.
    NOTE(review): a raw column containing '.' is presumably read as text, so
    callers may need ``pd.to_numeric`` -- confirm against the real data.
    """
    # import dependencies
    import pandas as pd
    import numpy as np

    # convert yr to string
    year = str(yr)

    # read raw data file as dataframe
    df_campus = pd.read_csv('CAMPSTAAR2_year%s.dat' % year)

    # do initial data cleansing by replacing '.' or -1 (masked data) with NaN.
    # A column holding only numbers is parsed as integers, so the text '-1'
    # never matches there; the numeric -1 has to be replaced as well.
    for masked in ('-1', '.', -1):
        df_campus = df_campus.replace(masked, np.nan)

    # read the 3 individual header mapping tables and combine them into one mapping table
    header_parts = [
        pd.read_csv('header_mapping_%d_%s.csv' % (part, year)) for part in (1, 2, 3)
    ]
    df_header = pd.concat(header_parts, ignore_index=True)

    # map raw column headers to descriptive labels (a later duplicate NAME wins)
    header_dict = dict(zip(df_header['NAME'], df_header['LABEL']))
    df_campus = df_campus.rename(columns=header_dict)

    # use melt to transform the dataset from a wide table to a long table so
    # that the fields can be filtered by their label
    df_campus = df_campus.melt(id_vars='CAMPUS', var_name='Category', value_name='Value')

    # select data for the year requested
    df_campus = _rows_with_label(df_campus, year)

    def measure(label_text, column):
        # rows for one measure with masked (NaN) entries removed, reduced to
        # the campus id and the value under its final column name
        rows = _rows_with_label(df_campus, label_text).dropna()
        return rows[['CAMPUS', 'Value']].rename(columns={'Value': column})

    # total student count, count of students passing, and pass rate by campus
    df_student_count = measure('All Students All Tests Performance Denominator', 'student_count')
    df_pass_count = measure('All Students All Tests Numerator', 'pass_count')
    df_pass_rate = measure('All Students All Tests Rate', 'pct_pass')

    # merge dataframes to include all data elements: the two counts are joined
    # with an outer merge, then only campuses that also have a rate are kept
    df_performance = df_student_count.merge(df_pass_count, on='CAMPUS', how='outer')
    df_performance = df_performance.merge(df_pass_rate, on='CAMPUS', how='inner')

    # rename the key column and add a column for the school year requested
    df_performance = df_performance.rename(columns={'CAMPUS': 'campus_id'})
    df_performance['year'] = year

    return df_performance