In [1]:
import pandas as pd
import random
import csv
import numpy as np

In [2]:
PATH = "../Impulso-tek/year_data"

In [3]:
def clean_int(s) -> int:
    """
    Clean integer value
    """
    if isinstance(s, str):
        s = s.strip()
        if s == "":
            return None
        else:
            return int(s)
    else:
        return s


def get_grade(df: pd.DataFrame, grade: str,VET: bool) -> pd.DataFrame:
    """
    Function that filters original data frame accordingly to the grade filter
    defined by the specific conditions
    """
    df_new = pd.DataFrame()

    if grade == "8th":
        df_new = df[
            (
                  ((df["cod_ense"] == 110) & (df["cod_grado"].isin([1, 2, 3, 4, 5, 6, 7, 8])))
                | ((df["cod_ense"] == 160) & (df["cod_grado"].isin([1, 2, 3, 4, 5, 6, 7, 8])))
                | ((df["cod_ense"] == 161) & (df["cod_grado"].isin([2, 3, 4])))
                | ((df["cod_ense"] == 163) & (df["cod_grado"].isin([2, 3, 4])))
                | ((df["cod_ense"] == 165) & (df["cod_grado"].isin([1, 2, 3])))
                | ((df["cod_ense"] == 167) & (df["cod_grado"].isin([2, 3])))
            )
        ].copy()

    if grade == "9th":
        df_new = df[
            (
                (df["cod_ense"].isin(
                        [310, 360, 361, 363, 410, 460, 463, 510, 560, 561, 563, 610,
                         661, 663, 710, 760, 761, 763, 810, 860, 861, 863, 910]))
                & (df["cod_grado"] == 1)
            )
        ].copy()

    if grade == "10th":
        df_new = df[
            (
                (df["cod_ense"].isin([310, 360, 410, 510, 610, 710, 810, 910]))
                & (df["cod_grado"] == 2)
            )
        ].copy()

    if grade == "11th":
        df_new = df[
            (
                (df["cod_ense"].isin(
                        [310, 360, 410, 460, 461,463, 510, 560, 561, 563, 610, 660,
                         661, 663, 710, 760, 761, 763, 810, 860, 861, 863, 910,963]))
                & (df["cod_grado"] == 3)
            )
        ].copy()
        


    if grade == "12th":
        df_new = df[
            (
                (df["cod_ense"].isin(
                        [310, 360, 410, 460, 461,463, 510, 560, 561,563, 610,660, 661,663,
                         710, 760,761, 763, 810, 860,861,863, 910,963]))
                & (df["cod_grado"] == 4)
            )
        ].copy()

    if grade == "11th12th" and VET:
        df_new = df[
            (
                (df["cod_ense"].isin(
                        [410, 460,461,463, 510,560,561, 563, 610, 660, 663, 710,760,761, 763, 810,860,861,863, 910,963]))
                & ((df["cod_grado"] == 3) | (df["cod_grado"] == 4))
            )
        ].copy()

    if grade == "11th12th" and not VET:
        df_new = df[
            (
                (df["cod_ense"].isin(
                        [361,363]))
                & (df["cod_grado"] == 3)
            )
        ].copy()


    return df_new



def read_data_all_years(
    agno: int,
    source: str,
    sep: str = ",",
    cols: str = None,
    grade: str = "8th",
    approved: bool = False,
    VET: bool = False,
) -> pd.DataFrame:
    """
    Reads csv of the form "{PATH}/{source}_{agno}.csv", filters columns, grades and approved 
    if specified. Returns a DataFrame with mrun identifier clean
    """
    # Read data
    df_all = pd.DataFrame()
    
    df = pd.read_csv(f"{PATH}/{source}_{agno}.csv", sep=sep)
    print(f"All data in year {str(agno)}:", df.shape[0])
    cols = cols or df.columns

    # Filters grade
    if grade and not VET:
        df = get_grade(df, grade,False)
        print(f"All data in year {str(agno)} and grade {grade}:", df.shape[0])

    # Filter approved
    if approved:
        df = df[df["sit_fin"] == "P"]
        print(f"All data in year {str(agno)} and grade {grade} that approved:", df.shape[0])
    if grade and VET:
        df = get_grade(df,grade,True)
        print(f"All data in year {agno} and grade {grade} and VET:", df.shape[0])


    # clean mrun column and removing none values
    df["mrun"] = df["mrun"].map(clean_int)
    df = df[df["mrun"].notnull()]
    df["mrun"] = df["mrun"].astype(int)
    print("Valid MRUN rows: ", df.shape[0])
        
    df_all = df_all.append(df[cols])
    print("df_all rows: ",df_all.shape[0])
    agno +=1
    return df_all


def get_attendance(baseline: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates atendance rate 
    Returns a DataFrame with percentage of atttendance rate
    """
    
    return baseline.groupby("agno").mean().reset_index()

def get_attendance_by_rbd(baseline: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates atendance rate 
    Returns a DataFrame with percentage of atttendance rate
    """
    attendance = pd.DataFrame()
 
    aggregator = {'mrun' : 'nunique',
              'asistencia' : 'mean',
              }

    baseline = baseline.groupby(["agno",'rbd'])[['mrun','asistencia']].agg(aggregator).reset_index()
    attendance = baseline.groupby(["agno",'rbd']).mean().reset_index()
  
    return attendance


def access_to_VET(students11th: pd.DataFrame, students12th: pd.DataFrame, students_11th12thVET: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates access to VET prgram at 11th or 12th grade
    Returns a DataFrame with percentage of students that access to VET program at 11th or 12th grade
    """

    #count unique mruns per year and per rbd
    unique_11th = students11th.groupby(['agno','rbd'])['mrun'].nunique().reset_index()
    unique_12th = students12th.groupby(['agno','rbd'])['mrun'].nunique().reset_index()
    unique_11th12thVET = students_11th12thVET.groupby(['agno','rbd'])['mrun'].nunique().reset_index()
    
    unique_11and_12th =  unique_11th.set_index('rbd').add(unique_12th.set_index('rbd'), fill_value=0).reset_index() 
    unique_11th12thVET['VET%'] = unique_11th12thVET['mrun'] / unique_11and_12th['mrun']
    
    return unique_11th12thVET
  

# Half-Cycle

### Read data

In [4]:
t = 2008
path = "../Impulso-tek/year_data"
source = "Rendimiento"

## 1. Access to VET program at 11th grade
% of students who enroll at a VET program at 11th or 12th grade

In [5]:
cols = ["agno", "mrun", "rbd","asistencia"]
#Read data 11th grade
students_11th = read_data_all_years(t, source, cols=cols, grade="11th", approved=True)

All data in year 2008: 3356256
All data in year 2008 and grade 11th: 248982
All data in year 2008 and grade 11th that approved: 215310
Valid MRUN rows:  215295
df_all rows:  215295


In [6]:
cols = ["agno", "mrun", "rbd","asistencia"]
#Read data 12th grade
students_12th = read_data_all_years(t, source, cols=cols, grade="12th", approved=True)

All data in year 2008: 3356256
All data in year 2008 and grade 12th: 223163
All data in year 2008 and grade 12th that approved: 211430
Valid MRUN rows:  211420
df_all rows:  211420


In [7]:
cols = ["agno", "mrun", "rbd","asistencia"]
#Read data 11th and 12th grade and VET program
students_11th12thVET = read_data_all_years(t, source, cols=cols, grade="11th12th", approved=True, VET=True)

All data in year 2008: 3356256
All data in year 2008 and grade 11th12th that approved: 2968963
All data in year 2008 and grade 11th12th and VET: 189561
Valid MRUN rows:  189555
df_all rows:  189555


In [8]:
access_to_VET_percentage = access_to_VET(students_11th,students_12th,students_11th12thVET)
access_to_VET_percentage

Unnamed: 0,agno,rbd,mrun,VET%
0,2008,1,631,0.842457
1,2008,7,582,0.801653
2,2008,32,57,0.438462
3,2008,52,186,0.319588
4,2008,72,13,0.046931
...,...,...,...,...
953,2008,26219,26,0.472727
954,2008,26292,128,2.909091
955,2008,26384,135,1.516854
956,2008,40029,35,0.304348


In [9]:
filename = str(t) + "_Half_Cycle_1_Access_to_VET"
access_to_VET_percentage.to_csv(filename,index=False)

## 2. Attendance rate at 10th grade
attendance rate at 10th grade

In [10]:
t=2016
cols = ["agno", "mrun", "rbd","asistencia"]
students_10th = read_data_all_years(t, source, cols=cols, grade="10th", approved=True)

All data in year 2016: 3226943
All data in year 2016 and grade 10th: 249911
All data in year 2016 and grade 10th that approved: 214960
Valid MRUN rows:  214960
df_all rows:  214960


In [11]:
attendance = get_attendance(students_10th)

In [12]:
attendance[['agno','asistencia']]

Unnamed: 0,agno,asistencia
0,2016,91.614789


In [13]:
attendance_by_rbd = get_attendance_by_rbd(students_10th)

In [14]:
attendance_per_rbd = attendance_by_rbd[['agno','rbd','mrun','asistencia']]
attendance_per_rbd

Unnamed: 0,agno,rbd,mrun,asistencia
0,2016,1,118,88.966102
1,2016,4,269,93.773234
2,2016,5,123,94.837398
3,2016,7,287,89.627178
4,2016,8,77,96.324675
...,...,...,...,...
2906,2016,40403,7,84.571429
2907,2016,40422,53,90.905660
2908,2016,40436,10,91.500000
2909,2016,40455,22,95.818182


In [15]:
filename = str(t)+"_Half_Cycle_2_Attendance_10th.csv"
attendance_per_rbd.to_csv(filename,index=False)