In [1]:
import pandas as pd
import random
import numpy as np

In [2]:
PATH = "../Impulso-tek/year_data"

In [18]:
def clean_int(s) -> int:
    """
    Clean integer value
    """
    if isinstance(s, str):
        s = s.strip()
        if s == "":
            return None
        else:
            return int(s)
    else:
        return s


def get_grade(df: pd.DataFrame, grade: str) -> pd.DataFrame:
    """
    Function that filters original data frame accordingly to the grade filter
    defined by the specific conditions
    """
    df_new = pd.DataFrame()

    if grade == "8th":
        df_new = df[
            (
                  ((df["cod_ense"] == 110) & (df["cod_grado"].isin([1, 2, 3, 4, 5, 6, 7, 8])))
                | ((df["cod_ense"] == 160) & (df["cod_grado"].isin([1, 2, 3, 4, 5, 6, 7, 8])))
                | ((df["cod_ense"] == 161) & (df["cod_grado"].isin([2, 3, 4])))
                | ((df["cod_ense"] == 163) & (df["cod_grado"].isin([2, 3, 4])))
                | ((df["cod_ense"] == 165) & (df["cod_grado"].isin([1, 2, 3])))
                | ((df["cod_ense"] == 167) & (df["cod_grado"].isin([2, 3])))
            )
        ].copy()

    if grade == "9th":
        df_new = df[
            (
                (df["cod_ense"].isin(
                        [310, 360, 361, 363, 410, 460, 463, 510, 560, 561, 563, 610,
                         661, 663, 710, 760, 761, 763, 810, 860, 861, 863, 910]))
                & (df["cod_grado"] == 1)
            )
        ].copy()

    if grade == "10th":
        df_new = df[
            (
                (df["cod_ense"].isin([310, 360, 410, 510, 610, 710, 810, 910]))
                & (df["cod_grado"] == 2)
            )
        ].copy()

    if grade == "11th":
        df_new = df[
            (
                (df["cod_ense"].isin(
                        [310, 360, 361, 363, 410, 460, 463, 510, 560, 561, 563, 610,
                         661, 663, 710, 760, 761, 763, 810, 860, 861, 863, 910]))
                & (df["cod_grado"] == 3)
            )
        ].copy()

    if grade == "12th":
        df_new = df[
            (
                (df["cod_ense"].isin(
                        [310, 360, 410, 463, 510, 563, 610, 663, 710, 763, 810, 910]))
                & (df["cod_grado"] == 4)
            )
        ].copy()
    return df_new


def read_data(
    agno: str,
    source: str,
    sep: str = ",",
    cols: str = None,
    grade: str = "8th",
    approved: bool = False,
) -> pd.DataFrame:
    """
    Reads csv of the form "{PATH}/{source}_{agno}.csv", filters columns, grades and approved 
    if specified. Returns a DataFrame with mrun identifier clean
    """
    # Read data
    df = pd.read_csv(f"{PATH}/{source}_{agno}.csv", sep=sep)
    print(f"All data in year {agno}:", df.shape[0])
    cols = cols or df.columns

    # Filters grade
    if grade:
        df = get_grade(df, grade)
        print(f"All data in year {agno} and grade {grade}:", df.shape[0])

    # Filter approved
    if approved:
        df = df[df["sit_fin"] == "P"]
        print(f"All data in year {agno} and grade {grade} that approved:", df.shape[0])

    # clean mrun column and removing none values
    df["mrun"] = df["mrun"].map(clean_int)
    df = df[df["mrun"].notnull()]
    df["mrun"] = df["mrun"].astype(int)
    print("Valid MRUN rows: ", df.shape[0])

    return df[cols]


def get_HE_data(year: int, cols: list, filter_values: dict = dict()) -> pd.DataFrame:
    """
    Reads High Education tables and filters based on filter_value dict
    Note: filter_values only supports one length dictionary
    """
    # Append filter column if not included
    if (len(cols) > 0) & (len(filter_values) == 1):
        cols = cols.append(list(filter_values.keys())[0])

    # Reads Matricula Ed Superior File
    he = read_data(
        year, "Matricula_Ed_Superior", sep=";", cols=cols, grade=None, approved=None
    )
    # Appends year and enrolled dummy variable
    he["he_agno"] = year
    he["he_enrolled"] = 1

    # Removes duplicates
    he = he.drop_duplicates(subset=["mrun"])
    # Filters data if filter_values (length 1) is specified
    if len(filter_values) == 1:
        he = he[he[list(filter_values.keys())[0]] == list(filter_values.values())[0]]
        print(
            f"Total data for year {year} and {list(filter_values.keys())[0]} {list(filter_values.values())[0]}:",
            he.shape[0],
        )
    return he


def get_statistics(baseline: pd.DataFrame, he: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates the % and total number of students on baseline that appear on he.
    Returns a DataFrame by school with counts, enrolled counts and enrolled percentages.
    """
    join = baseline.merge(he, on="mrun", how="left")
    values = {"he_enrolled": 0}
    join.fillna(value=values, inplace=True)
    print("Baseline",baseline.head())
    print("he", he.head())
    print(join)
    res = join.pivot_table(
        index="rbd", aggfunc={"he_enrolled": [np.sum, np.mean, len]}
    ).reset_index()
    res.columns = ["rbd", "8th", "he_enrolled_%", "he_enrolled"]
    print(res["he_enrolled_%"].describe())
    return res

## Read baseline

In [19]:
t = 2011
path = "../Impulso-tek/year_data"
source = "Rendimiento"

In [20]:
cols = ["agno", "mrun", "rbd", "nom_reg_rbd_a"]
baseline = read_data(t, source, cols=cols, grade="8th", approved=True)

All data in year 2011: 3326746
All data in year 2011 and grade 8th: 2137053
All data in year 2011 and grade 8th that approved: 1904289
Valid MRUN rows:  1904289


In [21]:
baseline.shape

(1904289, 4)

In [22]:
baseline.head()

Unnamed: 0,agno,mrun,rbd,nom_reg_rbd_a
1819,2011,155212,3,
1820,2011,174207,3,
1821,2011,699709,3,
1827,2011,1302116,3,
1829,2011,1472368,3,


In [23]:
baseline['rbd'].nunique()

9012

# Ex-Post

## 1. First year enrollment rate
Number of students enrolled in a  Higher Ed. Institution (HEI) right after graduated (t+5) / number of students from the 8th graduated cohort (t)
% students enrolled in Higher Education (ME) right after graduation (t+5) 


In [24]:
# Read Higher Ed. Insititution (HEI) right after graduated (t+5)
cols = ["mrun", "codigo_unico"]
he = get_HE_data(t+5, cols)
print(he.head())

  exec(code_obj, self.user_global_ns, self.user_ns)


All data in year 2016: 1247178
Valid MRUN rows:  1245695
      mrun     codigo_unico  he_agno  he_enrolled
1483     5  I260S47C118J1V1     2016            1
1484    37   I498S6C132J2V1     2016            1
1485    42     I26S1C32J1V1     2016            1
1486    80     I45S2C33J1V1     2016            1
1487   107  I111S12C104J1V1     2016            1


In [25]:
exp1 = get_statistics(baseline, he)


Baseline       agno     mrun  rbd  nom_reg_rbd_a
1819  2011   155212    3            NaN
1820  2011   174207    3            NaN
1821  2011   699709    3            NaN
1827  2011  1302116    3            NaN
1829  2011  1472368    3            NaN
he       mrun     codigo_unico  he_agno  he_enrolled
1483     5  I260S47C118J1V1     2016            1
1484    37   I498S6C132J2V1     2016            1
1485    42     I26S1C32J1V1     2016            1
1486    80     I45S2C33J1V1     2016            1
1487   107  I111S12C104J1V1     2016            1
         agno      mrun    rbd  nom_reg_rbd_a   codigo_unico  he_agno  \
0        2011    155212      3            NaN            NaN      NaN   
1        2011    174207      3            NaN            NaN      NaN   
2        2011    699709      3            NaN            NaN      NaN   
3        2011   1302116      3            NaN            NaN      NaN   
4        2011   1472368      3            NaN            NaN      NaN   
...       

## 2. First and second year enrollment rate

Number of students enrolled in a  HEI at first or second year after graduation (combined) / number of students from the 8th graduated cohort (t)

In [18]:
# Read Higher Ed. Insititution (HEI) right after graduated (t+5)
cols = ["mrun", "codigo_unico"]
h5 = get_HE_data(t+5, cols)
h6 = get_HE_data(t+6, cols)
# Conact t+5 and t+6
he = pd.concat([h5,h6], axis=0).drop_duplicates(subset=["mrun"])

All data in year 2016: 1247178
Valid MRUN rows:  1245695


  exec(code_obj, self.user_global_ns, self.user_ns)


All data in year 2017: 1247746
Valid MRUN rows:  1245985


In [19]:
exp2 = get_statistics(baseline, he)

count    9012.000000
mean        0.088253
std         0.096588
min         0.000000
25%         0.000000
50%         0.081967
75%         0.134146
max         1.000000
Name: he_enrolled_%, dtype: float64


## 3. First, second and third year enrollment rate

Number of students enrolled in a  HEI at first, second or third year after graduation (combined) / number of students from the 8th graduated cohort (t)

In [20]:
cols = ["mrun", "codigo_unico"]
h5 = get_HE_data(t+5, cols)
h6 = get_HE_data(t+6, cols)
h7 = get_HE_data(t+7, cols)
# Conact t+5 and t+6
he = pd.concat([h5,h6, h7], axis=0).drop_duplicates(subset=["mrun"])

All data in year 2016: 1247178
Valid MRUN rows:  1245695
All data in year 2017: 1247746
Valid MRUN rows:  1245985
All data in year 2018: 1262771
Valid MRUN rows:  1261750


In [21]:
exp3 = get_statistics(baseline, he)

count    9012.000000
mean        0.154854
std         0.128653
min         0.000000
25%         0.058824
50%         0.152637
75%         0.225405
max         1.000000
Name: he_enrolled_%, dtype: float64


## 4. Enrollment in TNS program rate

Number of students enrolled in a  TNS (tecnico nivel superior) program (4-5 semesters)  / number of students from the 8th graduated cohort (t)


In [22]:
filter_values = {"nivel_carrera_1": "Técnico de Nivel Superior"}
cols = ["mrun", "codigo_unico", "nivel_carrera_1"]
he = get_HE_data(t+5, cols, filter_values=filter_values)

All data in year 2016: 1247178
Valid MRUN rows:  1245695
Total data for year 2016 and nivel_carrera_1 Técnico de Nivel Superior: 358375


In [23]:
exp4 = get_statistics(baseline, he)

count    9012.000000
mean        0.013533
std         0.021794
min         0.000000
25%         0.000000
50%         0.007866
75%         0.020137
max         0.428571
Name: he_enrolled_%, dtype: float64


## 5. Enrollment in a PSL program  rate
Number of students enrolled in a  PSL (professional sin licenciatura) program (8-9 semesters)  / number of students from the 8th graduated cohort (t)

In [24]:
filter_values = {"nivel_carrera_1": "Profesional Sin Licenciatura"}
cols = ["mrun", "codigo_unico", "nivel_carrera_1"]
he = get_HE_data(t+5, cols, filter_values=filter_values)

All data in year 2016: 1247178
Valid MRUN rows:  1245695
Total data for year 2016 and nivel_carrera_1 Profesional Sin Licenciatura: 221658


In [25]:
exp5 = get_statistics(baseline, he)

count    9012.000000
mean        0.003888
std         0.007757
min         0.000000
25%         0.000000
50%         0.000000
75%         0.005917
max         0.142857
Name: he_enrolled_%, dtype: float64


## 6.  Enrollment in a PCL (professional con licenciatura) program  rate
Number of students enrolled in a  PCL program (10-12  semesters)  / number of students from the 8th graduated cohort (t)

In [26]:
filter_values = {"nivel_carrera_1": "Profesional Con Licenciatura"}
cols = ["mrun", "codigo_unico", "nivel_carrera_1"]
he = get_HE_data(t+5, cols, filter_values=filter_values)

All data in year 2016: 1247178
Valid MRUN rows:  1245695
Total data for year 2016 and nivel_carrera_1 Profesional Con Licenciatura: 572163


In [27]:
exp6 = get_statistics(baseline, he)

count    9012.000000
mean        0.017082
std         0.031075
min         0.000000
25%         0.000000
50%         0.006289
75%         0.023479
max         1.000000
Name: he_enrolled_%, dtype: float64


## 7. Enrollment in a program during day rate
Number of students enrolled in a  day (diurno) program/ number of students from the 8th graduated cohort (t)

In [28]:
filter_values =  {"jornada": "Diurno"}
cols = ["mrun", "codigo_unico", "jornada"]
he = get_HE_data(t+5, cols, filter_values)

All data in year 2016: 1247178
Valid MRUN rows:  1245695
Total data for year 2016 and jornada Diurno: 853541


In [29]:
exp7 = get_statistics(baseline, he)

count    9012.000000
mean        0.031926
std         0.043173
min         0.000000
25%         0.000000
50%         0.023810
75%         0.048387
max         1.000000
Name: he_enrolled_%, dtype: float64


## 8. Enrollment in a evening program rate
Number of students enrolled in a  evening (vespertino) program / number of students from the 8th graduated cohort (t)

In [30]:
filter_values =  {"jornada": "Vespertino"}
cols = ["mrun", "codigo_unico", "jornada"]
he = get_HE_data(t+5, cols, filter_values)

All data in year 2016: 1247178
Valid MRUN rows:  1245695
Total data for year 2016 and jornada Vespertino: 326185


In [31]:
exp8 = get_statistics(baseline, he)

count    9012.000000
mean        0.003810
std         0.009582
min         0.000000
25%         0.000000
50%         0.000000
75%         0.004819
max         0.250000
Name: he_enrolled_%, dtype: float64


## 9. Enrollment in accredited programs rate
Number of students enrolled in programs which has been accredited/number of students from the 8th graduated cohort (t)

In [32]:
filter_values =  {"acreditada_carr": "ACREDITADA"}
cols = ["mrun", "codigo_unico", "acreditada_carr"]
he = get_HE_data(t+5, cols, filter_values=filter_values)

All data in year 2016: 1247178
Valid MRUN rows:  1245695
Total data for year 2016 and acreditada_carr ACREDITADA: 698372


In [33]:
exp9 = get_statistics(baseline, he)

count    9012.000000
mean        0.021532
std         0.031412
min         0.000000
25%         0.000000
50%         0.015135
75%         0.033333
max         1.000000
Name: he_enrolled_%, dtype: float64


In [34]:
exp1.head()

Unnamed: 0,rbd,8th,he_enrolled_%,he_enrolled
0,3,485.0,0.0,0.0
1,5,273.0,0.054945,15.0
2,8,302.0,0.043046,13.0
3,9,1336.0,0.085329,114.0
4,10,315.0,0.104762,33.0


In [35]:
exp2.head()

Unnamed: 0,rbd,8th,he_enrolled_%,he_enrolled
0,3,485.0,0.0,0.0
1,5,273.0,0.29304,80.0
2,8,302.0,0.145695,44.0
3,9,1336.0,0.184132,246.0
4,10,315.0,0.196825,62.0
