In [1]:
import pandas as pd
import numpy as np
from src import util
from src.process2004_2017 import get_processed_enade_2014_2017
from src.process2011 import get_processed_enade_2011
from src.process2008 import get_processed_enade_2008
from src.process2005 import get_processed_enade_2005
from src import config
from src import subject_manipulation
from typing import Tuple

In [2]:
# Subject table shared by every cell below; get_display_df filters it by its 'ano' column.
subject_df = subject_manipulation.get_processed_subject_df()

In [None]:
def get_display_df(year: int, input_df: pd.DataFrame,
                   subject_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
    """Build the per-subject summary of objective questions for one exam year.

    Parameters
    ----------
    year
        Exam year; only rows of ``subject_df`` whose ``ano`` equals it are used.
    input_df
        Student-level frame with a ``TP_PRES`` column coded 222 / 555.
        The caller's frame is left untouched, so the function can be called
        repeatedly on the same input.
    subject_df
        Subject table with an ``ano`` column.

    Returns
    -------
    display_df
        One row per subject, sorted by ``"Nota %"`` and rounded to 2 decimals.
        Subjects without objective questions get 0 questions and NaN statistics.
    present_df
        The rows labelled ``"Presente"``, as returned by
        ``util.add_all_score_subjects``.
    presence_df
        ``value_counts`` of the ``"Ausente"`` / ``"Presente"`` labels
        (codes other than 222 / 555 map to NaN and are not counted).
    """
    temp_subject_df = subject_df.loc[subject_df['ano'] == year].copy()

    # `assign` returns a new frame: the caller's TP_PRES codes stay numeric, so
    # a second call does not map already-relabelled values to NaN.
    presence_labels = input_df["TP_PRES"].map({222: "Ausente", 555: "Presente"}).to_numpy()
    labelled_df = input_df.assign(TP_PRES=presence_labels)
    presence_df = labelled_df["TP_PRES"].value_counts()

    # Explicit copy so the helper can add columns without touching a slice.
    present_df = labelled_df.loc[labelled_df["TP_PRES"] == "Presente"].copy()
    present_df = util.add_all_score_subjects(present_df, temp_subject_df, objective=True)

    num_alunos = present_df.shape[0]
    subjects = util.get_subjects(temp_subject_df)

    def _percentage(mask: pd.Series) -> float:
        """Percentage of present students for which `mask` is True (0 if none)."""
        hits = mask.sum()
        return hits / num_alunos * 100 if hits else 0.0

    rows = []
    for subject in subjects:
        subject_questions = util.get_subject_valid_questions(subject,
                                                             temp_subject_df,
                                                             present_df,
                                                             just_objective=True)
        num_obj_subject_questions = len(subject_questions)

        if num_obj_subject_questions == 0:
            # np.nan (not None) keeps every statistic column numeric even when
            # no subject of the year has objective questions.
            rows.append((0, np.nan, np.nan, np.nan, np.nan, np.nan))
            continue

        acertos = present_df[f"ACERTOS_OBJ_{subject}"]
        rows.append((num_obj_subject_questions,
                     acertos.mean(),
                     acertos.std(),
                     _percentage(acertos == 0),
                     _percentage(acertos >= 1),
                     _percentage(acertos == num_obj_subject_questions)))

    stat_columns = ["Nº Questões",
                    "Média Acertos",
                    "Desvio Padrão Acertos",
                    "% de Zeros",
                    "% de Alunos que acertaram pelo menos uma questão",
                    "% de Alunos que acertaram todas"]
    display_df = pd.DataFrame(rows, index=subjects, columns=stat_columns)
    display_df["Nº Questões"] = display_df["Nº Questões"].astype(int)

    # Mean of the SCORE_OBJ_<subject> columns goes first, as the sort key.
    subjects_labels = [f"SCORE_OBJ_{x}" for x in subjects]
    display_df.insert(0, "Nota %", present_df[subjects_labels].mean().values)

    display_df = display_df.sort_values(by=["Nota %"]).round(2)

    return display_df, present_df, presence_df
    
    

In [None]:
# Exam editions analysed, newest first; every list built in this cell follows this order.
years = [2017, 2014, 2011, 2008, 2005]

# The first three editions are read from .txt files under 3.DADOS,
# the last two from .csv files under 2.DADOS.
first_paths = [f"data/enade/enade{x}/3.DADOS/MICRODADOS_ENADE_{x}.txt" for x in years[:3]]
second_paths = [f"data/enade/enade{x}/2.DADOS/microdados_enade_{x}.csv" for x in years[3:]]
paths = [*first_paths, *second_paths]

path_2017, path_2014, path_2011, path_2008, path_2005 = paths
enade_2017 = get_processed_enade_2014_2017(path_2017)
enade_2014 = get_processed_enade_2014_2017(path_2014)
enade_2011 = get_processed_enade_2011(path_2011)
# The 2008 and 2005 loaders return a pair; only the first element is kept here.
enade_2008, _ = get_processed_enade_2008(path_2008)
enade_2005, _ = get_processed_enade_2005(path_2005)

dfs = [enade_2017, enade_2014, enade_2011, enade_2008, enade_2005]
display_dfs, presence_dfs = [], []

# Each entry of `dfs` is replaced by the frame returned by get_display_df.
for index, year in enumerate(years):
    enade_df = dfs[index]
    display_df, df, presence_df = get_display_df(year, enade_df, subject_df)
    dfs[index] = df
    display_dfs.append(display_df)
    presence_dfs.append(presence_df)
    
    

In [None]:
# Summary table at position 0 of display_dfs (built in the order of `years`, so 2017).
print("ANO de 2017")
display_dfs[0]


In [None]:
# Summary table at position 1 of display_dfs (built in the order of `years`, so 2014).
print("Ano de 2014")
display_dfs[1]

In [None]:
# Summary table at position 2 of display_dfs (built in the order of `years`, so 2011).
print("Ano de 2011")
display_dfs[2]

In [None]:
# Summary table at position 3 of display_dfs (built in the order of `years`, so 2008).
print("Ano de 2008")
display_dfs[3]

In [None]:
# Summary table at position 4 of display_dfs (built in the order of `years`, so 2005).
print("Ano de 2005")
display_dfs[4]

In [None]:
# "Nota %" column of each yearly summary, keyed by year label in ascending order.
# display_dfs is stored newest-first, hence the descending positions.
subject_score_by_year = {
    label: display_dfs[position]["Nota %"].copy()
    for position, label in zip(range(4, -1, -1),
                               ("2005", "2008", "2011", "2014", "2017"))
}
subject_score_by_year_df = pd.DataFrame(subject_score_by_year)
print("Nota % por ano")
subject_score_by_year_df

In [None]:
# Number of question instances per subject and year: the subject's question
# count multiplied by the number of students labelled "Presente" that year.
# The counts are read straight from presence_dfs (same order as `years`), so
# this cell does not depend on a frame that is only built further down.
num_questions_answered = []
for df, year, presence in zip(display_dfs, years, presence_dfs):
    # .get guards against a year in which no student is labelled "Presente".
    num_present = int(presence.get("Presente", 0))
    num_questions = df["Nº Questões"].astype(int) * num_present
    num_questions_answered.append(num_questions)

# Single concat instead of growing the Series one year at a time.
sum_questions_answered = pd.concat(num_questions_answered)
sum_questions_answered

In [None]:
# `subjects` must be assigned before it is used to build the column names.
subjects = util.get_subjects(subject_df)
columns = [f"ACERTOS_OBJ_{x}" for x in subjects]
print(len(columns))

# Stack every year's frame in one concat; the result is a new frame,
# so the entries of `dfs` are left untouched.
big_df = pd.concat(dfs, ignore_index=True)
big_df[columns].sum()

In [None]:
# Presence counts of each year, keyed by year label in ascending order.
# presence_dfs is stored newest-first, hence the descending positions.
presence_by_year = {
    label: presence_dfs[position].copy()
    for position, label in zip(range(4, -1, -1),
                               ("2005", "2008", "2011", "2014", "2017"))
}

presence_by_year_df = pd.DataFrame(presence_by_year)
# Displayed with missing counts shown as 0; presence_by_year_df itself keeps its NaNs.
presence_by_year_df.fillna(0).astype(int)


In [None]:
# Counts the valid objective questions of each subject; only the last
# subject's count survives the loop.
# NOTE(review): the original cell passed `input_df`, which does not exist at
# notebook scope, and had an over-indented last line. `dfs[0]` is assumed to be
# the intended frame (the next cell also inspects it) -- confirm.
for subject in subjects:
    subject_questions = util.get_subject_valid_questions(subject,
                                                         subject_df,
                                                         dfs[0],
                                                         just_objective=True)
    num_obj_subject_questions = len(subject_questions)

In [None]:
# Column-wise sum of the ACERTOS_OBJ_<subject> columns in the first frame of dfs.
subjects = util.get_subjects(subject_df)
columns = [f"ACERTOS_OBJ_{x}" for x in subjects]
dfs[0][columns].sum()

In [None]:
# Inspect the column labels of the first frame in dfs.
dfs[0].columns

In [None]:
# Subjects taken from the full subject_df (not filtered by year); later cells iterate over this.
subjects = util.get_subjects(subject_df)
subjects

In [None]:
# Set the year explicitly instead of relying on the loop variable left over
# from an earlier cell. The loader is the 2005 one, so 2005 is the value it had.
# NOTE(review): despite its name, `enade_2017_df` holds the frame produced by
# get_processed_enade_2005; the name is kept because later cells refer to it.
year = 2005
enade_2017_df, original_df = get_processed_enade_2005(f"data/enade/enade{year}/2.DADOS/microdados_enade_{year}.csv")
enade_2017_df.head()

In [None]:
# Rebinds enade_2017_df to the result of util.add_all_score_subjects.
# NOTE(review): this passes the full subject_df, whereas get_display_df filters
# it by year before the same call -- confirm the unfiltered table is intended.
enade_2017_df = util.add_all_score_subjects(enade_2017_df, subject_df, objective=True)

In [None]:
# First rows of the QUESTAO_1_NOTA .. QUESTAO_40_NOTA columns.
enade_2017_df.head()[[f"QUESTAO_{i}_NOTA" for i in range(1, 41)]]

In [None]:
# QUESTAO_11_NOTA .. QUESTAO_20_NOTA columns for all rows.
enade_2017_df[[f"QUESTAO_{x}_NOTA" for x in range(11, 11+10)]]

In [None]:
# QUESTAO_36_STATUS column for all rows.
enade_2017_df[f"QUESTAO_{36}_STATUS"]

In [None]:
# Per-subject student averages, together with the number of questions.
num_questions = []
mean_acertos_by_subject = []
std_acertos_by_subject = []
column_zero_subject = []
column_geq_one_subject = []
column_all_subject = []
num_alunos = enade_2017_df.shape[0]


def _share_of_students(mask):
    """Fraction of rows where `mask` is True; a plain 0 when there are none."""
    hits = mask.sum()
    return hits / num_alunos if hits else 0


for subject in subjects:
    subject_questions = util.get_subject_valid_questions(
        subject, subject_df, enade_2017_df, just_objective=True
    )
    num_obj_subject_questions = len(subject_questions)

    acertos = enade_2017_df[f"ACERTOS_OBJ_{subject}"]
    mean_acertos_subject = acertos.mean()
    std_acertos_subject = acertos.std()
    zero_subject = _share_of_students(acertos == 0)
    geq_one_subject = _share_of_students(acertos >= 1)
    all_subject = _share_of_students(acertos == num_obj_subject_questions)

    # Subjects without objective questions keep their row, with empty statistics.
    has_questions = num_obj_subject_questions > 0
    num_questions.append(num_obj_subject_questions if has_questions else 0)
    mean_acertos_by_subject.append(mean_acertos_subject if has_questions else None)
    std_acertos_by_subject.append(std_acertos_subject if has_questions else None)
    column_zero_subject.append(zero_subject if has_questions else None)
    column_geq_one_subject.append(geq_one_subject if has_questions else None)
    column_all_subject.append(all_subject if has_questions else None)

subjects_labels = [f"SCORE_OBJ_{x}" for x in subjects]
mean_by_subject = enade_2017_df[subjects_labels].mean().values

# Two-column base table: mean score and question count, one row per subject.
data = np.array([mean_by_subject, num_questions]).T
display_df = pd.DataFrame(data=data, index=subjects,
                          columns=["Nota %", "Nº Questões"])
display_df["Nº Questões"] = display_df["Nº Questões"].astype(int, errors="ignore")
display_df["Média Acertos"] = mean_acertos_by_subject
display_df["Desvio Padrão Acertos"] = std_acertos_by_subject

# Fractions are stored first and then scaled to percentages.
for label, fractions in (
    ("% de Zeros", column_zero_subject),
    ("% de Alunos que acertaram pelo menos uma questão", column_geq_one_subject),
    ("% de Alunos que acertaram todas", column_all_subject),
):
    display_df[label] = fractions
    display_df[label] = display_df[label] * 100

display_df = display_df.sort_values(by=["Nota %"]).round(2)
display_df


In [None]:
# Two columns of the second frame returned by get_processed_enade_2005.
# NOTE(review): the meaning of vt_esc_ofg / vt_esc_oce is not visible in this notebook -- confirm.
original_df[["vt_esc_ofg", "vt_esc_oce"]]

In [None]:
# Count the "BRANCO" and "RASURA" marks across the objective QUESTAO_<i>_NOTA columns.
objective_questions = subject_manipulation.get_objective_questions(subject_df)
nota_columns = [f"QUESTAO_{i}_NOTA" for i in objective_questions]
enade_2017_df_objective = enade_2017_df[nota_columns]

num_blank = 0
num_deletion = 0
for column in enade_2017_df_objective.columns:
    # One tally per column; a mark that never occurs contributes 0.
    answer_counts = enade_2017_df_objective[column].value_counts()
    num_blank += answer_counts.get("BRANCO", 0)
    num_deletion += answer_counts.get("RASURA", 0)

# Share of blank marks over every (student, objective question) cell.
total_cells = np.prod(enade_2017_df_objective.shape)
percentage_blank = round(num_blank * 100 / total_cells, 2)

print(f"{num_blank} questão/questões marcada(s) em branco, o que equivale a "
      f"{percentage_blank}% das questões objetivas")
print(f"{num_deletion} questão/questões rasurada(s)")