In [1]:
import pandas as pd
import numpy as np
import sys

sys.path.append("../")

from src import util
from src.transformation import transform
from src import config
from src import subject_manipulation
from typing import Tuple, List
from src.config import SUBJECT_DF_PATH, BLANK_ANSWER_LABEL, DELETION_ANSWER_LABEL, CANCELLED_LABEL, DIFFICULTIES, \
    MATH_SUBJECTS, COMPUTING_SUBJECTS, HUMAN_SUBJECTS, TECHNOLOGY_SUBJECTS
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import matplotlib.patches as mpatches
import os

In [2]:
def read_csv_course(year, filter_by_course):
    """Read the data for ``year`` and optionally keep only one course.

    Args:
        year: Exam year forwarded to ``transform.read_csv``.
        filter_by_course: When truthy, keep only the rows whose ``CO_CURSO``
            equals ``config.CODE_COURSE``; otherwise return every row.

    Returns:
        The frame produced by ``transform.read_csv``, filtered if requested.
    """
    all_rows = transform.read_csv(year)
    if not filter_by_course:
        return all_rows
    is_target_course = all_rows["CO_CURSO"] == config.CODE_COURSE
    return all_rows.loc[is_target_course]

In [3]:
# Exam years loaded by get_dict_all_years (one frame per year).
years = [2017, 2014, 2011, 2008, 2005]

In [4]:
def get_dict_all_years(filter_by_course: bool) -> dict:
    """Load the data of every year in the module-level ``years`` list.

    Args:
        filter_by_course: Forwarded to ``read_csv_course`` for each year.

    Returns:
        A dict mapping each year to the frame returned by ``read_csv_course``,
        in the same order as ``years``.
    """
    return {
        year: read_csv_course(year, filter_by_course=filter_by_course)
        for year in years
    }

In [5]:
def display_all_years_nota(all_dfs: dict) -> pd.DataFrame:
    """Put the "Nota %" column of each exam year side by side.

    Args:
        all_dfs: Dict keyed by the integer years 2005, 2008, 2011, 2014 and
            2017; each value must have a "Nota %" column.

    Returns:
        A DataFrame with one column per year (labelled with the year as a
        string, in ascending order), aligned on the index of the inputs.
    """
    ordered_years = (2005, 2008, 2011, 2014, 2017)
    nota_by_year = {}
    for year in ordered_years:
        nota_by_year[str(year)] = all_dfs[year]["Nota %"].copy()
    return pd.DataFrame(nota_by_year)

In [6]:
"""Numero de questões respondidas por categoria considerando todos os anos
    
    contador para cada categoria iniciado em 0
    para cada prova
        para cada categoria
            multiplicar numero de presentes pelo numero de questões da categoria
            incrementar o contador da categoria
     """

def return_num_answered(display_dfs, original_dfs, multiply_by_students=True):
    """Total the number of questions per category over all exam years.

    Args:
        display_dfs: Dict mapping year to a frame indexed by category that has
            a "Nº Questões" column.
        original_dfs: Dict mapping year to the student-level frame; only its
            row count is used.
        multiply_by_students: When true, each year's question count is
            multiplied by that year's number of rows in ``original_dfs``.

    Returns:
        A Series indexed by category (sorted), summing the per-year counts;
        a category missing from a year contributes 0 for that year.
    """
    per_year_counts = []
    for year, display_df in display_dfs.items():
        counts = display_df["Nº Questões"].astype(int)
        if multiply_by_students:
            counts = counts * original_dfs[year].shape[0]
        per_year_counts.append(counts)

    counts_by_year = pd.concat(per_year_counts, axis=1)
    totals = counts_by_year.fillna(0).sum(axis=1)
    return totals.sort_index()

"""Numero de questões respondidas corretamente por categoria considerando todos os anos
    
    contador para cada categoria iniciado em 0
    para cada prova
        para cada categoria
            multiplicar numero de presentes pelo numero de questões acertadas da categoria
            incrementar o contador da categoria
     """

def return_num_correct(dfs, categories):
    """Total the correct-answer counts per category over all exam years.

    Args:
        dfs: Dict mapping year to a student-level frame containing one
            ``ACERTOS_OBJ_<category>`` column per category.
        categories: Iterable of category names used to build the column names.

    Returns:
        A Series indexed by the ``ACERTOS_OBJ_<category>`` column names
        (sorted), holding each column's sum over the rows of every year.
    """
    wanted_columns = ["ACERTOS_OBJ_" + f"{category}" for category in categories]
    stacked_years = pd.concat(list(dfs.values()), ignore_index=True)
    column_totals = stacked_years[wanted_columns].sum(axis=0)
    return column_totals.sort_index()
    

def get_all_years_combined_df(display_dfs, dfs, categories):
    """Build the all-years score table per category.

    Args:
        display_dfs: Dict mapping year to the per-category summary frame
            (passed to ``return_num_answered``).
        dfs: Dict mapping year to the student-level frame (passed to both
            ``return_num_answered`` and ``return_num_correct``).
        categories: Category names passed to ``return_num_correct``.

    Returns:
        A DataFrame indexed by category with the total questions answered,
        the total correct answers (both as integers) and "Nota %" (correct
        answers as a percentage of answered, rounded to 2 decimals), sorted
        ascending by "Nota %".
    """
    answered_label = "Questões Respondidas no total (Questões x Alunos)"
    correct_label = "Acertos no Total (Acertos x Alunos)"

    num_answered = return_num_answered(display_dfs, dfs)
    num_correct = return_num_correct(dfs, categories)
    # Positional relabel: both series are sorted by their own index, so this
    # relies on the category labels and the "ACERTOS_OBJ_<category>" labels
    # sorting in the same order.
    num_correct.index = num_answered.index

    score_df_category = pd.DataFrame({answered_label: num_answered,
                                      correct_label: num_correct})
    score_df_category["Nota %"] = (
        score_df_category[correct_label] * 100 / score_df_category[answered_label]
    ).round(2)

    # Assign by column label rather than through ``.iloc[:, i] = ...``: on
    # pandas >= 2.0 the iloc form writes in place and keeps the float dtype,
    # so the integer cast would be silently lost.
    score_df_category[answered_label] = score_df_category[answered_label].astype(int)
    score_df_category[correct_label] = score_df_category[correct_label].astype(int)

    return score_df_category.sort_values(by=["Nota %"])

In [7]:
# Load the per-question difficulty table. The path is relative, so it resolves
# against the kernel's current working directory.
path = os.path.join("data", "raw_data", "dificuldade_questao.csv")
difficulty_df = pd.read_csv(path)

In [8]:
def _percentage_of_students(mask: pd.Series, num_students: int):
    """Return the percentage (0-100) of True entries in ``mask``, or None when there are no students."""
    if num_students == 0:
        return None
    return int(mask.sum()) * 100 / num_students


def get_display_df_difficulty(year: int, input_df: pd.DataFrame,
                              difficulty_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build the per-difficulty summary table for one exam year.

    Args:
        year: Exam year; selects the rows of ``difficulty_df`` whose ``ano``
            column equals it.
        input_df: Student-level frame for that year. It is passed to
            ``util.map_presence`` before being filtered.
        difficulty_df: Question-difficulty table covering all years.

    Returns:
        A tuple ``(display_df, input_df)``:

        * ``display_df`` is indexed by the entries of ``DIFFICULTIES`` with the
          columns "Nota %", "Nº Questões", "Média Acertos",
          "Desvio Padrão Acertos", "% de Zeros",
          "% de Alunos que acertaram pelo menos uma questão" and
          "% de Alunos que acertaram todas", sorted ascending by "Nota %" and
          rounded to 2 decimals. Categories without valid questions get 0
          questions and ``None`` for the statistics.
        * ``input_df`` is the frame returned by ``util.filter_present_students``
          followed by ``util.add_all_score_categories``.
    """
    temp_difficulty_df = difficulty_df.loc[difficulty_df['ano'] == year].copy()

    # The return value is discarded, so map_presence presumably recodes the
    # presence column on input_df in place -- TODO confirm against src.util.
    util.map_presence(input_df)
    input_df = util.filter_present_students(input_df)
    input_df = util.add_all_score_categories(input_df, temp_difficulty_df, True,
                                             DIFFICULTIES, util.get_difficulty_valid_questions)

    categories = DIFFICULTIES
    num_alunos = input_df.shape[0]

    num_questions = []
    mean_acertos = []
    std_acertos = []
    column_zero = []
    column_geq_one = []
    column_all = []

    for category in categories:
        questions = util.get_difficulty_valid_questions(category,
                                                        temp_difficulty_df,
                                                        input_df)
        num_obj_questions = len(questions)

        if num_obj_questions > 0:
            # Look the column up once; every statistic below is derived from it.
            acertos = input_df[f"ACERTOS_OBJ_{category}"]
            num_questions.append(num_obj_questions)
            mean_acertos.append(acertos.mean())
            std_acertos.append(acertos.std())
            column_zero.append(_percentage_of_students(acertos == 0, num_alunos))
            column_geq_one.append(_percentage_of_students(acertos >= 1, num_alunos))
            column_all.append(_percentage_of_students(acertos == num_obj_questions, num_alunos))
        else:
            # No valid question in this category: keep the row, leave the stats empty.
            num_questions.append(0)
            mean_acertos.append(None)
            std_acertos.append(None)
            column_zero.append(None)
            column_geq_one.append(None)
            column_all.append(None)

    category_labels = [f"SCORE_OBJ_{x}" for x in categories]

    mean_by_category = input_df[category_labels].mean().values

    data = np.array([mean_by_category, num_questions]).T

    display_df = pd.DataFrame(data=data, index=categories,
                              columns=["Nota %", "Nº Questões"])
    display_df["Nº Questões"] = display_df["Nº Questões"].astype(int, errors="ignore")
    display_df["Média Acertos"] = mean_acertos
    display_df["Desvio Padrão Acertos"] = std_acertos
    display_df["% de Zeros"] = column_zero
    display_df["% de Alunos que acertaram pelo menos uma questão"] = column_geq_one
    display_df["% de Alunos que acertaram todas"] = column_all

    display_df = display_df.sort_values(by=["Nota %"]).round(2)

    return display_df, input_df


In [9]:
# Load every exam year twice: once restricted to config.CODE_COURSE and once
# unfiltered (the "Brasil" tables below).
enade_all_years_course = get_dict_all_years(filter_by_course=True)
enade_all_years_br = get_dict_all_years(filter_by_course=False)

FileNotFoundError: [Errno 2] No such file or directory: 'data/transformed_data/microdados_transformados_2017.csv'

In [None]:
def get_display_dfs_difficulty(enade_all_years_dict: dict) -> dict:
    """Compute the per-difficulty summary table for every year in the dict.

    Note that ``enade_all_years_dict`` is updated in place: each year's frame
    is replaced by the processed frame returned by
    ``get_display_df_difficulty``.

    Args:
        enade_all_years_dict: Dict mapping year to the student-level frame.

    Returns:
        A dict mapping each year to its summary (display) frame.
    """
    summaries = {}
    # Only values of existing keys are reassigned, so iterating items() is safe.
    for year, year_df in enade_all_years_dict.items():
        summary_df, processed_df = get_display_df_difficulty(year, year_df, difficulty_df)
        enade_all_years_dict[year] = processed_df
        summaries[year] = summary_df
    return summaries

# Build the per-year summary tables. These calls also replace the frames inside
# enade_all_years_course / enade_all_years_br with their processed versions,
# so re-running this cell without reloading the data processes them again.
display_dfs_difficulty_course = get_display_dfs_difficulty(enade_all_years_course)
display_dfs_difficulty_br = get_display_dfs_difficulty(enade_all_years_br)

In [None]:

# Per-difficulty summary for the course-filtered data, 2017 exam.
print("Curso - 2017")
display_dfs_difficulty_course[2017]

In [None]:
# Per-difficulty summary for the unfiltered data, 2017 exam.
print("Brasil - 2017")
display_dfs_difficulty_br[2017]


In [None]:
# Per-difficulty summary for the course-filtered data, 2014 exam.
print("Curso - 2014")
display_dfs_difficulty_course[2014]

In [None]:
# Per-difficulty summary for the unfiltered data, 2014 exam.
print("Brasil - 2014")

display_dfs_difficulty_br[2014]

In [None]:
# Per-difficulty summary for the course-filtered data, 2011 exam.
print("Curso - 2011")
display_dfs_difficulty_course[2011]

In [None]:
# Per-difficulty summary for the unfiltered data, 2011 exam.
print("Brasil - 2011")

display_dfs_difficulty_br[2011]

In [None]:
# Per-difficulty summary for the course-filtered data, 2008 exam.
print("Curso - 2008")
display_dfs_difficulty_course[2008]

In [None]:
# Per-difficulty summary for the unfiltered data, 2008 exam.
print("Brasil - 2008")

display_dfs_difficulty_br[2008]

In [None]:
# Per-difficulty summary for the course-filtered data, 2005 exam.
print("Curso - 2005")
display_dfs_difficulty_course[2005]

In [None]:
# Per-difficulty summary for the unfiltered data, 2005 exam.
print("Brasil - 2005")

display_dfs_difficulty_br[2005]

In [None]:
# "Nota %" of each difficulty, one column per exam year, course-filtered data.
print("Nota % por ano - Curso")
display_all_years_nota(display_dfs_difficulty_course)

In [None]:
# "Nota %" of each difficulty, one column per exam year, unfiltered data.
print("Nota % por ano - Brasil")
display_all_years_nota(display_dfs_difficulty_br)

In [None]:
# All exam years combined into one per-difficulty table for the unfiltered data.
# The result is sorted ascending by "Nota %".
print("Brasil")
difficulty_all_br_df = get_all_years_combined_df(display_dfs_difficulty_br, enade_all_years_br, DIFFICULTIES)
difficulty_all_br_df

In [None]:
# All exam years combined into one per-difficulty table for the course-filtered
# data. The result is sorted ascending by "Nota %".
print("Curso")
difficulty_all_course_df = get_all_years_combined_df(display_dfs_difficulty_course, enade_all_years_course, DIFFICULTIES)
difficulty_all_course_df

In [None]:
# Grouped bar chart: percentage of correct answers per difficulty level,
# Brazil (black) next to the course (gray), over all exam years combined.
br = difficulty_all_br_df["Nota %"]
# Each combined table is sorted by its own "Nota %", so the two series can list
# the difficulties in different orders. Align the course series to Brazil's
# index so every pair of bars (and its tick label) refers to the same difficulty.
course = difficulty_all_course_df["Nota %"].reindex(br.index)
N = len(br)        # number of difficulty groups actually present
ind = np.arange(N)    # the x locations for the groups
width = 0.35       # the width of the bars: can also be len(x) sequence

ax = plt.subplot(111)
p2 = ax.bar(ind - width/2, br, width, color="black")
p1 = ax.bar(ind + width/2, course, width, color="gray")


plt.ylabel('%')
plt.title('Percentual de acerto no ENADE por dificuldade da questão (considerando todos os anos do exame)')
plt.xticks(ind, br.index)
plt.legend((p1[0], p2[0]), ('Curso', 'Brasil'))
ax.grid(axis = 'y', color ='white', linestyle='-')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.show()