In [1]:
# Move one directory up before importing: the cells below import from the `src` package,
# which presumably lives in the repository root rather than next to this notebook - TODO confirm.
%cd ../

/home/renan/git/KDD-Enade-Computing


In [2]:
import pandas as pd
import numpy as np
from sys import path
from src import util
from src.transformation import transform
from src import config

In [3]:
from src import config
from src import subject_manipulation
from textwrap import wrap
from typing import Tuple, List
from src.config import SUBJECT_DF_PATH, BLANK_ANSWER_LABEL, DELETION_ANSWER_LABEL, CANCELLED_LABEL, DIFFICULTIES, \
    MATH_SUBJECTS, COMPUTING_SUBJECTS, HUMAN_SUBJECTS, TECHNOLOGY_SUBJECTS
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import matplotlib.patches as mpatches
import matplotlib.ticker as mtick
from tqdm import tqdm

In [4]:
def remove_absent_students(dfs):
    """Replace every frame in ``dfs`` with its present-students-only version.

    ``dfs`` is a mapping (keyed by year in this notebook) of student DataFrames.
    For each frame, ``util.map_presence`` is called for its effect on the frame
    (its return value is not used) and the entry is then overwritten with the
    result of ``util.filter_present_students``.

    The mapping is modified in place and also returned for convenience.
    """
    # Snapshot the items so that re-assigning entries cannot disturb the iteration.
    for key, frame in list(dfs.items()):
        util.map_presence(frame)
        dfs[key] = util.filter_present_students(frame)
    return dfs

In [5]:
def get_display_df(year: int, input_df: pd.DataFrame,
                   subject_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build the per-subject summary table of objective-question results for one year.

    Args:
        year: exam year; only rows of ``subject_df`` whose ``'ano'`` equals it are used.
        input_df: per-student frame for that year. It is passed through
            ``util.add_all_score_categories`` and the returned frame is used from
            then on (expected to carry ``ACERTOS_OBJ_<subject>`` and
            ``SCORE_OBJ_<subject>`` columns, which are read below).
        subject_df: subject/question table for all years.

    Returns:
        A 2-tuple ``(display_df, input_df)``:

        * ``display_df`` is indexed by subject, sorted ascending by ``"Nota %"``
          and rounded to 2 decimals. Subjects without objective questions keep
          ``Nº Questões == 0`` and missing values in the statistic columns.
        * ``input_df`` is the frame returned by ``util.add_all_score_categories``.

    Raises:
        ZeroDivisionError: if ``input_df`` has no rows while at least one subject
            has objective questions (percentages are relative to the row count).
    """
    # TODO(original author): "this shouldn't stay here" - the per-year filtering
    # was flagged as something to move out of this function.
    subject_df_year = subject_df.loc[subject_df['ano'] == year].copy()

    year_subjects = util.get_subjects(subject_df_year)
    input_df = util.add_all_score_categories(input_df, subject_df_year, True,
                                             year_subjects, util.get_subject_valid_questions)

    num_alunos = input_df.shape[0]

    def percent_of_students(mask: pd.Series) -> float:
        # Same arithmetic as counting matches and dividing by the number of students.
        return int(mask.sum()) * 100 / num_alunos

    num_questions = []
    # One list per statistic column, keyed by the final column label (insertion
    # order is the column order of the resulting table).
    stats = {
        "Média Acertos": [],
        "Desvio Padrão Acertos": [],
        "% de Zeros": [],
        "% de Alunos que acertaram pelo menos uma questão": [],
        "% de Alunos que acertaram todas": [],
    }

    for subject in year_subjects:
        subject_questions = util.get_subject_valid_questions(subject,
                                                             subject_df_year,
                                                             input_df,
                                                             just_objective=True)
        num_obj_subject_questions = len(subject_questions)
        num_questions.append(num_obj_subject_questions)

        if num_obj_subject_questions == 0:
            # No objective questions for this subject: no statistic is meaningful.
            for values in stats.values():
                values.append(None)
            continue

        # Look the column up once instead of once per statistic.
        acertos = input_df[f"ACERTOS_OBJ_{subject}"]
        stats["Média Acertos"].append(acertos.mean())
        stats["Desvio Padrão Acertos"].append(acertos.std())
        stats["% de Zeros"].append(percent_of_students(acertos == 0))
        stats["% de Alunos que acertaram pelo menos uma questão"].append(
            percent_of_students(acertos >= 1))
        stats["% de Alunos que acertaram todas"].append(
            percent_of_students(acertos == num_obj_subject_questions))

    subject_labels = [f"SCORE_OBJ_{x}" for x in year_subjects]

    mean_by_subject = input_df[subject_labels].mean().values

    data = np.array([mean_by_subject, num_questions]).T

    display_df = pd.DataFrame(data=data, index=year_subjects,
                              columns=["Nota %", "Nº Questões"])
    # The array above is float; bring the question count back to integers.
    display_df["Nº Questões"] = display_df["Nº Questões"].astype(int, errors="ignore")
    for label, values in stats.items():
        display_df[label] = values

    display_df = display_df.sort_values(by=["Nota %"]).round(2)

    return display_df, input_df

In [6]:
def make_display_dfs(dfs_dict, subject_df):
    """Run ``get_display_df`` for every year in ``dfs_dict``.

    Returns ``(display_dfs_dict, dfs_dict)``: the first maps each year to its
    summary table, the second is the *same* ``dfs_dict`` object, whose entries
    have been overwritten in place with the frames returned by ``get_display_df``.
    """
    display_dfs_dict = {}
    # Iterate over a snapshot because entries of dfs_dict are reassigned in the loop.
    for year, year_frame in list(dfs_dict.items()):
        summary, scored_frame = get_display_df(year, year_frame, subject_df)
        dfs_dict[year] = scored_frame
        display_dfs_dict[year] = summary
    return display_dfs_dict, dfs_dict



In [7]:
def display_data_by_subject(year: int, dfs_dict, display_dfs) -> pd.DataFrame:
    """Print, plot and return the summary table of one year.

    Subjects with ``"Nº Questões" == 0`` are dropped first, and the filtered table
    is stored back into ``display_dfs[year]`` (the caller's mapping is modified).
    The filtered table is then handed to ``bar_graph_display_df``, and the score
    produced by ``return_poscomp_score`` is drawn with ``make_radar_plot``.

    NOTE(review): ``bar_graph_display_df``, ``return_poscomp_score`` and
    ``make_radar_plot`` are not defined or imported in the visible part of this
    notebook - confirm they exist before calling this function on a fresh kernel.
    """
    print(f"ANO de {year}")
    year_table = display_dfs[year]
    with_questions = year_table.loc[year_table["Nº Questões"] != 0]
    display_dfs[year] = with_questions

    bar_graph_display_df(with_questions)
    poscomp_score = return_poscomp_score(dfs_dict[year], with_questions)
    make_radar_plot(poscomp_score, "Percentual de Acerto por Área no ENADE")
    return with_questions

In [8]:
def display_all_years_nota(all_dfs: dict,
                           years: Tuple[int, ...] = (2005, 2008, 2011, 2014, 2017)) -> pd.DataFrame:
    """Put the ``"Nota %"`` column of several yearly summary tables side by side.

    Args:
        all_dfs: mapping year -> summary DataFrame containing a ``"Nota %"`` column.
        years: years to include, in output column order. Defaults to the five
            years that were previously hard-coded, so existing calls are unchanged.

    Returns:
        DataFrame with one column per requested year (column label is the year as
        a string), aligned on the union of the tables' indexes.

    Raises:
        KeyError: if a requested year is missing from ``all_dfs`` or a table has
            no ``"Nota %"`` column.
    """
    data = {str(year): all_dfs[year]["Nota %"].copy() for year in years}
    return pd.DataFrame(data)

In [9]:
"""Numero de questões respondidas por categoria considerando todos os anos
    
    contador para cada categoria iniciado em 0
    para cada prova
        para cada categoria
            multiplicar numero de presentes pelo numero de questões da categoria
            incrementar o contador da categoria
     """

def return_num_answered(display_dfs, original_dfs):
    """Total number of (question, student) pairs per category over all years.

    For every year, the ``"Nº Questões"`` column of that year's summary table is
    multiplied by the number of rows of the matching frame in ``original_dfs``.
    The yearly results are aligned by category, missing categories count as 0,
    and the per-category totals are returned as a Series sorted by index.
    """
    answered_per_year = [
        summary["Nº Questões"].astype(int) * original_dfs[year].shape[0]
        for year, summary in display_dfs.items()
    ]
    answered_table = pd.concat(answered_per_year, axis=1)
    totals = answered_table.fillna(0).sum(axis=1)
    return totals.sort_index()

"""Numero de questões respondidas corretamente por categoria considerando todos os anos
    
    contador para cada categoria iniciado em 0
    para cada prova
        para cada categoria
            multiplicar numero de presentes pelo numero de questões acertadas da categoria
            incrementar o contador da categoria
     """

def return_num_correct(dfs, categories):
    """Total number of correct answers per category over all years.

    All frames in ``dfs`` are stacked row-wise, the ``ACERTOS_OBJ_<category>``
    columns are summed, and the result is returned as a Series indexed by those
    column names, sorted by index.
    """
    wanted_columns = ["ACERTOS_OBJ_{}".format(category) for category in categories]
    all_students = pd.concat(dfs.values(), ignore_index=True)
    totals = all_students[wanted_columns].sum(0)
    return totals.sort_index()
    

def get_all_years_combined_df(display_dfs, dfs, categories):
    """Combine answered and correct counts of all years into one score table.

    Args:
        display_dfs: mapping year -> per-subject summary table (forwarded to
            ``return_num_answered``).
        dfs: mapping year -> per-student frame (forwarded to both helpers).
        categories: subject names used to build the ``ACERTOS_OBJ_<subject>``
            column names summed by ``return_num_correct``.

    Returns:
        DataFrame with the total answered count, the total correct count (both
        integer columns) and ``"Nota %"`` (correct * 100 / answered, rounded to 2
        decimals), sorted ascending by ``"Nota %"``.

    Raises:
        ValueError: if the two helper results have different lengths (raised by
            the positional index assignment below).
    """
    answered_col = "Questões Respondidas no total (Questões x Alunos)"
    correct_col = "Acertos no Total (Acertos x Alunos)"

    num_answered = return_num_answered(display_dfs, dfs)
    num_correct = return_num_correct(dfs, categories)
    # Positional relabel: both Series are sorted by index, and the correct-count
    # labels are the answered labels prefixed with "ACERTOS_OBJ_", so the i-th
    # entries are assumed to describe the same subject.
    # NOTE(review): this silently mislabels if the two label sets differ but have
    # the same length - confirm `categories` matches the subjects in display_dfs.
    num_correct.index = num_answered.index

    score_df_category = pd.DataFrame({answered_col: num_answered,
                                      correct_col: num_correct})
    # Compute the percentage from the unrounded counts, then round for display.
    score_df_category["Nota %"] = (
        score_df_category[correct_col] * 100 / score_df_category[answered_col]
    ).round(2)
    # Cast by label: `df.iloc[:, i] = values.astype(int)` writes into the existing
    # float column on pandas >= 2.0 and therefore leaves the dtype unchanged.
    score_df_category = score_df_category.astype({answered_col: int, correct_col: int})

    return score_df_category.sort_values(by=["Nota %"])

In [10]:
# Subject table loaded from SUBJECT_DF_PATH. It is kept at notebook level because
# get_course_score_by_subject (below) and later cells read it as a global.
subject_df = subject_manipulation.get_processed_subject_df(SUBJECT_DF_PATH)

def get_course_score_by_subject(code_course):
    """Return the all-years ``"Nota %"`` per subject for a single course.

    The yearly frames are loaded with ``util.get_dict_all_years`` filtered by
    ``code_course``, absent students are removed, and the yearly results are
    combined with ``get_all_years_combined_df``. Reads the notebook-level
    ``subject_df``.

    Raises:
        ValueError: with message ``"Empty"`` when any year has no remaining
            (present) students for this course.
    """
    course_dfs = remove_absent_students(
        util.get_dict_all_years(filter_by_course=code_course)
    )
    # A course must have at least one present student in every year.
    if any(frame.shape[0] == 0 for frame in course_dfs.values()):
        raise ValueError("Empty")

    course_display_dfs, course_dfs = make_display_dfs(course_dfs, subject_df)
    combined = get_all_years_combined_df(course_display_dfs, course_dfs,
                                         util.get_subjects(subject_df))
    return combined["Nota %"]

In [11]:
# Load the frames of every year without restricting to one course
# (filter_by_course=False); the result is indexed by year in the next cell.
dfs_dict_course = util.get_dict_all_years(filter_by_course=False)

In [12]:
# Courses (CO_CURSO) that appear in every year: start from the 2017 set and
# intersect it with each earlier year in turn.
valid_courses = set(dfs_dict_course[2017]["CO_CURSO"].unique())
for year in (2014, 2011, 2008, 2005):
    # Printed before intersecting, i.e. the number of courses still shared so far.
    print(len(valid_courses))
    year_courses = set(dfs_dict_course[year]["CO_CURSO"].unique())
    valid_courses &= year_courses
len(valid_courses)

311
260
226
170


147

In [23]:
# One row of all-years "Nota %" per subject for every course in valid_courses.
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame per call.
course_rows = []
skipped_courses = []  # courses for which no score row could be computed
for code_course in tqdm(list(valid_courses)):
    try:
        course_scores = get_course_score_by_subject(code_course).sort_index().rename(code_course)
    except ValueError:
        # get_course_score_by_subject raises ValueError("Empty") when a year has
        # no present students. Keep skipping such courses, but remember which ones.
        # NOTE(review): any other ValueError raised underneath is skipped too.
        skipped_courses.append(code_course)
        continue
    course_rows.append(course_scores)

# Row labels come from each Series' name (the course code); columns are the subjects.
result_df = pd.DataFrame(course_rows, columns=util.get_subjects(subject_df))
result_df

100%|██████████| 147/147 [23:28<00:00,  9.58s/it]


Unnamed: 0,Algoritmos e estruturas de dados,Arquitetura de computadores e sistemas operacionais,Banco de dados,Compiladores,Computação gráfica e processamento de imagem,Engenharia de software e interação homem-computador,Fundamentos e técnicas de programação,Inteligência artificial e computacional,Lógica e matemática discreta,Paradigmas de linguagens de programação,Probabilidade e estatística,Redes de computadores,Sistemas digitais,Sistemas distribuídos,Teoria da computação,Teoria dos grafos,"Ética, computador e sociedade"
11238,33.40,42.05,44.95,40.07,32.52,47.46,31.67,42.05,38.86,37.99,35.96,37.27,23.70,34.62,25.18,32.24,69.82
18974,40.49,43.36,49.53,52.65,32.89,54.68,37.48,57.69,40.31,45.35,41.58,34.76,33.64,23.47,33.12,31.92,72.47
36,38.37,40.91,27.03,49.78,28.85,50.00,36.70,57.35,34.28,33.78,34.44,27.62,25.97,20.45,34.97,26.92,69.91
13595,50.76,49.71,53.90,61.90,44.83,64.74,41.13,63.74,54.42,43.93,40.31,42.08,47.44,62.39,43.04,37.80,76.43
12025,34.17,46.93,35.86,34.73,31.44,53.46,32.03,45.88,32.82,35.66,29.21,36.55,32.04,41.18,25.97,29.20,64.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66272,19.42,30.30,32.67,22.59,27.18,34.64,23.95,45.83,24.85,26.86,33.76,21.60,25.16,25.29,19.62,17.51,57.71
47603,23.94,35.23,22.41,26.06,27.27,36.08,29.81,48.98,24.39,32.56,35.59,24.53,20.54,13.33,20.49,14.63,55.70
21271,40.80,50.84,48.11,41.52,38.60,49.73,41.46,50.56,50.00,47.73,31.88,31.34,40.18,50.00,37.62,24.00,52.86
3211,30.91,34.53,39.22,34.94,20.00,47.22,28.78,35.66,38.56,42.97,13.76,16.99,28.87,34.29,20.90,25.61,44.26


In [24]:

# For every subject (column of result_df), find the course with the highest score.
codes = [result_df[subject].idxmax() for subject in result_df.columns]
results = [result_df[subject].max() for subject in result_df.columns]

# Course codes of the winners mapped to institution acronyms for display.
course_acronyms = {"12946": "UFMG",
                   "15869": "UFMS",
                   "13717": "UFRGS",
                   "13980": "UFC",
                   "13595": "UFPE",
                   "18559": "UESPI",
                   "7511": "PUC-Minas"}

display_df = pd.DataFrame({"disciplinas": result_df.columns, "curso": codes, "resultado": results})
display_df["curso"] = display_df["curso"].replace(course_acronyms)
display_df

Unnamed: 0,disciplinas,curso,resultado
0,Algoritmos e estruturas de dados,UFMS,61.63
1,Arquitetura de computadores e sistemas operaci...,UFRGS,65.56
2,Banco de dados,UFMG,64.96
3,Compiladores,UFC,73.33
4,Computação gráfica e processamento de imagem,UFRGS,52.96
5,Engenharia de software e interação homem-compu...,UFPE,64.74
6,Fundamentos e técnicas de programação,UFMG,51.92
7,Inteligência artificial e computacional,UFMG,64.91
8,Lógica e matemática discreta,UFC,57.98
9,Paradigmas de linguagens de programação,UFMG,59.32


In [45]:
# Percentile rank of course "13717" within each subject column (rank(pct=True) ranks
# every column across all courses), rounded and sorted from its weakest to its
# strongest subject. "13717" is the code labelled "UFRGS" in the acronym mapping of the previous cell.
result_df.rank(pct=True).loc['13717'].round(2).sort_values()

Ética, computador e sociedade                          0.92
Teoria da computação                                   0.97
Sistemas digitais                                      0.97
Probabilidade e estatística                            0.97
Fundamentos e técnicas de programação                  0.98
Algoritmos e estruturas de dados                       0.98
Engenharia de software e interação homem-computador    0.98
Lógica e matemática discreta                           0.98
Inteligência artificial e computacional                0.99
Teoria dos grafos                                      0.99
Paradigmas de linguagens de programação                0.99
Compiladores                                           0.99
Banco de dados                                         0.99
Sistemas distribuídos                                  0.99
Computação gráfica e processamento de imagem           1.00
Redes de computadores                                  1.00
Arquitetura de computadores e sistemas o

In [14]:
# Sanity check for a single course ("14137"): print the (rows, columns) shape of each
# year's frame after absent students are removed. A year with 0 rows is exactly the
# case that makes get_course_score_by_subject raise ValueError("Empty").
a = util.get_dict_all_years(filter_by_course="14137")
a = remove_absent_students(a)
for year in a.keys():
    print(a[year].shape)

(33, 270)
(0, 274)
(15, 235)
(16, 318)
(14, 330)


In [15]:
# All-years "Nota %" per subject for course "12025", ordered alphabetically by subject.
get_course_score_by_subject("12025").sort_index()

Algoritmos e estruturas de dados                       34.17
Arquitetura de computadores e sistemas operacionais    46.93
Banco de dados                                         35.86
Compiladores                                           34.73
Computação gráfica e processamento de imagem           31.44
Engenharia de software e interação homem-computador    53.46
Fundamentos e técnicas de programação                  32.03
Inteligência artificial e computacional                45.88
Lógica e matemática discreta                           32.82
Paradigmas de linguagens de programação                35.66
Probabilidade e estatística                            29.21
Redes de computadores                                  36.55
Sistemas digitais                                      32.04
Sistemas distribuídos                                  41.18
Teoria da computação                                   25.97
Teoria dos grafos                                      29.20
Ética, computador e soci