In [51]:
import pandas as pd

# Load the raw dataset and order the rows by student, then period, then discipline
df = pd.read_csv("datasets/original.csv").sort_values(
    by=["student_id", "period", "discipline"]
)

# Preview the first rows
df.head()

Unnamed: 0,id,student_id,birthdate,sex,city,course,period,week_day,discipline,status,g1,g2,final_grade,class_skips
2138,1,99951,1992-05-21 02:00:00.000 -0300,F,Faxinal do Soturno,Ciências Contábeis,2021/1,Quarta,Matemática Aplicada - 60,Aprovado,7.5,7.0,7.2,0
5665,3,99951,1992-05-21 02:00:00.000 -0300,F,Faxinal do Soturno,Ciências Contábeis,2021/2,Segunda,Contabilidade Intermediária - 60,Aprovado,7.2,8.9,8.0,0
9160,2,99951,1992-05-21 02:00:00.000 -0300,F,Faxinal do Soturno,Ciências Contábeis,2022/1,Terça,Contabilidade Avançada - 60,Trancado,-1.0,-1.0,-1.0,0
28710,4,99955,1989-01-06 04:00:00.000 -0200,M,Faxinal do Soturno,Ciências Contábeis,2025/1,Segunda,Contabilidade Introdutória - 60,Aprovado,8.4,8.8,8.6,0
28312,8,99955,1989-01-06 04:00:00.000 -0200,M,Faxinal do Soturno,Ciências Contábeis,2025/1,Quarta,Legislação e Ética Profissional - 30,Aprovado,9.8,9.9,9.8,0


In [52]:
# Remember the row count before any treatment so later cells can report how many rows were dropped
original_df_len = len(df.index)
print(f"Quantidade de registros: {original_df_len}")

Quantidade de registros: 35995


In [53]:
# Clean the "status" column: cast to text and lower-case it
status_lowercase = df["status"].astype(str).str.lower()
df["status_clean"] = status_lowercase
# Label-encode the cleaned status (integer code per distinct status value)
df["status_encoded"] = status_lowercase.astype("category").cat.codes


# Normalization of the grade columns
def normalize_grade_column(colum_name: str, frame: "pd.DataFrame | None" = None):
    """Add a ``<colum_name>_normalized`` column with the grade divided by 10.

    The source column is parsed with ``pd.to_numeric(errors="coerce")``, so
    values that cannot be parsed become NaN. The result is divided by 10 and
    rounded to 2 decimals; entries that end up as exactly ``-0.1`` (i.e. a
    ``-1`` in the source column) are replaced by ``0``.

    Parameters
    ----------
    colum_name : str
        Name of the grade column to normalize (parameter spelling kept as-is
        for backward compatibility).
    frame : pandas.DataFrame, optional
        Frame to modify in place. When omitted, the notebook-level ``df`` is
        used, which is what the original implementation always did.

    Returns
    -------
    None
        The target frame is modified in place.
    """
    target = df if frame is None else frame
    new_column_name = f"{ colum_name }_normalized"

    # Unparseable values become NaN, then scale and round to 2 decimals
    scaled = (pd.to_numeric(target[colum_name], errors='coerce') / 10).round(2)
    # "-0.1" values are replaced by 0
    scaled.loc[scaled == -0.1] = 0

    target[new_column_name] = scaled

# Normalize every grade column, in the same order as before
for grade_column in ("g1", "g2", "final_grade"):
    normalize_grade_column(grade_column)

# Treatment of the "discipline" column

# Step 1: drop the " - <credits>" suffix from the discipline name.
# Step 2: special case for "Gestão da Qualidade de Software", whose suffix is
#         written as " -72" and therefore survives step 1.
df["discipline_normalized"] = (
    df["discipline"]
    .str.split(" - ").str[0].str.strip()
    .str.split(" -72").str[0].str.strip()
)

# Drop rarely taken disciplines from the dataset

# A discipline is kept only when MORE than this many distinct students took it
MIN_STUDENTS_PER_DISCIPLINE = 10

# Count the distinct students in each discipline.
# NOTE(review): the count is keyed on the raw "discipline" label (credit suffix
# included), not on "discipline_normalized" — confirm this is intended.
discipline_counts = df.groupby("discipline")["student_id"].nunique()
# Popular disciplines: those with MORE students than the threshold
popular_disciplines = discipline_counts[discipline_counts > MIN_STUDENTS_PER_DISCIPLINE].index
# Keep ONLY the rows of popular disciplines. The explicit .copy() makes the
# result an independent frame, so the column assignments that follow do not
# raise SettingWithCopyWarning on a filtered slice.
df = df[df["discipline"].isin(popular_disciplines)].copy()


# Binary indicator columns derived from the cleaned status:
# each flag is 1 when "status_clean" is one of the listed values, else 0
status_flags = {
    "is_approved": ["aprovado"],
    "canceled_discipline": ["trancado", "cancelado"],
    "skipped_discipline": ["reprovado por frequência"],
}
for flag_column, matching_statuses in status_flags.items():
    df[flag_column] = df["status_clean"].isin(matching_statuses).astype(int)


# "sex_normalized": numeric encoding of biological sex.
# "M" -> 1, "F" -> 2; any other or missing value -> 0
sex_codes = {"M": 1, "F": 2}
df["sex_normalized"] = df["sex"].map(sex_codes).fillna(0).astype(int)


# Integer-encoded versions of "week_day", "course" and "period".
# pd.factorize numbers values from 0 in order of first appearance (missing
# values get -1), so after the +1 shift real values start at 1.
# NOTE(review): because codes follow row order, "period" codes are not
# guaranteed to be chronological — confirm whether an ordered encoding is needed.
for col in ("week_day", "course", "period"):
    codes = pd.factorize(df[col])[0]
    df[f"{ col }_normalized"] = codes + 1


# Treatment of the "birthdate" column.
# Only the date part (text before the first space) is kept, so the time of day
# and the original UTC offset of each value are discarded. Unparseable dates
# become NaT.
naive_datetime = pd.to_datetime(
    df["birthdate"].str.split(" ").str[0].str.strip(),
    errors='coerce',
)
# Attach a fixed GMT-3 offset to the naive (timezone-less) datetimes
df["birthdate_normalized"] = naive_datetime.dt.tz_localize('-03:00')


# "class_skips_normalized": absences divided by 4 and truncated to an integer.
# NOTE(review): presumably 4 is the number of class hours per day, turning
# skipped hours into skipped days — confirm.
skips_per_day = df["class_skips"] / 4
df["class_skips_normalized"] = skips_per_day.astype(int)


# Replace text labels by their categorical integer codes:
#   - "discipline_normalized" is overwritten with the codes of its own cleaned names
#   - "city_normalized" receives the codes of "city"
for target_column, source_column in (
    ("discipline_normalized", "discipline_normalized"),
    ("city_normalized", "city"),
):
    df[target_column] = df[source_column].astype("category").cat.codes


# Save the dataset holding both the original and the treated columns
df.to_csv(
    "datasets/original_and_treated.csv",
    index=False,
    encoding="utf-8",
)

# Preview the first rows
df.head()

Unnamed: 0,id,student_id,birthdate,sex,city,course,period,week_day,discipline,status,...,is_approved,canceled_discipline,skipped_discipline,sex_normalized,week_day_normalized,course_normalized,period_normalized,birthdate_normalized,class_skips_normalized,city_normalized
2138,1,99951,1992-05-21 02:00:00.000 -0300,F,Faxinal do Soturno,Ciências Contábeis,2021/1,Quarta,Matemática Aplicada - 60,Aprovado,...,1,0,0,2,1,1,1,1992-05-21 00:00:00-03:00,0,38
5665,3,99951,1992-05-21 02:00:00.000 -0300,F,Faxinal do Soturno,Ciências Contábeis,2021/2,Segunda,Contabilidade Intermediária - 60,Aprovado,...,1,0,0,2,2,1,2,1992-05-21 00:00:00-03:00,0,38
9160,2,99951,1992-05-21 02:00:00.000 -0300,F,Faxinal do Soturno,Ciências Contábeis,2022/1,Terça,Contabilidade Avançada - 60,Trancado,...,0,1,0,2,3,1,3,1992-05-21 00:00:00-03:00,0,38
28710,4,99955,1989-01-06 04:00:00.000 -0200,M,Faxinal do Soturno,Ciências Contábeis,2025/1,Segunda,Contabilidade Introdutória - 60,Aprovado,...,1,0,0,1,2,1,4,1989-01-06 00:00:00-03:00,0,38
28312,8,99955,1989-01-06 04:00:00.000 -0200,M,Faxinal do Soturno,Ciências Contábeis,2025/1,Quarta,Legislação e Ética Profissional - 30,Aprovado,...,1,0,0,1,1,1,4,1989-01-06 00:00:00-03:00,0,38


In [54]:
# Row count after the treatment, and how many rows were dropped along the way
original_and_treated_df_len = len(df.index)
print(
    f"Quantidade de registros: {original_and_treated_df_len}",
    f"Registros removidos durante o tratamento: {original_df_len - original_and_treated_df_len}",
    sep="\n",
)

Quantidade de registros: 35656
Registros removidos durante o tratamento: 339


In [55]:
# Columns carried into the treated dataset, in output order.
# "birthdate_normalized" is not included (its entry was commented out).
cols = [
    "id", "student_id",
    "sex_normalized", "city_normalized",
    "course_normalized", "period_normalized", "week_day_normalized",
    "discipline_normalized", "status_encoded",
    "g1_normalized", "g2_normalized", "final_grade_normalized",
    "canceled_discipline", "skipped_discipline",
    "class_skips_normalized", "is_approved",
]

# Treated column name -> final column name.
# NOTE(review): "city_normalized" has no entry here, so it is the only treated
# column that keeps its suffix in the output — confirm whether that is intended.
rename_map = dict(
    sex_normalized="sex",
    course_normalized="course",
    period_normalized="period",
    week_day_normalized="week_day",
    discipline_normalized="discipline",
    status_encoded="status",
    g1_normalized="g1",
    g2_normalized="g2",
    final_grade_normalized="final_grade",
    class_skips_normalized="class_skips",
)

# Work on a deep copy, keep only the wanted columns, then apply the renames
treated_df = (
    df.copy(deep=True)
    .loc[:, cols]
    .rename(columns=rename_map)
)

treated_df.head()

Unnamed: 0,id,student_id,sex,city_normalized,course,period,week_day,discipline,status,g1,g2,final_grade,canceled_discipline,skipped_discipline,class_skips,is_approved
2138,1,99951,2,38,1,1,1,276,0,0.75,0.7,0.72,0,0,0,1
5665,3,99951,2,38,1,2,2,62,0,0.72,0.89,0.8,0,0,0,1
9160,2,99951,2,38,1,3,3,60,8,0.0,0.0,0.0,1,0,0,0
28710,4,99955,1,38,1,4,2,64,0,0.84,0.88,0.86,0,0,0,1
28312,8,99955,1,38,1,4,1,251,0,0.98,0.99,0.98,0,0,0,1


In [56]:
# Save the dataset that contains only the treated columns
treated_df.to_csv(
    "datasets/original_treated.csv",
    index=False,
    encoding="utf-8",
)

In [57]:
print("Total de registros no dataset pré limpeza de outliers", len(treated_df))

# Find records approved via exam: flagged as approved although the normalized
# final grade is at or below 0.7.
# The mask is deliberately NOT named `filter`, which would shadow the Python
# builtin for the rest of the kernel session.
# NOTE(review): `<=` also selects records whose final grade is exactly 0.7 —
# confirm whether those should count as exam approvals.
exam_mask = (treated_df["final_grade"] <= 0.7) & (treated_df["is_approved"] == 1)
exam_students = treated_df[exam_mask]

# The figure printed below counts records (rows), not distinct students
print(f"Total de alunos aprovados em exame: {len(exam_students)}")

# Drop those records and rebuild a contiguous 0..n-1 index
treated_df = treated_df.drop(exam_students.index).reset_index(drop=True)

print("Total de registros no dataset pós limpeza de outliers", len(treated_df))

Total de registros no dataset pré limpeza de outliers 35656
Total de alunos aprovados em exame: 2675
Total de registros no dataset pós limpeza de outliers 32981
