<a href="https://colab.research.google.com/github/repitta/CienciaDeDadosEducacionais/blob/master/artigoOrientacaoAcademica/orientacao_academica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 2.4.1 Load Libraries

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pydotplus
from IPython.display import Image
from sklearn import tree
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
import time
from sklearn.neural_network import MLPClassifier

### 2.4.2 Get Data

In [0]:
# The source CSV has no missing values.
table = pd.read_csv("https://raw.githubusercontent.com/repitta/CienciaDeDadosEducacionais/master/dadosUFRN/dados%20tratados%20ufrn/disciplinasECT201520191.csv")
# Keep only the ECT2203 rows and discard the identifier/bookkeeping columns.
# drop() returns a new frame here, so no filtered selection is mutated in place
# (in-place edits on a filtered selection can trigger SettingWithCopyWarning).
table = table.loc[table["disciplina"] == "ECT2203"].drop(
    columns=["Unnamed: 0", "matricula", "disciplina"]
)
table.head()

Unnamed: 0,media,min,max,QuantDisciCursadas,quantAprovado,quantReprovado,carga_total,situacao,carga_total_atual,QuantDisciAtual,AnosMatriculado,vezesReprovado
9,1.72,0.0,5.3,5,1,4,285,REPROVADO,465,9,6,2
472,3.36,1.4,5.0,5,2,3,355,REPROVADO,245,4,3,1
586,3.86,0.0,7.8,5,2,3,355,APROVADO,365,6,3,1
623,3.3,0.0,6.6,2,1,1,140,REPROVADO,215,4,3,1
661,5.8,5.8,5.8,1,1,0,30,REPROVADO,255,4,3,1


In [0]:
# Show column dtypes and non-null counts of the filtered frame.
table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2932 entries, 9 to 52487
Data columns (total 12 columns):
media                 2932 non-null float64
min                   2932 non-null float64
max                   2932 non-null float64
QuantDisciCursadas    2932 non-null int64
quantAprovado         2932 non-null int64
quantReprovado        2932 non-null int64
carga_total           2932 non-null int64
situacao              2932 non-null object
carga_total_atual     2932 non-null int64
QuantDisciAtual       2932 non-null int64
AnosMatriculado       2932 non-null int64
vezesReprovado        2932 non-null int64
dtypes: float64(3), int64(8), object(1)
memory usage: 297.8+ KB


### 2.4.3 Clean, prepare and manipulate Data (feature engineering)

### 2.4.3.1 Numerical Pipeline

In [0]:
# Bins the grade/workload columns into one-hot bands and appends the numeric count features
class NumericalTransformer(BaseEstimator, TransformerMixin):
  """Stateless feature builder for the student-history table.

  ``transform`` returns a DataFrame with, in this column order:
    * one-hot bands for ``media``, ``min`` and ``max`` (grade cut points),
    * one-hot bands for ``carga_total`` and ``carga_total_atual`` (workload cut points),
    * ``tax_suc`` = quantAprovado / QuantDisciCursadas,
    * the raw columns QuantDisciCursadas, quantAprovado, quantReprovado,
      QuantDisciAtual and AnosMatriculado.

  Every banded column is a categorical with a fixed list of five labels, so it
  always expands to five dummy columns regardless of which bands occur in the
  data. Values outside the cut points are banded as NaN and therefore get a
  zero in all five dummy columns.

  Parameters
  ----------
  model : any, default 0
      Stored on the instance but not read by any method of this class.
  """

  # Right-closed bin edges: (-1, 3], (3, 5], (5, 7], (7, 8], (8, 11]
  _GRADE_CUT_POINTS = [-1, 3, 5, 7, 8, 11]
  # Right-closed bin edges: (0, 100], (100, 200], (200, 300], (300, 400], (400, 600]
  _LOAD_CUT_POINTS = [0, 100, 200, 300, 400, 600]
  # Band labels are runtime strings: they become part of the dummy column names.
  _BAND_LABELS = ["muito baixo", "baixo", "media", "alto", "muito alto"]
  # Columns copied to the output unchanged, in output order.
  _RAW_COLUMNS = ["QuantDisciCursadas", "quantAprovado", "quantReprovado",
                  "QuantDisciAtual", "AnosMatriculado"]

  def __init__(self, model=0):
    self.model = model

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns ``self`` for pipeline chaining."""
    return self

  def create_dummies(self, df, column_name, drop_first_col):
    """Return the one-hot encoding of ``df[column_name]``, prefixed with the column name."""
    return pd.get_dummies(df[column_name], prefix=column_name, drop_first=drop_first_col)

  def _band_and_encode(self, df, column_name, cut_points):
    """Band ``df[column_name]`` at ``cut_points`` and return the one-hot bands.

    The banded values are also stored on ``df`` as ``<column_name>_categorias``.
    """
    banded_name = column_name + "_categorias"
    df[banded_name] = pd.cut(df[column_name], cut_points, labels=self._BAND_LABELS)
    return self.create_dummies(df, banded_name, False)

  def process_media(self, df):
    """One-hot grade bands for the ``media`` column."""
    return self._band_and_encode(df, "media", self._GRADE_CUT_POINTS)

  def process_min(self, df):
    """One-hot grade bands for the ``min`` column."""
    return self._band_and_encode(df, "min", self._GRADE_CUT_POINTS)

  def process_max(self, df):
    """One-hot grade bands for the ``max`` column."""
    return self._band_and_encode(df, "max", self._GRADE_CUT_POINTS)

  def process_carga_total(self, df):
    """One-hot workload bands for the ``carga_total`` column."""
    return self._band_and_encode(df, "carga_total", self._LOAD_CUT_POINTS)

  def process_carga_total_atual(self, df):
    """One-hot workload bands for the ``carga_total_atual`` column."""
    return self._band_and_encode(df, "carga_total_atual", self._LOAD_CUT_POINTS)

  def tax_suc(self, df):
    """Success rate: quantAprovado / QuantDisciCursadas, as a one-column frame.

    Rows with zero courses taken would divide by zero (NaN or inf), which the
    downstream selectors/classifiers reject; those rows get a rate of 0.0.
    The result is also stored on ``df`` as ``tax_suc``.
    """
    courses_taken = df["QuantDisciCursadas"]
    rate = df["quantAprovado"] / courses_taken
    df["tax_suc"] = rate.mask(courses_taken == 0, 0.0)
    return df[["tax_suc"]]

  def addcol(self, df):
    """Return the raw count columns that are passed through unchanged."""
    return df[self._RAW_COLUMNS]

  def transform(self, X, y=None):
    """Build the feature matrix from ``X`` without modifying ``X``.

    ``X`` must contain media, min, max, carga_total, carga_total_atual and the
    five raw count columns; any other column is ignored.
    """
    # The helpers add working columns, so operate on a copy of the caller's frame.
    df = X.copy()

    parts = [
        self.process_media(df),
        self.process_min(df),
        self.process_max(df),
        self.process_carga_total(df),
        self.process_carga_total_atual(df),
        self.tax_suc(df),
        self.addcol(df),
    ]
    return pd.concat(parts, axis=1)

# Montar o gráfico

In [0]:
def process_media(df):
    """Band the ``media`` column into five ordered grade levels.

    Writes the result to ``df["media_categorias"]`` (the input frame is
    modified in place) and returns that column. Bands are right-closed:
    (-1, 3], (3, 5], (5, 7], (7, 8], (8, 11]; values outside become NaN.
    """
    grade_edges = [-1, 3, 5, 7, 8, 11]
    grade_levels = ["muito baixo", "baixo", "media", "alto", "muito alto"]
    banded = pd.cut(df["media"], bins=grade_edges, labels=grade_levels)
    df["media_categorias"] = banded
    return df["media_categorias"]
   
  # manipulate column "min"
def process_min(df):
  """Band the ``min`` column into five ordered grade levels.

  Writes the result to ``df["min_categorias"]`` (the input frame is modified
  in place) and returns that column. Bands are right-closed:
  (-1, 3], (3, 5], (5, 7], (7, 8], (8, 11]; values outside become NaN.
  """
  grade_edges = [-1, 3, 5, 7, 8, 11]
  grade_levels = ["muito baixo", "baixo", "media", "alto", "muito alto"]
  banded = pd.cut(df["min"], bins=grade_edges, labels=grade_levels)
  df["min_categorias"] = banded
  return df["min_categorias"]

# Band the "max" column
def process_max(df):
  """Band the ``max`` column into five ordered grade levels.

  Writes the result to ``df["max_categorias"]`` (the input frame is modified
  in place) and returns that column. Bands are right-closed:
  (-1, 3], (3, 5], (5, 7], (7, 8], (8, 11]; values outside become NaN.
  """
  grade_edges = [-1, 3, 5, 7, 8, 11]
  grade_levels = ["muito baixo", "baixo", "media", "alto", "muito alto"]
  banded = pd.cut(df["max"], bins=grade_edges, labels=grade_levels)
  df["max_categorias"] = banded
  return df["max_categorias"]

# Band the "carga_total" column
def process_carga_total(df):
  """Band the ``carga_total`` column into five ordered workload levels.

  Writes the result to ``df["carga_total_categorias"]`` (the input frame is
  modified in place) and returns that column. Bands are right-closed:
  (0, 100], (100, 200], (200, 300], (300, 400], (400, 600]; values outside
  (including exactly 0) become NaN.
  """
  load_edges = [0, 100, 200, 300, 400, 600]
  load_levels = ["muito baixo", "baixo", "media", "alto", "muito alto"]
  banded = pd.cut(df["carga_total"], bins=load_edges, labels=load_levels)
  df["carga_total_categorias"] = banded
  return df["carga_total_categorias"]

# Band the "carga_total_atual" column
def process_carga_total_atual(df):
  """Band the ``carga_total_atual`` column into five ordered workload levels.

  Writes the result to ``df["carga_total_atual_categorias"]`` (the input frame
  is modified in place) and returns that column. Bands are right-closed:
  (0, 100], (100, 200], (200, 300], (300, 400], (400, 600]; values outside
  (including exactly 0) become NaN.
  """
  load_edges = [0, 100, 200, 300, 400, 600]
  load_levels = ["muito baixo", "baixo", "media", "alto", "muito alto"]
  banded = pd.cut(df["carga_total_atual"], bins=load_edges, labels=load_levels)
  df["carga_total_atual_categorias"] = banded
  return df["carga_total_atual_categorias"]

def tax_suc(df):
  """Compute the success rate and band it into three levels.

  Stores quantAprovado / QuantDisciCursadas in ``df["tax_suc"]`` and its band
  in ``df["tax_suc_categorias"]`` (both written onto the input frame), then
  returns the banded column. Bands are right-closed:
  (-1, 0.6] -> "baixo", (0.6, 0.8] -> "media", (0.8, 2] -> "alto".
  """
  df["tax_suc"] = df["quantAprovado"].div(df["QuantDisciCursadas"])
  rate_edges = [-1, 0.6, 0.8, 2]
  rate_levels = ["baixo", "media", "alto"]
  banded = pd.cut(df["tax_suc"], bins=rate_edges, labels=rate_levels)
  df["tax_suc_categorias"] = banded
  return df["tax_suc_categorias"]

def QuantDisciAtual(df):
  """Band the ``QuantDisciAtual`` column into "baixo" / "alto".

  Writes the result to ``df["QuantDisciAtual_categorias"]`` (the input frame is
  modified in place) and returns that column. Bands are right-closed:
  (-1, 5] -> "baixo", (5, 11] -> "alto"; values outside become NaN.
  """
  count_edges = [-1, 5, 11]
  count_levels = ["baixo", "alto"]
  banded = pd.cut(df["QuantDisciAtual"], bins=count_edges, labels=count_levels)
  df["QuantDisciAtual_categorias"] = banded
  return df["QuantDisciAtual_categorias"]

def QuantDisciCursadas(df):
  """Band the ``QuantDisciCursadas`` column into "baixo" / "alto".

  Writes the result to ``df["DisciCursadas_categorias"]`` (the input frame is
  modified in place) and returns that column. Bands are right-closed:
  (-1, 5] -> "baixo", (5, 11] -> "alto"; values outside become NaN.
  """
  count_edges = [-1, 5, 11]
  count_levels = ["baixo", "alto"]
  banded = pd.cut(df["QuantDisciCursadas"], bins=count_edges, labels=count_levels)
  df["DisciCursadas_categorias"] = banded
  return df["DisciCursadas_categorias"]


def quantAprovado(df):
  """Band the ``quantAprovado`` column into "baixo" / "alto".

  Writes the result to ``df["quantAprovado_categorias"]`` (the input frame is
  modified in place) and returns that column. Bands are right-closed:
  (-1, 5] -> "baixo", (5, 11] -> "alto"; values outside become NaN.
  """
  count_edges = [-1, 5, 11]
  count_levels = ["baixo", "alto"]
  banded = pd.cut(df["quantAprovado"], bins=count_edges, labels=count_levels)
  df["quantAprovado_categorias"] = banded
  return df["quantAprovado_categorias"]

def quantReprovado(df):
  """Band the ``quantReprovado`` column into "baixo" / "alto".

  Writes the result to ``df["quantReprovado_categorias"]`` (the input frame is
  modified in place) and returns that column. Bands are right-closed:
  (-1, 5] -> "baixo", (5, 11] -> "alto"; values outside become NaN.
  """
  count_edges = [-1, 5, 11]
  count_levels = ["baixo", "alto"]
  banded = pd.cut(df["quantReprovado"], bins=count_edges, labels=count_levels)
  df["quantReprovado_categorias"] = banded
  return df["quantReprovado_categorias"]

def AnosMatriculado(df):
  """Band the ``AnosMatriculado`` column into "baixo" / "alto".

  Writes the result to ``df["AnosMatriculado_categorias"]`` (the input frame is
  modified in place) and returns that column. Bands are right-closed:
  (-1, 1.5] -> "baixo", (1.5, 11] -> "alto"; values outside become NaN.
  """
  year_edges = [-1, 1.5, 11]
  year_levels = ["baixo", "alto"]
  banded = pd.cut(df["AnosMatriculado"], bins=year_edges, labels=year_levels)
  df["AnosMatriculado_categorias"] = banded
  return df["AnosMatriculado_categorias"]

def situacao(df):
  """Encode the ``situacao`` labels as integer category codes.

  Codes follow the sorted order of the distinct labels. The codes are written
  to ``df["situacao_categoria"]`` (the input frame is modified in place) and
  that column is returned.
  """
  encoded = pd.Categorical(df["situacao"])
  df["situacao_categoria"] = encoded.codes
  return df["situacao_categoria"]

def transform(df_antigo):
  """Assemble the plot-ready frame of banded features from ``df_antigo``.

  Each helper is called in the listed order; every helper also writes its
  working column(s) onto ``df_antigo`` itself. The returned frame has one
  column per entry, in the same order.
  """
  column_builders = [
      ("media_categorias", process_media),
      ("min_categorias", process_min),
      ("max_categorias", process_max),
      ("carga_total_categorias", process_carga_total),
      ("carga_total_atual_categorias", process_carga_total_atual),
      ("tax_suc_categorias", tax_suc),
      ("DisciCursadas_categorias", QuantDisciCursadas),
      ("quantAprovado_categorias", quantAprovado),
      ("quantReprovado_categorias", quantReprovado),
      ("AnosMatriculado_categorias", AnosMatriculado),
      ("situacao_categoria", situacao),
      ("QuantDisciAtual_categorias", QuantDisciAtual),
  ]
  df = pd.DataFrame()
  for column_name, build_column in column_builders:
    df[column_name] = build_column(df_antigo)
  return df
  

In [0]:
# Build the banded, plot-ready frame. transform() also writes the *_categorias
# columns (and tax_suc, situacao_categoria) onto `table` itself as a side effect.
df  = transform(table)
df

Unnamed: 0,media_categorias,min_categorias,max_categorias,carga_total_categorias,carga_total_atual_categorias,tax_suc_categorias,DisciCursadas_categorias,quantAprovado_categorias,quantReprovado_categorias,AnosMatriculado_categorias,situacao_categoria,QuantDisciAtual_categorias
9,muito baixo,muito baixo,media,media,muito alto,baixo,baixo,baixo,baixo,alto,1,alto
472,baixo,muito baixo,baixo,alto,media,baixo,baixo,baixo,baixo,alto,1,baixo
586,baixo,muito baixo,alto,alto,alto,baixo,baixo,baixo,baixo,alto,0,alto
623,baixo,muito baixo,media,baixo,media,baixo,baixo,baixo,baixo,alto,1,baixo
661,media,media,media,muito baixo,media,alto,baixo,baixo,baixo,alto,1,baixo
...,...,...,...,...,...,...,...,...,...,...,...,...
52466,baixo,muito baixo,alto,alto,baixo,media,alto,baixo,baixo,baixo,0,baixo
52472,baixo,muito baixo,media,media,media,baixo,baixo,baixo,baixo,baixo,1,baixo
52474,baixo,muito baixo,media,alto,baixo,baixo,alto,baixo,baixo,baixo,1,baixo
52477,baixo,muito baixo,alto,media,media,baixo,baixo,baixo,baixo,baixo,0,baixo


In [0]:
# Parallel-categories diagram: banded past-semester features against the outcome.
# NOTE(review): `titanic_df` looks like a name left over from a template; it is
# simply a copy of the banded frame `df`.
titanic_df = df.copy()

# Create dimensions
media_cat_dim = go.parcats.Dimension(values=titanic_df.media_categorias, label="Media")

carga_dim = go.parcats.Dimension(values=titanic_df.carga_total_categorias	, label="Carga total")

disc_dim = go.parcats.Dimension(
    values=titanic_df.DisciCursadas_categorias, label="Quant Dis Cursadas")

taxa_dim = go.parcats.Dimension(
    values=titanic_df.tax_suc_categorias, label="taxa suc")

# Outcome axis: integer codes 0/1 are shown as 'aprovado'/'reprovado'.
situacao_dim = go.parcats.Dimension(
    values=titanic_df.situacao_categoria, label="Situacao", categoryarray=[0, 1], ticktext=['aprovado', 'reprovado'] )

# Create parcats trace
# Ribbons are coloured by the outcome code: 0 -> lightsteelblue, 1 -> mediumseagreen.
color = titanic_df.situacao_categoria
colorscale = [[0, 'lightsteelblue'], [1, 'mediumseagreen']]

fig = go.Figure(data = [go.Parcats(dimensions=[media_cat_dim, carga_dim, disc_dim,taxa_dim, situacao_dim],
        line={'color': color, 'colorscale': colorscale},
        hoveron='color', hoverinfo='all',
        labelfont={'size': 18, 'family': 'Times'},
        tickfont={'size': 16, 'family': 'Times'},bundlecolors=True, 
        arrangement='freeform')])
fig.update_layout(width=800,height=500)

fig.show()

In [0]:
# Parallel-categories diagram: banded current-semester features against the outcome.
# Rebinds titanic_df, carga_dim, disc_dim, situacao_dim, color, colorscale and fig.
titanic_df = df.copy()

# Create dimensions
AnosMatriculado_categorias_cat_dim = go.parcats.Dimension(values=titanic_df.AnosMatriculado_categorias, label="Anos Matriculados")

carga_dim = go.parcats.Dimension(values=titanic_df.carga_total_atual_categorias	, label="Carga total Atual")

disc_dim = go.parcats.Dimension(
    values=titanic_df.QuantDisciAtual_categorias, label="Quant Dis atual")

# Outcome axis: integer codes 0/1 are shown as 'aprovado'/'reprovado'.
situacao_dim = go.parcats.Dimension(
    values=titanic_df.situacao_categoria, label="Situacao", categoryarray=[0, 1], ticktext=['aprovado', 'reprovado'] )

# Create parcats trace
# Ribbons are coloured by the outcome code: 0 -> lightsteelblue, 1 -> mediumseagreen.
color = titanic_df.situacao_categoria
colorscale = [[0, 'lightsteelblue'], [1, 'mediumseagreen']]

fig = go.Figure(data = [go.Parcats(dimensions=[AnosMatriculado_categorias_cat_dim, carga_dim, disc_dim, situacao_dim  ],
        line={'color': color, 'colorscale': colorscale},
        hoveron='color', hoverinfo='all',
        labelfont={'size': 18, 'family': 'Times'},
        tickfont={'size': 16, 'family': 'Times'},bundlecolors=True, 
        arrangement='freeform')])
fig.update_layout(width=800,height=500)

fig.show()

# Rodar o pipeline

In [0]:
# For validation purposes: run the transformer on its own, outside the pipeline.
# This rebinds `df` from the banded plotting frame to the one-hot feature matrix.
model = NumericalTransformer()
df = model.transform(table.drop(labels=["situacao"],axis=1))

In [0]:
# Preview the first rows of the one-hot feature matrix.
df.head()

Unnamed: 0,media_categorias_muito baixo,media_categorias_baixo,media_categorias_media,media_categorias_alto,media_categorias_muito alto,min_categorias_muito baixo,min_categorias_baixo,min_categorias_media,min_categorias_alto,min_categorias_muito alto,max_categorias_muito baixo,max_categorias_baixo,max_categorias_media,max_categorias_alto,max_categorias_muito alto,carga_total_categorias_muito baixo,carga_total_categorias_baixo,carga_total_categorias_media,carga_total_categorias_alto,carga_total_categorias_muito alto,carga_total_atual_categorias_muito baixo,carga_total_atual_categorias_baixo,carga_total_atual_categorias_media,carga_total_atual_categorias_alto,carga_total_atual_categorias_muito alto,tax_suc,QuantDisciCursadas,quantAprovado,quantReprovado,QuantDisciAtual,AnosMatriculado
9,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0.2,5,1,4,9,6
472,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0.4,5,2,3,4,3
586,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0.4,5,2,3,6,3
623,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0.5,2,1,1,4,3
661,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1.0,1,1,0,4,3


### 2.4.4 Modeling (train and test)

In [0]:
# Global variables shared by the split and the grid search
seed = 42        # random_state passed to the split and to the seeded estimators
num_folds = 10   # number of cross-validation folds
# Named scorer; the key "Accuracy" is what GridSearchCV's refit= refers to.
scoring = {'Accuracy': make_scorer(accuracy_score)}

In [0]:
# Encode the outcome label as integer category codes (sorted label order)
# and store it on `table` as the modelling target.
col = pd.Categorical(table.situacao)
table["situacao_categoria"] = col.codes

In [0]:
# Split into train/validation (80%) and test (20%), stratified on the target.
# NOTE(review): only `situacao_categoria` is dropped from X, so X still carries
# the raw `situacao` label column. NumericalTransformer.transform reads only the
# columns it names, so the label does not reach the model, but any other
# transformer fed with this X would see it.
X_train, X_test, y_train, y_test = train_test_split(table.drop(labels="situacao_categoria",axis=1),
                                                    table["situacao_categoria"],
                                                    test_size=0.20,
                                                    random_state=seed,
                                                    shuffle=True,
                                                    stratify=table["situacao_categoria"])

### 2.4.5 Algorithm Tuning

In [0]:
# Feature building -> univariate feature selection -> classifier.
# The "clf" step (and its hyperparameters) is swapped by the search space below.
pipe = Pipeline(steps = [('num_pipeline', NumericalTransformer()),
                         ("fs",SelectKBest()),
                         ("clf",XGBClassifier())
                        ]
                )

# One dictionary per candidate classifier; each is expanded into its own grid.
# Entries without "fs__*" keys run with SelectKBest's default settings.
search_space = [
                {"clf":[RandomForestClassifier()],
                 "clf__n_estimators": [100],
                 "clf__criterion": ["entropy"],
                 "clf__max_leaf_nodes": [64],
                 "clf__random_state": [seed],
                 "fs__score_func":[chi2],
                 "fs__k":[4,9,15,30]
                 },
                {"clf":[KNeighborsClassifier()],
                 "clf__n_neighbors":[5,9,11],
                 "fs__score_func":[chi2],
                 "fs__k":[4,9,15,30]                 
                },
                # NOTE(review): SVC documents `degree` as used only by the 'poly'
                # kernel, so with sigmoid/rbf it doubles the grid without changing
                # any fit. Kept so the cv_results_ row numbering stays unchanged.
                {"clf":[SVC()],
                 "clf__kernel":["sigmoid",'rbf'],
                 "clf__degree":[3,4],
                 "clf__gamma":[0.1,5],
                 "clf__C":[0.001,1000],
                 "fs__score_func":[chi2],
                 "fs__k":[4,9,15,25,31] 
                 },
                {"clf":[GaussianNB()]
                 },
               {"clf":[MLPClassifier()],
                 "clf__hidden_layer_sizes": [(64,),(128,)],
                 "clf__activation": ["logistic"],
                 "clf__solver": ["sgd"],
                 "clf__max_iter": [500],
                 "clf__early_stopping":[True],
                 "clf__n_iter_no_change":[20],
                 "clf__validation_fraction":[0.20], 
                 },
                {"clf":[XGBClassifier()],
                "clf__n_estimators": [50,100],
                "clf__max_depth": [4,6],
                "clf__learning_rate": [0.001, 0.01,0.1],
                "clf__random_state": [seed],
                "clf__subsample": [1.0],
                "clf__colsample_bytree": [1.0],
                "fs__score_func":[chi2],
                "fs__k":[5,8,15,25,31]
                }
              ]

# Cross-validation splitter.
# random_state only has an effect together with shuffle=True, and recent
# scikit-learn raises ValueError when it is given without shuffling. The folds
# were never shuffled, so dropping the argument keeps exactly the same splits
# (the training rows were already shuffled by train_test_split).
kfold = StratifiedKFold(n_splits=num_folds)

# return_train_score=True
# official documentation: "computing the scores on the training set can be
# computationally expensive and is not strictly required to
# select the parameters that yield the best generalization performance".
grid = GridSearchCV(estimator=pipe, 
                    param_grid=search_space,
                    cv=kfold,
                    scoring=scoring,
                    return_train_score=True,
                    n_jobs=-1,
                    refit="Accuracy")

# time.time() measures elapsed wall-clock time around the fit.
tmp = time.time()

# fit grid search (fit returns the GridSearchCV object itself)
best_model = grid.fit(X_train,y_train)

print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))

CPU Training Time: 604.1469466686249 seconds


In [0]:
# Best mean cross-validated accuracy and the parameter combination that reached it.
print("Best: %f using %s" % (best_model.best_score_,best_model.best_params_))


Best: 0.723241 using {'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=64,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False), 'clf__criterion': 'entropy', 'clf__max_leaf_nodes': 64, 'clf__n_estimators': 100, 'clf__random_state': 42, 'fs__k': 15, 'fs__score_func': <function chi2 at 0x7ff022f33598>}


In [0]:
# One row per evaluated parameter combination, with per-fold and aggregate scores.
result = pd.DataFrame(best_model.cv_results_)
result.head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__criterion,param_clf__max_leaf_nodes,param_clf__n_estimators,param_clf__random_state,param_fs__k,param_fs__score_func,param_clf__n_neighbors,param_clf__C,param_clf__degree,param_clf__gamma,param_clf__kernel,param_clf__activation,param_clf__early_stopping,param_clf__hidden_layer_sizes,param_clf__max_iter,param_clf__n_iter_no_change,param_clf__solver,param_clf__validation_fraction,param_clf__colsample_bytree,param_clf__learning_rate,param_clf__max_depth,param_clf__subsample,params,split0_test_Accuracy,split1_test_Accuracy,split2_test_Accuracy,split3_test_Accuracy,split4_test_Accuracy,split5_test_Accuracy,split6_test_Accuracy,split7_test_Accuracy,split8_test_Accuracy,split9_test_Accuracy,mean_test_Accuracy,std_test_Accuracy,rank_test_Accuracy,split0_train_Accuracy,split1_train_Accuracy,split2_train_Accuracy,split3_train_Accuracy,split4_train_Accuracy,split5_train_Accuracy,split6_train_Accuracy,split7_train_Accuracy,split8_train_Accuracy,split9_train_Accuracy,mean_train_Accuracy,std_train_Accuracy
0,0.346735,0.00635,0.047786,0.002419,"(DecisionTreeClassifier(class_weight=None, cri...",entropy,64.0,100.0,42.0,4,<function chi2 at 0x7ff022f33598>,,,,,,,,,,,,,,,,,{'clf': (DecisionTreeClassifier(class_weight=N...,0.731915,0.676596,0.719149,0.778723,0.740426,0.662393,0.683761,0.683761,0.722222,0.696581,0.709595,0.033626,37,0.729858,0.734123,0.730806,0.725592,0.726066,0.737091,0.734723,0.735197,0.731407,0.733302,0.731817,0.003634
1,0.35082,0.019703,0.050196,0.009051,"(DecisionTreeClassifier(class_weight=None, cri...",entropy,64.0,100.0,42.0,9,<function chi2 at 0x7ff022f33598>,,,,,,,,,,,,,,,,,{'clf': (DecisionTreeClassifier(class_weight=N...,0.72766,0.702128,0.72766,0.774468,0.72766,0.666667,0.696581,0.688034,0.739316,0.735043,0.71855,0.029072,13,0.741706,0.746445,0.742654,0.739336,0.74218,0.746566,0.747039,0.747987,0.744197,0.742776,0.744089,0.00267
2,0.359322,0.021936,0.047438,0.001064,"(DecisionTreeClassifier(class_weight=None, cri...",entropy,64.0,100.0,42.0,15,<function chi2 at 0x7ff022f33598>,,,,,,,,,,,,,,,,,{'clf': (DecisionTreeClassifier(class_weight=N...,0.744681,0.706383,0.740426,0.774468,0.740426,0.683761,0.700855,0.683761,0.735043,0.722222,0.723241,0.027781,1,0.766825,0.770616,0.766351,0.759242,0.761137,0.765988,0.765514,0.769304,0.768356,0.770251,0.766358,0.003534
3,0.377959,0.017268,0.046146,0.000914,"(DecisionTreeClassifier(class_weight=None, cri...",entropy,64.0,100.0,42.0,30,<function chi2 at 0x7ff022f33598>,,,,,,,,,,,,,,,,,{'clf': (DecisionTreeClassifier(class_weight=N...,0.723404,0.706383,0.73617,0.774468,0.73617,0.679487,0.700855,0.688034,0.722222,0.722222,0.718977,0.025917,10,0.769194,0.775829,0.77346,0.769668,0.772038,0.775462,0.770725,0.767409,0.77262,0.772146,0.771855,0.002549
4,0.059498,0.01062,0.042946,0.001428,"KNeighborsClassifier(algorithm='auto', leaf_si...",,,,,4,<function chi2 at 0x7ff022f33598>,5.0,,,,,,,,,,,,,,,,"{'clf': KNeighborsClassifier(algorithm='auto',...",0.680851,0.693617,0.67234,0.642553,0.710638,0.611111,0.675214,0.623932,0.67094,0.653846,0.663539,0.029226,85,0.682464,0.695261,0.681043,0.648341,0.684834,0.691615,0.717669,0.672667,0.703932,0.682615,0.686044,0.017572
5,0.062958,0.01399,0.048875,0.007936,"KNeighborsClassifier(algorithm='auto', leaf_si...",,,,,9,<function chi2 at 0x7ff022f33598>,5.0,,,,,,,,,,,,,,,,"{'clf': KNeighborsClassifier(algorithm='auto',...",0.693617,0.638298,0.697872,0.66383,0.706383,0.636752,0.67094,0.645299,0.683761,0.653846,0.669083,0.024214,84,0.705687,0.713744,0.712796,0.720853,0.705687,0.734249,0.720038,0.708669,0.723828,0.699668,0.714522,0.009802
6,0.057489,0.009205,0.048589,0.002517,"KNeighborsClassifier(algorithm='auto', leaf_si...",,,,,15,<function chi2 at 0x7ff022f33598>,5.0,,,,,,,,,,,,,,,,"{'clf': KNeighborsClassifier(algorithm='auto',...",0.67234,0.680851,0.702128,0.702128,0.655319,0.688034,0.692308,0.675214,0.662393,0.696581,0.682729,0.015489,82,0.775829,0.777725,0.775829,0.770142,0.763507,0.77783,0.773093,0.766935,0.770725,0.769777,0.772139,0.004527
7,0.061345,0.006479,0.056168,0.001224,"KNeighborsClassifier(algorithm='auto', leaf_si...",,,,,30,<function chi2 at 0x7ff022f33598>,5.0,,,,,,,,,,,,,,,,"{'clf': KNeighborsClassifier(algorithm='auto',...",0.67234,0.697872,0.689362,0.73617,0.685106,0.662393,0.705128,0.67094,0.662393,0.666667,0.684861,0.022196,80,0.785782,0.781043,0.779147,0.77346,0.774408,0.77783,0.77262,0.782094,0.773567,0.777357,0.777731,0.00412
8,0.056355,0.014166,0.044814,0.004063,"KNeighborsClassifier(algorithm='auto', leaf_si...",,,,,4,<function chi2 at 0x7ff022f33598>,9.0,,,,,,,,,,,,,,,,"{'clf': KNeighborsClassifier(algorithm='auto',...",0.702128,0.702128,0.67234,0.67234,0.723404,0.636752,0.683761,0.641026,0.675214,0.649573,0.675906,0.026762,83,0.697156,0.709953,0.683412,0.664929,0.704739,0.699195,0.720038,0.691615,0.712459,0.683089,0.696658,0.01561
9,0.061251,0.016303,0.051123,0.010418,"KNeighborsClassifier(algorithm='auto', leaf_si...",,,,,9,<function chi2 at 0x7ff022f33598>,9.0,,,,,,,,,,,,,,,,"{'clf': KNeighborsClassifier(algorithm='auto',...",0.706383,0.676596,0.706383,0.706383,0.714894,0.645299,0.662393,0.679487,0.709402,0.683761,0.689126,0.02209,77,0.709479,0.733649,0.721801,0.727014,0.712796,0.729038,0.725722,0.73567,0.72288,0.732354,0.72504,0.008174


In [0]:
# Accuracy summary per candidate, ordered by cross-validation rank.
result_acc = result[['mean_train_Accuracy', 'std_train_Accuracy','mean_test_Accuracy', 'std_test_Accuracy','rank_test_Accuracy']].copy()
# Ratio of the fold-to-fold spread on validation folds to the spread on training folds.
result_acc["std_ratio"] = result_acc.std_test_Accuracy/result_acc.std_train_Accuracy
result_acc.sort_values(by="rank_test_Accuracy",ascending=True)

Unnamed: 0,mean_train_Accuracy,std_train_Accuracy,mean_test_Accuracy,std_test_Accuracy,rank_test_Accuracy,std_ratio
2,0.766358,0.003534,0.723241,0.027781,1,7.860223
125,0.726936,0.003948,0.723241,0.030995,1,7.850712
105,0.724946,0.003794,0.722388,0.030750,3,8.105034
100,0.724946,0.003794,0.722388,0.030750,3,8.105034
124,0.725278,0.003331,0.721962,0.034189,5,10.264240
...,...,...,...,...,...,...
79,0.542810,0.011821,0.545416,0.028382,154,2.401089
60,0.543142,0.011439,0.544563,0.027839,156,2.433633
80,0.543142,0.011439,0.544563,0.027839,156,2.433633
58,0.510590,0.006094,0.508742,0.019698,158,3.232182


In [0]:
# Rebinds result_acc to the four mean/std accuracy columns only, in the original
# candidate order (the rank and std_ratio columns from the previous cell are gone).
result_acc = result[['mean_train_Accuracy', 'std_train_Accuracy','mean_test_Accuracy', 'std_test_Accuracy']]
result_acc

Unnamed: 0,mean_train_Accuracy,std_train_Accuracy,mean_test_Accuracy,std_test_Accuracy
0,0.731817,0.003634,0.709595,0.033626
1,0.744089,0.002670,0.718550,0.029072
2,0.766358,0.003534,0.723241,0.027781
3,0.771855,0.002549,0.718977,0.025917
4,0.686044,0.017572,0.663539,0.029226
...,...,...,...,...
154,0.733049,0.002711,0.703198,0.023277
155,0.740346,0.002205,0.707889,0.027774
156,0.785548,0.006091,0.705757,0.024796
157,0.801469,0.004976,0.695096,0.030773


In [0]:
# Final model: score the refit best pipeline on the held-out test split.
predict = best_model.predict(X_test)
print(accuracy_score(y_test, predict))
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test,predict))
print(classification_report(y_test,predict))

0.7137989778534923
[[290  75]
 [ 93 129]]
              precision    recall  f1-score   support

           0       0.76      0.79      0.78       365
           1       0.63      0.58      0.61       222

    accuracy                           0.71       587
   macro avg       0.69      0.69      0.69       587
weighted avg       0.71      0.71      0.71       587



In [0]:
# Encode the full table (train and test rows together) into the one-hot feature
# matrix; rebinds `model` and `df`. transform() reads only the columns it names,
# so the label columns present in `table` are not carried into `df`.
model = NumericalTransformer()
df = model.transform(table)
df

Unnamed: 0,media_categorias_muito baixo,media_categorias_baixo,media_categorias_media,media_categorias_alto,media_categorias_muito alto,min_categorias_muito baixo,min_categorias_baixo,min_categorias_media,min_categorias_alto,min_categorias_muito alto,max_categorias_muito baixo,max_categorias_baixo,max_categorias_media,max_categorias_alto,max_categorias_muito alto,carga_total_categorias_muito baixo,carga_total_categorias_baixo,carga_total_categorias_media,carga_total_categorias_alto,carga_total_categorias_muito alto,carga_total_atual_categorias_muito baixo,carga_total_atual_categorias_baixo,carga_total_atual_categorias_media,carga_total_atual_categorias_alto,carga_total_atual_categorias_muito alto,tax_suc,QuantDisciCursadas,quantAprovado,quantReprovado,QuantDisciAtual,AnosMatriculado
9,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0.200000,5,1,4,9,6
472,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0.400000,5,2,3,4,3
586,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0.400000,5,2,3,6,3
623,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0.500000,2,1,1,4,3
661,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1.000000,1,1,0,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52466,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0.666667,6,4,2,4,1
52472,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0.400000,5,2,3,4,1
52474,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0.500000,6,3,3,3,1
52477,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0.600000,5,3,2,5,1


In [0]:
# Mean of situacao_categoria per previous-load band. pivot_table averages by
# default, so with the 0/1 outcome codes each bar is the share of rows coded 1.
# `carga_total_categorias` exists on `table` only because transform(table)
# wrote it there earlier in the notebook.
sns.set()
carga_pivot = table.pivot_table(index="carga_total_categorias",values="situacao_categoria")
carga_pivot.plot.bar()
plt.show()
# students who passed LOP had a very low or low course load in the previous semester

In [0]:
# Same view for the current-semester load band: mean of situacao_categoria per
# band (pivot_table averages by default). Rebinds carga_pivot.
sns.set()
carga_pivot = table.pivot_table(index="carga_total_atual_categorias",values="situacao_categoria")
carga_pivot.plot.bar()
plt.show()

In [0]:
# Row counts per grade band, split by previous-load band (hue),
# with one panel per raw `situacao` label.
g = sns.catplot(x="media_categorias", 
                hue="carga_total_categorias", 
                col="situacao",
                data=table, kind="count",
                height=8, aspect=.7)

In [0]:
# Univariate feature selection: keep the 4 features with the highest chi-squared
# score against the outcome. SelectKBest and chi2 come from the notebook's
# import cell, so they are not re-imported here.
fsel_model = SelectKBest(score_func=chi2, k=4)
new_income = fsel_model.fit_transform(df,table.situacao_categoria)

# array holding only the k=4 selected columns
new_income

array([[1., 0., 1., 4.],
       [0., 0., 2., 3.],
       [0., 0., 2., 3.],
       ...,
       [0., 0., 3., 3.],
       [0., 0., 3., 2.],
       [0., 0., 2., 1.]])

In [0]:
# Display the full one-hot feature matrix that was passed to the selector.
df

Unnamed: 0,media_categorias_muito baixo,media_categorias_baixo,media_categorias_media,media_categorias_alto,media_categorias_muito alto,min_categorias_muito baixo,min_categorias_baixo,min_categorias_media,min_categorias_alto,min_categorias_muito alto,max_categorias_muito baixo,max_categorias_baixo,max_categorias_media,max_categorias_alto,max_categorias_muito alto,carga_total_categorias_muito baixo,carga_total_categorias_baixo,carga_total_categorias_media,carga_total_categorias_alto,carga_total_categorias_muito alto,carga_total_atual_categorias_muito baixo,carga_total_atual_categorias_baixo,carga_total_atual_categorias_media,carga_total_atual_categorias_alto,carga_total_atual_categorias_muito alto,tax_suc,QuantDisciCursadas,quantAprovado,quantReprovado,QuantDisciAtual,AnosMatriculado
9,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0.200000,5,1,4,9,6
472,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0.400000,5,2,3,4,3
586,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0.400000,5,2,3,6,3
623,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0.500000,2,1,1,4,3
661,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1.000000,1,1,0,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52466,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0.666667,6,4,2,4,1
52472,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0.400000,5,2,3,4,1
52474,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0.500000,6,3,3,3,1
52477,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0.600000,5,3,2,5,1


In [0]:
# Names of the columns kept by the univariate selection:
# get_support() is a boolean mask over df's columns.
df.loc[:,fsel_model.get_support()].columns

Index(['media_categorias_muito baixo', 'max_categorias_muito alto',
       'quantAprovado', 'quantReprovado'],
      dtype='object')