In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math


In [3]:
# Load the students dataset; the CSV uses ';' as the field separator.
df = pd.read_csv('Data laboratorio 1/students.csv', sep=';')
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [4]:
# Distinct class labels present in the target column.
df['Target'].unique()

array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [5]:
# Merge 'Graduate' into 'Enrolled' so that the target has two classes.
df['Target'] = df['Target'].replace({'Graduate': 'Enrolled'})
df['Target'].unique()

array(['Dropout', 'Enrolled'], dtype=object)

In [6]:
# Column dtypes, used to locate the continuous (float) features.
df.dtypes

Marital status                                      int64
Application mode                                    int64
Application order                                   int64
Course                                              int64
Daytime/evening attendance\t                        int64
Previous qualification                              int64
Previous qualification (grade)                    float64
Nacionality                                         int64
Mother's qualification                              int64
Father's qualification                              int64
Mother's occupation                                 int64
Father's occupation                                 int64
Admission grade                                   float64
Displaced                                           int64
Educational special needs                           int64
Debtor                                              int64
Tuition fees up to date                             int64
Gender        

In [7]:
# Names of the float-typed columns (the ones discretized later on).
print(df.select_dtypes(include=['float']).columns)

Index(['Previous qualification (grade)', 'Admission grade',
       'Curricular units 1st sem (grade)', 'Curricular units 2nd sem (grade)',
       'Unemployment rate', 'Inflation rate', 'GDP'],
      dtype='object')


#### Preprocessing
* Unificar los valores graduate enrolled de Target
* Pasar los datos continuos a low mid y high (según aplique)
    * Previous Qualification
    * Admission Grade
    * Curricular units 1st sem (grade) - (la página del dataset no la marca como continua, pero su tipo es float)
    * Curricular units 2nd sem (grade) - (ídem)
    * Unemployment rate
    * Inflation rate
    * GDP


In [8]:
from sklearn.preprocessing import KBinsDiscretizer

In [9]:
# Hold out 20% of the rows as the test set, stratified by class.
X = df.drop(columns="Target")
y = df['Target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Re-attach the labels so that each split is a single frame (features + Target).
train = X_train.join(y_train)
test = X_test.join(y_test)
# Split the train frame again into training and validation (80/20, stratified).
# Note that `y` is rebound here to the labels of `train` only.
y = train["Target"]
training, validation = train_test_split(
    train, test_size=0.2, stratify=y, random_state=42
)

In [10]:
# Discretize the continuous (float) columns into 3 equal-width ordinal bins.
col_cont = training.select_dtypes(include=['float']).columns

# Bin edges are learned from the training split only.
discretizer = KBinsDiscretizer(
    n_bins=3, encode='ordinal', strategy='uniform', subsample=None
).fit(training[col_cont])

# Encode both splits with the edges learned above; bin indices are stored as ints.
discretized_data_training = discretizer.transform(training[col_cont]).astype(int)
discretized_data_validation = discretizer.transform(validation[col_cont]).astype(int)

# Overwrite the original columns with their bin indices.
# NOTE(review): after this assignment the columns presumably stop being float,
# so re-running this cell would select no columns in `col_cont` - confirm.
training[col_cont] = discretized_data_training
validation[col_cont] = discretized_data_validation


In [11]:
# Same discretization for the final model: edges learned on `train`, reused on `test`.
discretizer_fin = KBinsDiscretizer(
    n_bins=3, encode='ordinal', strategy='uniform', subsample=None
).fit(train[col_cont])

discretized_data_train = discretizer_fin.transform(train[col_cont]).astype(int)
discretized_data_test = discretizer_fin.transform(test[col_cont]).astype(int)

# Overwrite the continuous columns with their bin indices.
train[col_cont] = discretized_data_train
test[col_cont] = discretized_data_test

In [12]:
# Class distribution in the train frame (training + validation rows).
train["Target"].value_counts()

Enrolled    2402
Dropout     1137
Name: Target, dtype: int64

#### ID3

In [13]:
def entropia(df, objetivo, resultados):
    """Entropy (natural logarithm) of column `objetivo` in `df`.

    Args:
        df: DataFrame holding the rows to measure.
        objetivo: name of the label column.
        resultados: iterable with the labels to account for.

    Returns:
        Sum over `resultados` of -p * ln(p), where p is the fraction of rows
        of `df` carrying that label. Labels with no rows contribute 0.
    """
    total = len(df)

    def termino(etiqueta):
        # Rows of df whose label equals `etiqueta`.
        cuenta = len(df[df[objetivo] == etiqueta])
        if cuenta == 0:
            # Absent label: skip the log (and the division when df is empty).
            return 0
        return -cuenta / total * math.log(cuenta / total)

    return sum(termino(etiqueta) for etiqueta in resultados)

In [14]:
def entropiasbcjto(df, atributo, objetivo):
    """Weighted entropy of `objetivo` after partitioning `df` by `atributo`.

    Args:
        df: DataFrame with the rows to partition.
        atributo: column whose distinct values define the partition.
        objetivo: name of the label column.

    Returns:
        Sum, over each distinct value of `atributo`, of the fraction of rows
        with that value times the entropy of `objetivo` within those rows.
    """
    etiquetas = df[objetivo].unique()  # labels present in the whole frame
    total = len(df)
    ponderadas = []
    for valor in df[atributo].unique():
        subconjunto = df[df[atributo] == valor]  # rows with atributo == valor
        peso = len(subconjunto) / total
        ponderadas.append(peso * entropia(subconjunto, objetivo, etiquetas))
    return sum(ponderadas)

In [15]:
# Information Gain
def info_gain(df, atributo, objetivo):
    """Information gain obtained by splitting `df` on `atributo`.

    Returns the entropy of `objetivo` over the whole frame minus the weighted
    entropy of `objetivo` after partitioning by `atributo`.
    """
    entropia_total = entropia(df, objetivo, df[objetivo].unique())
    entropia_condicional = entropiasbcjto(df, atributo, objetivo)
    return entropia_total - entropia_condicional

In [16]:
def max_info(df, objetivo):
    """Return the column of `df` (other than `objetivo`) with the highest information gain.

    Ties are resolved in favour of the column that appears first. Returns
    None when `df` has no column besides `objetivo`.
    """
    mejor_atributo = None
    mejor_ganancia = -1  # below any gain the loop is expected to produce

    for candidato in df.drop(columns=objetivo).columns:
        ganancia = info_gain(df, candidato, objetivo)
        # Strict comparison keeps the earliest column on ties.
        if ganancia > mejor_ganancia:
            mejor_atributo, mejor_ganancia = candidato, ganancia

    return mejor_atributo



In [17]:
def _etiqueta_mayoritaria(columna, etiquetas):
    """Return the label of `etiquetas` with the most rows in `columna` (ties go to the later label)."""
    conteos = [(columna == etiqueta).sum() for etiqueta in etiquetas]
    # Sorting key (count, position): on equal counts the later label wins, which
    # reproduces the two-class rule "first label only if strictly more frequent".
    mejor = max(range(len(etiquetas)), key=lambda i: (conteos[i], i))
    return etiquetas[mejor]


def generar_subarbol(atributo, data, objetivo, min_gain_split, min_sample_split):
    """Build one level of the tree: one branch per distinct value of `atributo`.

    For every value, the rows with that value are inspected: if the best
    information gain achievable on them exceeds `min_gain_split` AND there are
    more than `min_sample_split` rows, the branch is marked with "?" so the
    caller expands it later. Otherwise the branch becomes a leaf holding the
    majority label of those rows, and the rows are removed from `data`.

    The majority label is chosen among all labels present in `data`, so nodes
    with a single class or with more than two classes are supported.

    Args:
        atributo: column used to split at this level.
        data: DataFrame with the rows reaching this node.
        objetivo: name of the label column.
        min_gain_split: gain threshold (strict) required to keep splitting.
        min_sample_split: row-count threshold (strict) required to keep splitting.

    Returns:
        (tree, data): `tree` maps each value of `atributo` to a label or "?";
        `data` is the input without the rows that ended up in leaves.
    """
    resultados = data[objetivo].unique()  # labels present at this node
    tree = {}

    # The list of values is taken once, before `data` starts shrinking.
    for valor in data[atributo].unique():
        datos_valor = data[data[atributo] == valor]  # rows of this branch
        sig_nodo = max_info(datos_valor, objetivo)
        supera_info = info_gain(datos_valor, sig_nodo, objetivo) > min_gain_split
        supera_cant = datos_valor.shape[0] > min_sample_split

        if supera_info and supera_cant:
            tree[valor] = "?"  # placeholder: branch to be expanded by the caller
        else:
            tree[valor] = _etiqueta_mayoritaria(datos_valor[objetivo], resultados)
            data = data[data[atributo] != valor]  # leaf rows are no longer needed

    return tree, data

In [18]:
def generar_arbol(raiz, val_prev, data, objetivo, min_gain_split, min_sample_split):
    """Grow the decision tree in place inside the dictionary `raiz`.

    Picks the attribute with the highest information gain on `data`, builds
    its level with `generar_subarbol`, hangs it from `raiz`, and recurses into
    every branch that was left marked with "?".

    Args:
        raiz: dictionary that receives the new node (mutated in place).
        val_prev: key of `raiz` under which the node is stored, or None when
            this is the first node of the tree.
        data: DataFrame with the rows reaching this node.
        objetivo: name of the label column.
        min_gain_split: gain threshold forwarded to `generar_subarbol`.
        min_sample_split: row-count threshold forwarded to `generar_subarbol`.

    Returns:
        None. When `data` has no rows, `raiz` is left untouched.
    """
    if data.shape[0] == 0:
        return

    sig_nodo = max_info(data, objetivo)
    arbol, data = generar_subarbol(sig_nodo, data, objetivo, min_gain_split, min_sample_split)

    if val_prev is None:
        # Start of the tree: the attribute hangs directly from raiz.
        raiz[sig_nodo] = arbol
    else:
        # Replace the placeholder stored under val_prev with the new node.
        raiz[val_prev] = {sig_nodo: arbol}

    # Iterate over a snapshot because the recursion rewrites entries of `arbol`.
    for nodo, rama in list(arbol.items()):
        if rama == "?":
            generar_arbol(arbol, nodo, data[data[sig_nodo] == nodo], objetivo, min_gain_split, min_sample_split)
                


In [19]:
def id3(data, objetivo, min_split_gain, min_sample_split):
    """Build a decision tree for `objetivo` from `data` and return it as nested dicts.

    Args:
        data: DataFrame with the attributes and the label column.
        objetivo: name of the label column.
        min_split_gain: gain threshold passed to the tree builder.
        min_sample_split: row-count threshold passed to the tree builder.

    Returns:
        The dictionary filled in by `generar_arbol`.
    """
    raiz = {}
    # val_prev=None marks this call as the start of the tree.
    generar_arbol(raiz, None, data, objetivo, min_split_gain, min_sample_split)
    return raiz

In [20]:
def predict(tree, instance):
    """Walk `tree` following the attribute values of `instance`.

    Args:
        tree: nested dict of the form {attribute: {value: subtree_or_label}},
            or directly a label.
        instance: mapping from attribute name to value (e.g. a row).

    Returns:
        The label reached, or None when the instance holds a value for which
        the current node has no branch.
    """
    nodo = tree
    while isinstance(nodo, dict):
        atributo = next(iter(nodo))  # the first key is the attribute tested here
        valor = instance[atributo]
        ramas = nodo[atributo]
        if valor not in ramas:
            return None  # no branch for this value
        nodo = ramas[valor]
    return nodo  # anything that is not a dict is a leaf label

In [21]:
def evaluate(tree, test_data_m, label):
    """Accuracy of `tree` on the rows of `test_data_m`.

    Each row is classified with `predict` and compared against its value in
    column `label`. A prediction of None (no matching branch) counts as wrong.
    Rows are taken straight from `iterrows`, so frames with repeated index
    labels are handled row by row.

    Args:
        tree: decision tree as nested dicts.
        test_data_m: DataFrame with the attributes and the label column.
        label: name of the label column.

    Returns:
        Fraction of correctly classified rows, or NaN when the frame is empty.
    """
    total = len(test_data_m)
    if total == 0:
        return float("nan")  # accuracy is undefined without rows

    aciertos = sum(
        1 for _, fila in test_data_m.iterrows()
        if predict(tree, fila) == fila[label]
    )
    return aciertos / total

In [22]:
# Gain threshold 0.35, sample threshold 10; scored on the validation split.
arbol_03510 = id3(training, "Target", min_split_gain=0.35, min_sample_split=10)
print(evaluate(arbol_03510, validation, "Target"))
arbol_03510

# Tree size (noted as "depth" by the author): 48 printed lines

0.827683615819209


{'Curricular units 2nd sem (approved)': {3: 'Enrolled',
  5: 'Enrolled',
  6: 'Enrolled',
  7: 'Enrolled',
  8: 'Enrolled',
  2: 'Dropout',
  9: 'Enrolled',
  0: 'Dropout',
  4: 'Enrolled',
  10: 'Enrolled',
  14: 'Enrolled',
  12: {'Curricular units 1st sem (credited)': {9: 'Enrolled',
    14: 'Dropout',
    13: 'Dropout',
    12: 'Enrolled',
    0: 'Enrolled',
    4: 'Enrolled',
    11: 'Enrolled',
    7: 'Enrolled',
    10: 'Enrolled',
    16: 'Enrolled',
    6: 'Enrolled',
    18: 'Dropout',
    8: 'Enrolled'}},
  17: 'Enrolled',
  11: {'Age at enrollment': {21: 'Enrolled',
    25: 'Enrolled',
    22: 'Enrolled',
    20: 'Enrolled',
    37: 'Enrolled',
    34: 'Enrolled',
    36: 'Enrolled',
    19: 'Enrolled',
    44: 'Enrolled',
    28: 'Dropout',
    26: 'Enrolled',
    33: 'Enrolled',
    31: 'Dropout',
    29: 'Dropout',
    24: 'Enrolled',
    23: 'Dropout',
    54: 'Enrolled'}},
  1: 'Dropout',
  13: 'Enrolled',
  18: 'Enrolled',
  16: 'Dropout',
  19: 'Enrolled',
  20: 'Enr

In [23]:
# Gain threshold 0.3, sample threshold 10; scored on the validation split.
arbol_0310 = id3(training, "Target", min_split_gain=0.3, min_sample_split=10)
print(evaluate(arbol_0310, validation, "Target"))
# Tree size (noted as "depth" by the author): 64 printed lines

0.8305084745762712


In [24]:
# Gain threshold 0.25, sample threshold 10; scored on the validation split.
arbol_02510 = id3(training, "Target", min_split_gain=0.25, min_sample_split=10)
print(evaluate(arbol_02510, validation, "Target"))
arbol_02510

# Tree size (noted as "depth" by the author): 70 printed lines

0.8305084745762712


{'Curricular units 2nd sem (approved)': {3: 'Enrolled',
  5: 'Enrolled',
  6: 'Enrolled',
  7: 'Enrolled',
  8: 'Enrolled',
  2: 'Dropout',
  9: {'Curricular units 1st sem (credited)': {2: 'Enrolled',
    8: 'Enrolled',
    7: 'Enrolled',
    11: 'Enrolled',
    5: 'Enrolled',
    6: 'Enrolled',
    4: 'Enrolled',
    12: 'Dropout',
    10: 'Enrolled',
    3: 'Enrolled',
    9: 'Dropout'}},
  0: 'Dropout',
  4: 'Enrolled',
  10: {'Course': {8014: 'Enrolled',
    9070: 'Dropout',
    9147: 'Enrolled',
    9238: 'Enrolled',
    9991: 'Enrolled',
    9003: 'Dropout',
    171: 'Enrolled'}},
  14: 'Enrolled',
  12: {'Curricular units 1st sem (credited)': {9: 'Enrolled',
    14: 'Dropout',
    13: 'Dropout',
    12: 'Enrolled',
    0: 'Enrolled',
    4: 'Enrolled',
    11: 'Enrolled',
    7: 'Enrolled',
    10: 'Enrolled',
    16: 'Enrolled',
    6: 'Enrolled',
    18: 'Dropout',
    8: 'Enrolled'}},
  17: 'Enrolled',
  11: {'Age at enrollment': {21: 'Enrolled',
    25: 'Enrolled',
    22: '

In [25]:
# Gain threshold 0.2, sample threshold 10; scored on the validation split.
arbol_0210 = id3(training, "Target", min_split_gain=0.2, min_sample_split=10)
print(evaluate(arbol_0210, validation, "Target"))
arbol_0210

# Tree size (noted as "depth" by the author): 85 printed lines


0.826271186440678


{'Curricular units 2nd sem (approved)': {3: 'Enrolled',
  5: 'Enrolled',
  6: 'Enrolled',
  7: 'Enrolled',
  8: 'Enrolled',
  2: 'Dropout',
  9: {'Curricular units 1st sem (credited)': {2: 'Enrolled',
    8: 'Enrolled',
    7: 'Enrolled',
    11: 'Enrolled',
    5: 'Enrolled',
    6: 'Enrolled',
    4: 'Enrolled',
    12: 'Dropout',
    10: 'Enrolled',
    3: 'Enrolled',
    9: 'Dropout'}},
  0: 'Dropout',
  4: 'Enrolled',
  10: {'Course': {8014: 'Enrolled',
    9070: 'Dropout',
    9147: 'Enrolled',
    9238: 'Enrolled',
    9991: 'Enrolled',
    9003: 'Dropout',
    171: 'Enrolled'}},
  14: 'Enrolled',
  12: {'Curricular units 1st sem (credited)': {9: 'Enrolled',
    14: 'Dropout',
    13: 'Dropout',
    12: 'Enrolled',
    0: 'Enrolled',
    4: 'Enrolled',
    11: 'Enrolled',
    7: 'Enrolled',
    10: 'Enrolled',
    16: 'Enrolled',
    6: 'Enrolled',
    18: 'Dropout',
    8: 'Enrolled'}},
  17: 'Enrolled',
  11: {'Age at enrollment': {21: 'Enrolled',
    25: 'Enrolled',
    22: '

In [26]:
# Gain threshold 0.1, sample threshold 10; scored on the validation split.
arbol_0110 = id3(training, "Target", min_split_gain=0.1, min_sample_split=10)
print(evaluate(arbol_0110, validation, "Target"))

# Tree size (noted as "depth" by the author): 319 printed lines


0.7669491525423728


### Mejores parámetros:
* min_split_gain = 0.3
* min_sample_split = 10
* bins: resultan indiferentes, ya que con los parámetros anteriores las variables discretizadas no aparecen en los niveles superiores del árbol



#### Resultados en test: 0.825 accuracy

In [27]:
# Final tree: selected thresholds, fitted on the whole train frame (training + validation rows).
arbol_final = id3(train, "Target", min_split_gain=0.3, min_sample_split=10)


In [28]:
# Accuracy of the final tree on the held-out test frame.
print(evaluate(arbol_final, test, "Target"))

0.8248587570621468


## Comparación con Scikit learn

In [32]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


#### Comparación con decision tree de sklearn

Los resultados de accuracy son prácticamente iguales (≈0.82 en ambos casos) si usamos el mismo valor de min_samples_split

In [33]:
# sklearn tree with the entropy criterion, leaving min_samples_split at 2.
# random_state is fixed (same seed as the data splits) so that re-running the
# notebook yields the same tree.
tree = DecisionTreeClassifier(criterion="entropy", random_state=42)

In [34]:
# Fit on the training features/labels, predict the test features and report
# per-class precision, recall and F1.
y_pred = tree.fit(X_train, y_train).predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Dropout       0.71      0.71      0.71       284
    Enrolled       0.86      0.86      0.86       601

    accuracy                           0.81       885
   macro avg       0.78      0.79      0.79       885
weighted avg       0.81      0.81      0.81       885



In [35]:
# Same sklearn tree, now with min_samples_split=10 as in the ID3 runs above.
# random_state is fixed so that re-running the notebook yields the same tree.
tree2 = DecisionTreeClassifier(criterion="entropy", min_samples_split=10, random_state=42)

tree2.fit(X_train, y_train)

y_pred = tree2.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Dropout       0.72      0.70      0.71       284
    Enrolled       0.86      0.87      0.87       601

    accuracy                           0.82       885
   macro avg       0.79      0.79      0.79       885
weighted avg       0.82      0.82      0.82       885



#### Comparación con Random Forest de Sklearn

El método de Random Forest obtiene mejores resultados con los hiperparámetros predeterminados

In [38]:
# Random forest with default hyperparameters; random_state is fixed because the
# ensemble is stochastic and the reported metrics should be reproducible.
forest = RandomForestClassifier(random_state=42)

forest.fit(X_train, y_train)

y_pred = forest.predict(X_test)

print(metrics.classification_report(y_test, y_pred))


              precision    recall  f1-score   support

     Dropout       0.85      0.72      0.78       284
    Enrolled       0.88      0.94      0.91       601

    accuracy                           0.87       885
   macro avg       0.86      0.83      0.84       885
weighted avg       0.87      0.87      0.87       885

