In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
from tensorflow.keras.optimizers import SGD

In [None]:
# Import the EarlyStopping callback
from keras.callbacks import EarlyStopping

In [None]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the schools data set (the version that includes the 'cambio_matricula'
# column) and discard every row that has a missing value in any column.
DATA_URL = 'https://raw.githubusercontent.com/edghero/data/main/escuelas_nuevas.csv'
escuelas_csv = pd.read_csv(DATA_URL)
x = escuelas_csv.dropna()

In [None]:
# Coerce 'cambio_matricula' to a numeric dtype; entries that cannot be parsed
# become NaN (errors='coerce').
x['cambio_matricula'] = pd.to_numeric(x['cambio_matricula'], errors='coerce')
# The dropna() applied when the CSV was loaded ran before this coercion, so any
# NaN created here would otherwise survive into the model inputs. Drop those rows.
x = x.dropna(subset=['cambio_matricula'])

In [None]:
# Index the rows by school name ('escuela') so the name travels with each row
# as a label instead of as a data column.
x = x.set_index(keys='escuela')

In [None]:
# Rename 'nivel' to match DE's current naming convention.
# TODO: go over the 'interlocking' schools one by one to reclassify them properly.
NIVEL_MAP = {
    'elemental': 'primario',
    'intermedia': 'primario',
    'especializada': 'todos los niveles',
    'superior': 'secundario',
    'interlocking': 'todos los niveles',
    # Already-converted labels map to themselves so that re-running this cell
    # does not collapse every row to 'otro'.
    'primario': 'primario',
    'secundario': 'secundario',
    'todos los niveles': 'todos los niveles',
}
# Any label that is not listed in the mapping falls back to 'otro'.
x['nivel'] = x['nivel'].map(NIVEL_MAP).fillna('otro')

In [None]:
# Keep only the columns that are carried forward to the modelling steps.
model_columns = [
    'region', 'nivel', 'consolidada', 'matricula', 'cambio_matricula',
    'promedio_espanol', 'promedio_matematica', 'promedio_ingles', 'promedio_ciencias',
]
x = x[model_columns]

In [None]:
# Preview the first rows only instead of rendering the entire frame.
x.head()

Unnamed: 0,ANO_ESCOLAR,CODIGO_ESCUELA,region,municipio,escuela,escuela_receptora,nivel,consolidada,matricula,matricula_2016-17,cambio_matricula,promedio_espanol,promedio_matematica,promedio_ingles,promedio_ciencias
0,2017-18,15784,ARECIBO,ARECIBO,Abelardo Martinez Otero,0,superior,0,464,373.0,0.24,65.5,12.1,80.2,67.0
1,2017-18,74807,ARECIBO,VEGA BAJA,Agapito Rosario Rosario,0,elemental,0,473,531.0,-0.11,57.8,65.3,67.3,77.1
2,2017-18,17467,ARECIBO,CAMUY,Amalia Lopez de Avila (Nueva),0,elemental,0,332,148.0,1.24,70.4,76.7,76.1,90.4
3,2017-18,71886,ARECIBO,VEGA BAJA,Angel Sandin Martinez,0,intermedia,0,473,531.0,-0.11,57.8,65.3,67.3,77.1
4,2017-18,10637,ARECIBO,ARECIBO,Angelica Gomez de Betancourt,0,interlocking,0,214,223.0,-0.04,31.9,23.5,24.4,51.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1086,2017-18,69179,SAN JUAN,TRUJILLO ALTO,Alejandro Tapia y Rivera,0,interlocking,1,411,203.0,1.02,24.0,1.0,18.0,28.0
1087,2017-18,69054,SAN JUAN,TRUJILLO ALTO,Jose F Diaz,0,elemental,0,229,373.0,-0.39,24.0,43.8,17.9,40.0
1088,2017-18,62968,SAN JUAN,SAN JUAN,Villa Capri,0,elemental,0,222,255.0,-0.13,96.8,98.9,92.6,100.0
1089,2017-18,65557,SAN JUAN,SAN JUAN,Villa Granada,0,elemental,0,387,405.0,-0.04,29.7,10.8,14.7,32.6


In [None]:
# Inspect the column dtypes of the 'primario' subset. The subset is computed
# inline because the `primario` variable is only assigned in a later cell, so
# referencing it here fails with NameError on a fresh kernel.
x[x['nivel'] == 'primario'].drop(columns='nivel').dtypes

In [None]:
# Keep the rows whose level is 'primario'; 'nivel' is constant within the
# subset afterwards, so the column is dropped.
is_primario = x['nivel'].eq('primario')
primario = x.loc[is_primario].drop(columns='nivel')

In [None]:
# Keep the rows whose level is 'secundario'; 'nivel' is constant within the
# subset afterwards, so the column is dropped.
is_secundario = x['nivel'].eq('secundario')
secundario = x.loc[is_secundario].drop(columns='nivel')

In [None]:
# Keep the rows whose level is 'todos los niveles'; 'nivel' is constant within
# the subset afterwards, so the column is dropped.
is_todos = x['nivel'].eq('todos los niveles')
todos = x.loc[is_todos].drop(columns='nivel')

In [None]:
# One-hot encode the non-numeric columns (get_dummies' default selection), then
# remove the SAN JUAN indicator so that region is not represented redundantly.
primario = pd.get_dummies(primario, prefix_sep="_")
primario = primario.drop(columns=['region_SAN JUAN'])

In [None]:
# One-hot encode the non-numeric columns (get_dummies' default selection), then
# remove the SAN JUAN indicator so that region is not represented redundantly.
secundario = pd.get_dummies(secundario, prefix_sep="_")
secundario = secundario.drop(columns=['region_SAN JUAN'])


In [None]:
# One-hot encode the non-numeric columns (get_dummies' default selection), then
# remove the SAN JUAN indicator so that region is not represented redundantly.
todos = pd.get_dummies(todos, prefix_sep="_")
todos = todos.drop(columns=['region_SAN JUAN'])

In [None]:
# Features: every column except the target. 'predictions' is excluded as well
# (when present) because later cells add that column to `primario`; re-running
# this cell afterwards would otherwise feed the model's own output back in as
# a feature.
predictors_primario = primario.drop(columns=['consolidada', 'predictions'], errors='ignore')
# One-hot encode the target. num_classes is pinned to 2 so the encoding always
# matches the 2-unit softmax output layer, even if one class were absent here.
target_primario = to_categorical(primario['consolidada'], num_classes=2)

In [None]:
# Features: every column except the target.
predictors_secundario = secundario.drop(columns='consolidada')
# One-hot encode the target. num_classes is pinned to 2 so the encoding always
# has two columns, even if this subset happened to contain a single class.
target_secundario = to_categorical(secundario['consolidada'], num_classes=2)

In [None]:
# Features: every column except the target.
predictors_todos = todos.drop(columns='consolidada')
# One-hot encode the target. num_classes is pinned to 2 so the encoding always
# has two columns, even if this subset happened to contain a single class.
target_todos = to_categorical(todos['consolidada'], num_classes=2)

First Deep Learning Model for 'escuelas primarias' with 'Adam' optimizer

In [None]:
# Stop training after 5 consecutive epochs without improvement in the
# validation loss (the callback's default monitored quantity).
early_stopping_monitor = EarlyStopping(monitor='val_loss', patience=5)

In [None]:
# Set up the model: model
n_cols = predictors_primario.shape[1]

# validation_split holds out the LAST 30% of the rows exactly as passed in
# (Keras slices before shuffling). The data preview earlier in this notebook
# starts with ARECIBO schools and ends with SAN JUAN schools, so the rows appear
# to be grouped by region and an unshuffled hold-out would come from the last
# regions only. Shuffle once with a fixed seed (42, arbitrary) so the split is
# representative and repeatable.
shuffled_rows = np.random.default_rng(42).permutation(len(predictors_primario))
# Cast to float32 so boolean/int/float feature columns reach Keras as one dense array.
train_x = predictors_primario.to_numpy(dtype='float32')[shuffled_rows]
train_y = np.asarray(target_primario)[shuffled_rows]

model = Sequential()
model.add(Dense(100, activation="relu", input_shape=(n_cols,))) # Add the first layer
model.add(Dense(30, activation="relu")) # Add the second layer
model.add(Dense(50, activation="sigmoid")) # Add the third layer
model.add(Dense(2, activation = 'softmax')) # Add the output layer; softmax because this is a two-class classification problem

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics = ['accuracy'])

# Train for at most 50 epochs; the early-stopping callback may end training sooner.
model.fit(train_x, train_y, validation_split = 0.3, epochs = 50, callbacks = [early_stopping_monitor])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


<keras.callbacks.History at 0x7f633079cfd0>

In [None]:
# Score every 'primario' school with the fitted network; each output row holds
# the two softmax outputs of the final layer.
predictions_primario = model.predict(x=predictors_primario)

In [None]:
# Store the second softmax output (index 1, i.e. class 1 of 'consolidada')
# next to the true labels.
primario = primario.assign(predictions=predictions_primario[:, 1])

In [None]:
# Turn the stored probability into a hard 0/1 label by rounding to the nearest integer.
primario['predictions'] = primario['predictions'].round()

In [None]:
# Compare the true labels with the rounded predictions: confusion matrix first,
# then the per-class precision/recall report.
for metric in (confusion_matrix, classification_report):
    print(metric(primario['consolidada'], primario['predictions']))

[[372  71]
 [ 83 127]]
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       443
           1       0.64      0.60      0.62       210

    accuracy                           0.76       653
   macro avg       0.73      0.72      0.73       653
weighted avg       0.76      0.76      0.76       653



In [None]:
from keras.models import load_model

In [None]:
# Persist the trained Adam model to an HDF5 file in the working directory.
model.save(filepath='model_DL_adam.h5')

Deep Learning Model with SGD optimizer with different learning rates

In [None]:
# Shorter patience for the learning-rate sweep: stop after 2 consecutive epochs
# without improvement in the validation loss (the callback's default monitor).
early_stopping_monitor = EarlyStopping(monitor='val_loss', patience=2)

In [None]:
n_cols = predictors_primario.shape[1]

lr_to_test = [.000001, .001, .00025]

# validation_split holds out the LAST 30% of the rows exactly as passed in
# (Keras slices before shuffling). The data preview earlier in this notebook
# starts with ARECIBO schools and ends with SAN JUAN schools, so the rows appear
# to be grouped by region and an unshuffled hold-out would come from the last
# regions only. Shuffle once, outside the loop, with a fixed seed (42, arbitrary)
# so every learning rate is compared on the same representative split.
shuffled_rows = np.random.default_rng(42).permutation(len(predictors_primario))
# Cast to float32 so boolean/int/float feature columns reach Keras as one dense array.
train_x = predictors_primario.to_numpy(dtype='float32')[shuffled_rows]
train_y = np.asarray(target_primario)[shuffled_rows]

# loop over learning rates; a fresh network is built for each one
for lr in lr_to_test:
  print('\n\nTesting model with learning rate: %f\n'%lr )
  model = Sequential()
  model.add(Dense(100, activation="relu", input_shape=(n_cols,))) # Add the first layer
  model.add(Dense(30, activation="relu")) # Add the second layer
  model.add(Dense(50, activation="relu")) # Add the third layer
  model.add(Dense(2, activation = 'softmax')) # Add the output layer; softmax because this is a two-class classification problem
  my_optimizer = SGD(learning_rate=lr)
  model.compile(optimizer= my_optimizer, loss = 'categorical_crossentropy', metrics=['accuracy'])
  # Train for at most 50 epochs; the early-stopping callback may end training sooner.
  model.fit(train_x, train_y, validation_split=0.3, epochs= 50, callbacks=[early_stopping_monitor])


#### Model with SGD as optimizer and learning rate of 0.00025

In [None]:
# Back to a patience of 5 for the single-learning-rate run: stop after 5
# consecutive epochs without improvement in the validation loss.
early_stopping_monitor = EarlyStopping(monitor='val_loss', patience=5)

In [None]:
n_cols = predictors_primario.shape[1]

# Only the selected learning rate is trained here; the list form is kept so more
# rates can be added without restructuring the cell.
lr_to_test = [.00025]

# validation_split holds out the LAST 30% of the rows exactly as passed in
# (Keras slices before shuffling). The data preview earlier in this notebook
# starts with ARECIBO schools and ends with SAN JUAN schools, so the rows appear
# to be grouped by region and an unshuffled hold-out would come from the last
# regions only. Shuffle once with a fixed seed (42, arbitrary) so the split is
# representative and repeatable.
shuffled_rows = np.random.default_rng(42).permutation(len(predictors_primario))
# Cast to float32 so boolean/int/float feature columns reach Keras as one dense array.
train_x = predictors_primario.to_numpy(dtype='float32')[shuffled_rows]
train_y = np.asarray(target_primario)[shuffled_rows]

# loop over learning rates
for lr in lr_to_test:
  print('\n\nTesting model with learning rate: %f\n'%lr )
  model = Sequential()
  model.add(Dense(100, activation="relu", input_shape=(n_cols,))) # Add the first layer
  model.add(Dense(30, activation="relu")) # Add the second layer
  model.add(Dense(50, activation="relu")) # Add the third layer
  model.add(Dense(2, activation = 'softmax')) # Add the output layer; softmax because this is a two-class classification problem
  my_optimizer = SGD(learning_rate=lr)
  model.compile(optimizer= my_optimizer, loss = 'categorical_crossentropy', metrics=['accuracy'])
  # Train for at most 50 epochs; the early-stopping callback may end training sooner.
  model.fit(train_x, train_y, validation_split=0.3, epochs= 50, callbacks=[early_stopping_monitor])




Testing model with learning rate: 0.000250

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50


In [None]:
# Score every 'primario' school with the SGD-trained network; each output row
# holds the two softmax outputs of the final layer.
predictions_primario = model.predict(x=predictors_primario)

In [None]:
# Overwrite the stored predictions with the second softmax output (index 1,
# i.e. class 1 of 'consolidada') from the SGD model.
primario = primario.assign(predictions=predictions_primario[:, 1])

In [None]:
# Turn the stored probability into a hard 0/1 label by rounding to the nearest integer.
primario['predictions'] = primario['predictions'].round()

In [None]:
# Preview the first rows (including the new 'predictions' column) instead of
# rendering the entire frame.
primario.head()

In [None]:
# Evaluate the SGD model's rounded predictions against the true labels:
# confusion matrix first, then the per-class precision/recall report.
for metric in (confusion_matrix, classification_report):
    print(metric(primario['consolidada'], primario['predictions']))

[[440   3]
 [189  21]]
              precision    recall  f1-score   support

           0       0.70      0.99      0.82       443
           1       0.88      0.10      0.18       210

    accuracy                           0.71       653
   macro avg       0.79      0.55      0.50       653
weighted avg       0.76      0.71      0.61       653

