# XGBoost Model

In [1]:
import pandas as pd
import numpy as np

In [19]:
#import xgboost
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Load the school data set (the version that carries the 'cambio_matricula'
# column) and discard every row that has at least one missing value.
DATA_URL = 'https://raw.githubusercontent.com/edghero/data/main/escuelas_nuevas.csv'
x = pd.read_csv(DATA_URL).dropna()

In [4]:
# Force 'cambio_matricula' to a numeric dtype; entries that cannot be parsed
# are turned into NaN by errors='coerce'.
# NOTE(review): the only dropna() visible here runs when the CSV is loaded, so
# any NaN produced by this coercion remains in the data -- confirm intended.
x = x.assign(
    cambio_matricula=pd.to_numeric(x['cambio_matricula'], errors='coerce')
)

In [5]:
# Use the 'escuela' column as the row index of the frame
x = x.set_index(keys='escuela')

In [6]:
# Recode 'nivel' to match DE's current naming convention.
# TODO: go over the 'interlocking' schools one by one to reclassify them properly.
NIVEL_RECODE = {
    'elemental': 'primario',
    'intermedia': 'primario',
    'especializada': 'todos los niveles',
    'superior': 'secundario',
    'interlocking': 'todos los niveles',
}
# Any level that is not a key of the mapping comes back as NaN from .map()
# and is then labelled 'otro', the same fallback the nested np.where used.
x['nivel'] = x['nivel'].map(NIVEL_RECODE).fillna('otro')

In [7]:
# Keep only the columns used downstream: region, level, the 'consolidada'
# column, enrollment figures and the four subject averages.
selected_columns = [
    'region',
    'nivel',
    'consolidada',
    'matricula',
    'cambio_matricula',
    'promedio_espanol',
    'promedio_matematica',
    'promedio_ingles',
    'promedio_ciencias',
]
x = x.loc[:, selected_columns]

In [8]:
# Keep the schools whose level is 'primario'; the 'nivel' column is constant
# after the filter, so it is dropped.
is_primario = x['nivel'].eq('primario')
primario = x.loc[is_primario].drop(columns=['nivel'])

In [9]:
# One-hot encode the categorical columns (dummy names are "<column>_<value>"),
# then remove the 'region_SAN JUAN' dummy column.
primario = pd.get_dummies(primario, prefix_sep="_")
primario = primario.drop(columns=['region_SAN JUAN'])

In [10]:
# Target: the 'consolidada' column; features: every other column of primario
y = primario['consolidada']
X = primario.drop(columns=['consolidada'])

In [11]:
# Hold out 30% of the rows as a test set; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

XGBoost Model

In [20]:
# Build a 10-tree gradient-boosted classifier with a logistic objective and
# train it on the training split.
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123)
xg_cl.fit(X_train, y_train)

# Predicted labels for the held-out rows
preds = xg_cl.predict(X_test)

# Accuracy = number of matching labels divided by the number of test rows
n_correct = np.sum(preds == y_test)
n_test_rows = y_test.shape[0]
accuracy = float(n_correct) / n_test_rows
print("accuracy: %f" % (accuracy))

accuracy: 0.760204


In [21]:
# Summarise the test-set predictions: confusion matrix first, then the
# per-class precision / recall / f1 report.
test_confusion = confusion_matrix(y_test, preds)
test_report = classification_report(y_test, preds)
print(test_confusion)
print(test_report)

[[105  26]
 [ 21  44]]
              precision    recall  f1-score   support

           0       0.83      0.80      0.82       131
           1       0.63      0.68      0.65        65

    accuracy                           0.76       196
   macro avg       0.73      0.74      0.73       196
weighted avg       0.77      0.76      0.76       196



In [16]:
# Refit the same classifier on the full primario feature set, then score those
# same rows. These probabilities are in-sample (the model has seen every row).
prediction = xg_cl.fit(X, y).predict_proba(X)

# predict_proba yields one column per class; column index 1 is stored on
# primario as 'prediction'.
primario['prediction'] = prediction[:, 1]

In [13]:
# Wrap the full feature matrix and labels in XGBoost's DMatrix container,
# which is the input type expected by xgb.cv.
churn_dmatrix = xgb.DMatrix(data=X, label=y)

# Booster parameters. The target is a binary label, so the classification
# objective 'binary:logistic' is used (the same objective the XGBClassifier
# above is built with) instead of the regression objective 'reg:logistic'.
params = {"objective": "binary:logistic", "max_depth": 3}

# 3-fold cross-validation over 5 boosting rounds, tracking the "error"
# metric; the result is a DataFrame with one row per boosting round.
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params,
                    nfold=3, num_boost_round=5,
                    metrics="error", as_pandas=True, seed=123)

# Per-round train/test error (mean and std across folds)
print(cv_results)

# Accuracy after the last boosting round = 1 - mean test error
print(((1-cv_results["test-error-mean"]).iloc[-1]))

   train-error-mean  train-error-std  test-error-mean  test-error-std
0          0.202910         0.008481         0.278823        0.053854
1          0.192970         0.013193         0.251237        0.040715
2          0.193740         0.017213         0.232825        0.025490
3          0.183014         0.011019         0.229760        0.028735
4          0.180723         0.017608         0.231303        0.032374
0.7686973333333333


In [14]:
# Same 3-fold, 5-round cross-validation as before, this time tracking AUC
cv_results = xgb.cv(
    params=params,
    dtrain=churn_dmatrix,
    num_boost_round=5,
    nfold=3,
    metrics="auc",
    as_pandas=True,
    seed=123,
)

# Per-round train/test AUC (mean and std across folds)
print(cv_results)

# Mean test AUC after the final boosting round
final_test_auc = cv_results["test-auc-mean"].iloc[-1]
print(final_test_auc)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.863757       0.002804       0.810364      0.019062
1        0.875846       0.001067       0.814486      0.022019
2        0.886505       0.000782       0.819896      0.021514
3        0.896948       0.002784       0.826558      0.018537
4        0.902939       0.002021       0.829445      0.016234
0.8294453333333335


Decision Tree Model

In [None]:
# Instantiate a depth-4 decision tree: dt_clf_4.
# random_state is fixed (same seed value used for the train/test split) so
# that refitting the tree gives the same model, and therefore the same
# accuracy, on every run.
dt_clf_4 = DecisionTreeClassifier(max_depth=4, random_state=123)

# Fit the classifier to the training set
dt_clf_4.fit(X_train, y_train)

# Predict the labels of the test set: y_pred_4
y_pred_4 = dt_clf_4.predict(X_test)

# Accuracy = share of test rows whose predicted label matches the true label
accuracy = float(np.sum(y_pred_4==y_test))/y_test.shape[0]
print("accuracy:", accuracy)

accuracy: 0.7142857142857143
