In [270]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn import datasets, ensemble

from evidently.report import Report

from evidently.metrics import *

from evidently import ColumnMapping

In [271]:
# Load the raw training CSV (relative path, resolved against the working directory).
df = pd.read_csv(filepath_or_buffer='../data/raw/training_2009.csv')

In [272]:
# Keep only the columns used for modelling; `hs_diploma` (last) is the label
# that the next cell splits off from the features.
temp = df[[
    'male',
    'race_ethnicity',
    'frpl',
    'iep',
    'ell',
    'ever_alternative',
    'ap_ever_take_class',
    'math_ss',
    'read_ss',
    'pct_days_absent',
    'gpa',
    'scale_score_11_eng',
    'scale_score_11_math',
    'scale_score_11_read',
    'scale_score_11_comp',
    'hs_diploma',
]]

In [273]:
from sklearn.model_selection import train_test_split

# Label vector first, then the feature frame (everything except the label).
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
y = temp['hs_diploma']
X = temp.drop(columns=['hs_diploma'])

In [274]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=67,
)

In [275]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Column names grouped by dtype.
# NOTE(review): the ColumnTransformer below selects columns with dtype selectors
# rather than these two indexes, so within this cell they are informational only.
numeric_features = X.select_dtypes(exclude='object').columns
categorical_features = X.select_dtypes(include='object').columns

# Non-object columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Object columns: one-hot encode, ignoring categories not seen during fit.
categorical_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each dtype group to its transformer; the "num"/"cat" names become the
# prefixes of the output feature names.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, selector(dtype_exclude="object")),
    ("cat", categorical_transformer, selector(dtype_include="object")),
])

In [276]:
# Fit the preprocessing steps on the training split only, then apply the fitted
# transformer to the test split.
# NOTE: both names are rebound to the transformed output, so re-running this cell
# without first re-running the train/test split feeds already-transformed data
# back into the preprocessor.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [277]:
import xgboost as xgb

# Binary-logistic XGBoost classifier with a fixed seed.
# NOTE(review): the hyperparameter values look like the output of a tuning run —
# confirm their provenance before reusing them on other data.
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    colsample_bytree=0.7055649040625979,
    gamma=6.528941578100736,
    max_depth=11,
    min_child_weight=7.0,
    reg_alpha=40.0,
    reg_lambda=0.0067382393835737076,
    random_state=42,
)

In [278]:
# Train on the preprocessed training split and, in the same expression, predict
# class labels for the held-out split (`fit` returns the fitted estimator).
predictions = xgb_clf.fit(X_train, y_train).predict(X_test)

In [358]:
# Markdown for the "Model Details" section of the model card; it is passed to
# evidently's Comment metric when the report is assembled.
# Spelling in the rendered text was corrected (receive, pipeline, accuracy,
# predicted probabilities) because this string is shown verbatim to readers.
model_details = """
  # Model Details

  ## Description
  * **Model name**: Proof of Concept for Machine Learning in Student Analytics
  * **Model author**: Ryan Wilson - Student Analytics
  * **Model type**: Probabilistic classification model.
  * **Model architecture**: XGBoost
  * **Date**: October 2023.

  ## Intended use
  * **Primary use case**: The purpose of this project is to create a pipeline for classification problems.  In the case of this project we
  are predicting whether a student will receive a diploma.  The pipeline tested 7 classification models and the best model was selected on
  its accuracy, precision, and recall. The result is predicted probabilities of graduation and metrics of data quality and model quality.  
  This can serve as a template which can be used in different classification questions across different analytic products.  
   
  * **Out of scope**: This model is a proof of concept and not intended for production use.
"""

In [376]:
# Markdown for the "Training dataset" section of the model card, assembled
# line by line so leading and trailing whitespace is explicit.
training_dataset = "\n".join([
    "",
    "  # Training dataset",
    "",
    "  * **Training dataset**: Fake data set from the strategic data project.",
    "  * **Source**: dataset from [OpenSDP](https://opensdp.github.io)",
    "  ",
])

In [377]:

# Markdown for the "Model evaluation" section of the model card.
# The recall sentence was corrected: the previous wording restated precision
# ("how often the model is correct at predicting the positive class") instead of
# describing the share of actual target-class cases that the model finds.
model_evaluation = """
  # Model evaluation

  * **Evaluation dataset**: SDP Fake data set 20% of the original
  * **Metrics**: Accuracy, Precision, Recall.  Accuracy measures how often the model is correct overall.  Precision measures how often the model 
  is correct when it predicts the target class.  Recall measures how many of the actual target-class cases the model correctly identifies.  
  * **Decision threshold**: 0.5, objects with predicted probability over 0.5 belong to the target class.
"""

In [378]:
# Markdown for the closing "Caveats and Recommendations" section of the model card.
considerations = "\n".join([
    "",
    "  # Caveats and Recommendations",
    "Proof of concept, not for production use.",
    "",
])

In [379]:
# prep data
def _labelled_frame(features, labels):
    """Return preprocessed features as a DataFrame with the true label as first column.

    Column names are read from the fitted ``preprocessor`` instead of being
    hard-coded, so they cannot drift from the actual transformer output (the
    one-hot columns depend on which categories occur in the training data).

    Parameters
    ----------
    features : array-like or scipy sparse matrix
        Output of ``preprocessor.fit_transform`` / ``preprocessor.transform``.
    labels : pandas.Series
        True labels, row-aligned with ``features``. The Series name is used as
        the label column name.

    Returns
    -------
    pandas.DataFrame
        Fresh 0..n-1 index; the label column first, followed by one column per
        preprocessor output feature.
    """
    # ColumnTransformer may hand back a sparse matrix when its stacked output is
    # sparse enough; DataFrame construction needs a dense array.
    dense = features.toarray() if hasattr(features, "toarray") else features
    frame = pd.DataFrame(dense, columns=preprocessor.get_feature_names_out())
    # Insert by position (not by index alignment) so the labels keep the row
    # order of the feature matrix regardless of the Series' own index.
    frame.insert(0, labels.name, labels.to_numpy())
    return frame


# Training split serves as the report's reference data, held-out split as current.
reference = _labelled_frame(X_train, y_train)
current = _labelled_frame(X_test, y_test)


In [380]:
# Preview the first rows: the true label column followed by the preprocessed features.
reference.head()

Unnamed: 0,hs_diploma,num__male,num__frpl,num__iep,num__ell,num__ever_alternative,num__ap_ever_take_class,num__math_ss,num__read_ss,num__pct_days_absent,...,num__scale_score_11_eng,num__scale_score_11_math,num__scale_score_11_read,num__scale_score_11_comp,cat__race_ethnicity_African-American,cat__race_ethnicity_Asian/Pacific Islander,cat__race_ethnicity_Hispanic,cat__race_ethnicity_Multiple/Native American,cat__race_ethnicity_White,cat__race_ethnicity_nan
0,1,0.96134,0.800675,-0.355583,-0.126301,-0.615368,-0.519719,0.692965,0.509396,-0.377791,...,1.101301,0.559964,1.023842,0.856601,0.0,0.0,1.0,0.0,0.0,0.0
1,1,0.96134,-1.248946,-0.355583,-0.126301,-0.615368,-0.519719,1.154435,0.509396,-0.368607,...,1.442667,1.969583,1.405498,1.752841,0.0,0.0,0.0,0.0,1.0,0.0
2,1,0.96134,-1.248946,-0.355583,-0.126301,-0.615368,-0.519719,0.128946,-0.294521,0.577701,...,-0.09348,-0.379783,-0.121125,-0.03964,0.0,0.0,0.0,0.0,1.0,0.0
3,1,-1.040215,0.800675,-0.355583,-0.126301,-0.615368,-0.519719,-0.383799,0.308417,-0.347587,...,0.759935,0.7949,0.451358,0.63254,0.0,0.0,0.0,0.0,0.0,1.0
4,0,-1.040215,0.800675,-0.355583,-0.126301,1.625043,-0.519719,1.974827,1.782266,0.848197,...,2.466765,1.734647,1.405498,1.976901,1.0,0.0,0.0,0.0,0.0,0.0


In [385]:
# Predicted class labels for both splits, stored under 'target' — the column name
# that the column-mapping cell registers as the prediction column.
# The prediction array is assigned directly: wrapping it in a DataFrame made the
# assignment depend on both objects sharing the same default integer index.
reference['target'] = xgb_clf.predict(reference[preprocessor.get_feature_names_out()])
current['target'] = xgb_clf.predict(current[preprocessor.get_feature_names_out()])



In [386]:
# Store the second predict_proba column (probability of class 1) under
# 'prediction' for both the reference and the current frame.
for _frame in (reference, current):
    _frame['prediction'] = xgb_clf.predict_proba(
        _frame[preprocessor.get_feature_names_out()]
    )[:, 1]

In [387]:
# Preview again: `hs_diploma` is the true label, `target` the predicted class and
# `prediction` the predicted probability added by the two cells above.
reference.head()

Unnamed: 0,hs_diploma,num__male,num__frpl,num__iep,num__ell,num__ever_alternative,num__ap_ever_take_class,num__math_ss,num__read_ss,num__pct_days_absent,...,num__scale_score_11_read,num__scale_score_11_comp,cat__race_ethnicity_African-American,cat__race_ethnicity_Asian/Pacific Islander,cat__race_ethnicity_Hispanic,cat__race_ethnicity_Multiple/Native American,cat__race_ethnicity_White,cat__race_ethnicity_nan,target,prediction
0,1,0.96134,0.800675,-0.355583,-0.126301,-0.615368,-0.519719,0.692965,0.509396,-0.377791,...,1.023842,0.856601,0.0,0.0,1.0,0.0,0.0,0.0,1,0.976213
1,1,0.96134,-1.248946,-0.355583,-0.126301,-0.615368,-0.519719,1.154435,0.509396,-0.368607,...,1.405498,1.752841,0.0,0.0,0.0,0.0,1.0,0.0,1,0.978897
2,1,0.96134,-1.248946,-0.355583,-0.126301,-0.615368,-0.519719,0.128946,-0.294521,0.577701,...,-0.121125,-0.03964,0.0,0.0,0.0,0.0,1.0,0.0,0,0.232173
3,1,-1.040215,0.800675,-0.355583,-0.126301,-0.615368,-0.519719,-0.383799,0.308417,-0.347587,...,0.451358,0.63254,0.0,0.0,0.0,0.0,0.0,1.0,1,0.978635
4,0,-1.040215,0.800675,-0.355583,-0.126301,1.625043,-0.519719,1.974827,1.782266,0.848197,...,1.405498,1.976901,1.0,0.0,0.0,0.0,0.0,0.0,1,0.792653


In [388]:
# Tell evidently which columns to compare: `hs_diploma` holds the true labels and
# `target` holds the model's predicted class labels.
column_mapping = ColumnMapping(
    target='hs_diploma',
    prediction='target',
)

In [389]:

# Assemble the model card: each free-text Comment section is followed by the
# evidently metric(s) that illustrate it.
model_card = Report(
    metrics=[
        # Model details + label balance
        Comment(model_details),
        ClassificationClassBalance(),
        # Training data description + dataset summary
        Comment(training_dataset),
        DatasetSummaryMetric(),
        # Evaluation description + quality metrics and confusion matrix
        Comment(model_evaluation),
        ClassificationQualityMetric(),
        ClassificationConfusionMatrix(),
        # Caveats
        Comment(considerations),
    ]
)

# Training split is the reference data set, held-out split the current one.
model_card.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping,
)

In [390]:
# Render the assembled report inline as this cell's output.
model_card

In [391]:

# Write the model card to the artifacts folder. The folder is created first so the
# save does not fail on a fresh checkout where it does not exist yet.
Path('../artifacts').mkdir(parents=True, exist_ok=True)
model_card.save_html('../artifacts/model_card.html')

In [392]:
# Write a second copy of the model card to the reports folder, creating the folder
# first so the save does not fail when it is missing.
Path('../reports').mkdir(parents=True, exist_ok=True)
model_card.save_html('../reports/model_card.html')