## 2.0 Model Training  
### 2.1 Import Required Packages

In [2]:
# Basic import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Modeling 
from sklearn.metrics import accuracy_score
# logistic regression
from sklearn.linear_model import LogisticRegression
# naive bayes
from sklearn.naive_bayes import GaussianNB
# Support Vector Machine
from sklearn import svm
# KNN
from sklearn.neighbors import KNeighborsClassifier
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# xgboost
import xgboost as xgb


### 2.2 Import data

In [5]:
# Load the raw training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/raw/training_2009.csv')

In [6]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,sid,sid_type,first_coop_code,first_dist_code,first_hs_code,first_dist_name,first_hs_name,first_hs_alt,first_hs_urbanicity,chrt_ninth,...,ihe_retention_lt_4_yr_part_time,ihe_federal_loan_rate,ihe_share_25_older,ihe_med_debt_completers_all,ihe_med_debt_completers_pmts,ihe_ihe_repay_3_yr_all,ihe_rate_4_yr,ihe_rate_lt_4_yr,ihe_med_earn_10_yrs_after,ihe_pct_earn_gt_25k_6_yrs_after
0,2,Fake record,WKEC,415,5194,Everett,Everett,0,Town: Remote,2009,...,,,,,,,,,,
1,5,Fake record,GRREC,199,1648,Kingfisher,Kingfisher,0,Town: Distant,2009,...,,,,,,,,,,
2,12,Fake record,GRREC,142,1564,Diamond Lake,Diamond Lake,0,Rural: Distant,2009,...,,,,,,,,,,
3,13,Fake record,OVEC,238,2230,Orange,Pike,0,Suburb: Large,2009,...,,,,,,,,,,
4,14,Fake record,NKCES,517,7658,Foster,Kent,0,Rural: Fringe,2009,...,,0.5491,0.2428,24458.5,271.53949,0.757102,0.37589,,36100.0,0.591997


In [7]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52942 entries, 0 to 52941
Data columns (total 68 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   sid                              52942 non-null  int64  
 1   sid_type                         52942 non-null  object 
 2   first_coop_code                  52942 non-null  object 
 3   first_dist_code                  52942 non-null  int64  
 4   first_hs_code                    52942 non-null  int64  
 5   first_dist_name                  52942 non-null  object 
 6   first_hs_name                    52942 non-null  object 
 7   first_hs_alt                     52942 non-null  int64  
 8   first_hs_urbanicity              52484 non-null  object 
 9   chrt_ninth                       52942 non-null  int64  
 10  male                             52931 non-null  float64
 11  race_ethnicity                   52257 non-null  object 
 12  frpl              

### 2.3 Select variables of importance

In [8]:
# Narrow the raw frame to the columns used for modelling.
temp = df[[
    # low-cardinality indicator / categorical columns
    'male', 'race_ethnicity', 'frpl', 'iep', 'ell',
    'ever_alternative', 'ap_ever_take_class',
    # numeric measures
    'math_ss', 'read_ss', 'pct_days_absent', 'gpa',
    # scale_score_11_* columns
    'scale_score_11_eng', 'scale_score_11_math',
    'scale_score_11_read', 'scale_score_11_comp',
    # target (split off as `y` further down)
    'hs_diploma',
]]

In [9]:
# Frequency table of the target: one row per hs_diploma value, a single 'count' column.
cross_tab = pd.crosstab(temp['hs_diploma'], 'count')

# Display the crosstab
print(cross_tab)


col_0       count
hs_diploma       
0            9975
1           42967


### 2.4 Recode demographic indicator columns as string labels (currently disabled)

In [10]:
# NOTE(review): this recoding step is disabled — every statement below is commented out.
# The column names it uses ('s_male', 's_ell', 's_iep', 'sch_charter', 'sch_alternative',
# 'sch_vocational') are not among the columns selected into `temp`, so the statements
# would have to be updated before being re-enabled.
#temp.loc[df['s_male'] == 0, 's_male'] = 'female'
#temp.loc[df['s_male'] == 1, 's_male'] = 'male'
#temp.loc[df['s_ell'] == 0, 's_ell'] = 'not_ell'
#temp.loc[df['s_ell'] == 1, 's_ell'] = 'ell'
#temp.loc[df['s_iep'] == 0, 's_iep'] = 'no_iep'
#temp.loc[df['s_iep'] == 1, 's_iep'] = 'iep'
#temp.loc[df['sch_charter'] == 0, 'sch_charter'] = 'no_charter'
#temp.loc[df['sch_charter'] == 1, 'sch_charter'] = 'yes_charter'
#temp.loc[df['sch_alternative'] == 0, 'sch_alternative'] = 'no_alt'
#temp.loc[df['sch_alternative'] == 1, 'sch_alternative'] = 'yes_alt'
#temp.loc[df['sch_vocational'] == 0, 'sch_vocational'] = 'no_voc'
#temp.loc[df['sch_vocational'] == 1, 'sch_vocational'] = 'yes_voc'


In [11]:
# Non-null counts and dtypes of the modelling subset.
temp.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52942 entries, 0 to 52941
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   male                 52931 non-null  float64
 1   race_ethnicity       52257 non-null  object 
 2   frpl                 52258 non-null  float64
 3   iep                  52942 non-null  int64  
 4   ell                  52942 non-null  int64  
 5   ever_alternative     52942 non-null  int64  
 6   ap_ever_take_class   52942 non-null  int64  
 7   math_ss              43953 non-null  float64
 8   read_ss              43967 non-null  float64
 9   pct_days_absent      52855 non-null  float64
 10  gpa                  51720 non-null  float64
 11  scale_score_11_eng   43049 non-null  float64
 12  scale_score_11_math  43042 non-null  float64
 13  scale_score_11_read  43022 non-null  float64
 14  scale_score_11_comp  43012 non-null  float64
 15  hs_diploma           52942 non-null 

In [12]:
# Number of distinct non-null values in each modelling column.
unique_values_counts = temp.nunique(axis=0, dropna=True)

# Column labels, read by the per-column loop in the next cell.
# NOTE(review): the name `vars` shadows Python's built-in vars(); it is kept
# because the following cell iterates over it.
vars = temp.columns

# Display the result
print(unique_values_counts)

male                       2
race_ethnicity             5
frpl                       2
iep                        2
ell                        2
ever_alternative           2
ap_ever_take_class         2
math_ss                   83
read_ss                   82
pct_days_absent        32213
gpa                    18739
scale_score_11_eng        34
scale_score_11_math       31
scale_score_11_read       35
scale_score_11_comp       29
hs_diploma                 2
dtype: int64


In [None]:
# List the distinct values of every modelling column, one line per column.
for var in temp.columns:
    unique = temp[var].unique()
    print(var, unique)

### 2.5 Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Target is the hs_diploma column; everything else in `temp` is a feature.
y = temp['hs_diploma']
X = temp.drop(columns=['hs_diploma'])

### Class distribution

In [None]:
# Count of rows per target class; also supplies the labels for the matplotlib pie chart below.
diploma_cnts = y.value_counts()
diploma_cnts


In [None]:
# Pie chart of the target class shares (percentages shown with two decimals).
y.value_counts().plot.pie(autopct = '%.2f')

In [None]:
# Same class-share pie drawn through the explicit matplotlib Axes interface.
# `plt` comes from the notebook's import cell.
fig1, ax1 = plt.subplots()
ax1.pie(
    y.value_counts(),
    labels=diploma_cnts.index,
    autopct='%.2f',
)

### Class balancing

In [None]:
# Split the data 80/20. The target is imbalanced (about 81% class 1 / 19% class 0
# in the crosstab above), so stratify on y to keep the same class proportions in
# both the training and the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=67, stratify=y
)

In [None]:
# Sanity check: row/column counts of each split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Class shares in the training split. Uses '%.2f' (two decimals) like the other
# pie charts in this notebook; the previous '%2.f' printed zero decimals.
y_train.value_counts().plot.pie(autopct = '%.2f')

In [None]:
# Absolute class counts in the training split.
y_train.value_counts()

### Create pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.compose import make_column_selector as selector

# Column labels by dtype. The ColumnTransformer below picks its columns with
# dtype selectors rather than with these two indexes.
numeric_features = X.select_dtypes(exclude='object').columns
categorical_features = X.select_dtypes(include='object').columns

# Non-object columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Object columns: one-hot encode; categories unseen during fit are ignored at transform time.
# NOTE(review): there is no imputer on this branch although race_ethnicity has
# missing values (see temp.info()) — confirm that passing them straight to the
# encoder is intended.
categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column to its branch by dtype.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, selector(dtype_exclude="object")),
    ("cat", categorical_transformer, selector(dtype_include="object")),
])

In [None]:
# LinearSVC is the only estimator not already covered by the notebook's import
# cell; the other classes and `xgb` come from there.
from sklearn.svm import LinearSVC

# One fixed seed for every estimator that accepts one, so repeated runs of the
# notebook report the same scores.
MODEL_SEED = 42

# Candidate classifiers keyed by display name. Insertion order determines the
# row order of the results tables built in later cells.
models = {
    # Sentinel, not an estimator: the evaluation loops treat the value 0 as
    # "predict class 1 for every row".
    'Baseline': 0,
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machines': LinearSVC(random_state=MODEL_SEED),
    'Decision Trees': DecisionTreeClassifier(random_state=MODEL_SEED),
    'Random Forest': RandomForestClassifier(random_state=MODEL_SEED),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbor': KNeighborsClassifier(),
    # XGBoost
    'xgBoost': xgb.XGBClassifier(objective="binary:logistic", random_state=MODEL_SEED),
}

In [None]:
# Show the configured model dictionary.
models

### Imbalanced Modeling

In [None]:
# Fit the preprocessing ColumnTransformer on the training split and replace
# X_train with the transformed result.
# NOTE(review): X_train is overwritten, so re-running this cell without first
# re-running the train/test split feeds already-transformed data back in.
X_train = preprocessor.fit_transform(X_train)


In [None]:
# Apply the preprocessor to the test split with transform() only (no refit)
# and replace X_test with the result.
# NOTE(review): X_test is overwritten, so re-running this cell without first
# re-running the train/test split feeds already-transformed data back in.
X_test = preprocessor.transform(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Test-set metrics per model, keyed by the model's display name.
accuracy, precision, recall = {}, {}, {}

for key, model in models.items():
    if model == 0:
        # Baseline sentinel: predict class 1 for every test row.
        predictions = np.ones(len(y_test))
    else:
        # Fit on the (imbalanced) training split, then predict the test split.
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). Passing the predictions first
    # leaves accuracy unchanged but swaps precision and recall.
    accuracy[key] = accuracy_score(y_test, predictions)
    precision[key] = precision_score(y_test, predictions)
    recall[key] = recall_score(y_test, predictions)

In [None]:
# Results table: one row per model (in `models` order), one column per metric.
df_model = pd.DataFrame(
    {
        'Accuracy': list(accuracy.values()),
        'Precision': list(precision.values()),
        'Recall': list(recall.values()),
    },
    index=list(models.keys()),
)

df_model

In [None]:
# Horizontal grouped bars: one group per model, one bar per metric.
ax = df_model.plot(kind='barh')
# Anchor the legend just above the axes so it does not cover the bars.
ax.legend(
    loc='lower left',
    bbox_to_anchor=(0, 1),
    ncol=len(models),
    prop={'size': 14},
)
plt.tight_layout()

### Random Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# sampling_strategy=1 requests a 1:1 minority-to-majority ratio after resampling.
# random_state is fixed so the same majority-class rows are dropped on every run;
# without it the under-sampled training set changes on each execution.
rus = RandomUnderSampler(sampling_strategy=1, random_state=42) # numerical
#rus = RandomUnderSampler(sampling_strategy='not minority') # string

# Only the training split is resampled; the test split keeps its original distribution.
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [None]:
# Class counts after under-sampling.
y_train_rus.value_counts()

In [None]:
# Pie chart of the class shares in the under-sampled training target.
ax = y_train_rus.value_counts().plot.pie(autopct = '%.2f')
# Assign to `_` so the Text object returned by set_title is not echoed as cell output.
_ = ax.set_title("Under-sampling")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Test-set metrics per model, keyed by the model's display name.
accuracy, precision, recall = {}, {}, {}

for key, model in models.items():
    if model == 0:
        # Baseline sentinel: predict class 1 for every test row.
        predictions = np.ones(len(y_test))
    else:
        # Fit on the under-sampled training data, then predict the untouched test split.
        model.fit(X_train_rus, y_train_rus)
        predictions = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). Passing the predictions first
    # leaves accuracy unchanged but swaps precision and recall.
    accuracy[key] = accuracy_score(y_test, predictions)
    precision[key] = precision_score(y_test, predictions)
    recall[key] = recall_score(y_test, predictions)

In [None]:
# Results table for the under-sampled run: one row per model, one column per metric.
df_model = pd.DataFrame(
    {
        'Accuracy': list(accuracy.values()),
        'Precision': list(precision.values()),
        'Recall': list(recall.values()),
    },
    index=list(models.keys()),
)

df_model

In [None]:
# Horizontal grouped bars for the under-sampled run: one group per model, one bar per metric.
ax = df_model.plot(kind='barh')
# Anchor the legend just above the axes so it does not cover the bars.
ax.legend(
    loc='lower left',
    bbox_to_anchor=(0, 1),
    ncol=len(models),
    prop={'size': 14},
)
plt.tight_layout()

### Random Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

# sampling_strategy=1 requests a 1:1 minority-to-majority ratio after resampling.
# random_state is fixed so the same minority-class rows are duplicated on every
# run; without it the over-sampled training set changes on each execution.
ros = RandomOverSampler(sampling_strategy=1, random_state=42) # numerical
#ros = RandomOverSampler(sampling_strategy='not minority') # string

# Only the training split is resampled; the test split keeps its original distribution.
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)


In [None]:
# Class counts after over-sampling.
y_train_ros.value_counts()

In [None]:
# Pie chart of the class shares in the over-sampled training target.
ax = y_train_ros.value_counts().plot.pie(autopct = '%.2f')
# Assign to `_` so the Text object returned by set_title is not echoed as cell output.
_ = ax.set_title("Over-sampling")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Test-set metrics per model, keyed by the model's display name.
accuracy, precision, recall = {}, {}, {}

for key, model in models.items():
    if model == 0:
        # Baseline sentinel: predict class 1 for every test row.
        predictions = np.ones(len(y_test))
    else:
        # Fit on the over-sampled training data, then predict the untouched test split.
        model.fit(X_train_ros, y_train_ros)
        predictions = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). Passing the predictions first
    # leaves accuracy unchanged but swaps precision and recall.
    accuracy[key] = accuracy_score(y_test, predictions)
    precision[key] = precision_score(y_test, predictions)
    recall[key] = recall_score(y_test, predictions)

In [None]:
# Results table for the over-sampled run: one row per model, one column per metric.
df_model = pd.DataFrame(
    {
        'Accuracy': list(accuracy.values()),
        'Precision': list(precision.values()),
        'Recall': list(recall.values()),
    },
    index=list(models.keys()),
)

df_model

In [None]:
# Horizontal grouped bars for the over-sampled run: one group per model, one bar per metric.
ax = df_model.plot(kind='barh')
# Anchor the legend just above the axes so it does not cover the bars.
ax.legend(
    loc='lower left',
    bbox_to_anchor=(0, 1),
    ncol=len(models),
    prop={'size': 14},
)
plt.tight_layout()

### Tune Model

In [None]:
import xgboost as xgb

# Wrap the preprocessed training split in XGBoost's DMatrix container; it is the
# `dtrain` input of the xgboost cv() call further down.
data_dmatrix = xgb.DMatrix(X_train, label=y_train)

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Search grid: 3 depths x 3 learning rates x 3 subsample ratios = 27 candidates.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[0.5, 0.7, 1],
)

# Estimator to tune (library defaults for everything outside the grid).
xgb_model = xgb.XGBClassifier()

# Exhaustive search, scored by 5-fold cross-validated accuracy on the training split.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

In [None]:
import xgboost as xgb
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Hyperopt search space. Entries built with hp.* are sampled on every trial;
# plain values (verbosity, n_estimators) are passed through unchanged.
space = dict(
    verbosity=0,
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=180,
)

In [None]:
def objective(space):
    """Hyperopt objective: train one XGBoost classifier and score it.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. Reads the keys 'verbosity',
        'n_estimators', 'max_depth', 'gamma', 'reg_alpha', 'reg_lambda',
        'min_child_weight' and 'colsample_bytree'.

    Returns
    -------
    dict
        {'loss': -accuracy, 'status': STATUS_OK}. The loss is the negated
        accuracy on (X_test, y_test) because hyperopt minimises.
    """
    clf = xgb.XGBClassifier(
        verbosity=space['verbosity'],
        n_estimators=space['n_estimators'],
        # quniform yields floats; these parameters are cast to int as before.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # reg_lambda is sampled in the search space but was never handed to the model.
        reg_lambda=space['reg_lambda'],
        min_child_weight=int(space['min_child_weight']),
        # Keep the sampled fraction as a float: int() truncated every value
        # below 1 to 0, which is not a usable column-sampling ratio.
        colsample_bytree=space['colsample_bytree'],
    )

    # NOTE(review): the test split is used both for early stopping and for the
    # score that drives the search, so the reported accuracy is optimistic —
    # consider a separate validation split.
    evaluation = [(X_train, y_train), (X_test, y_test)]

    # NOTE(review): recent xgboost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than on fit() — confirm
    # against the installed version.
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="error",
            early_stopping_rounds=10, verbose=False)

    # predict() already returns class labels, so no thresholding is needed.
    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK}

In [None]:
# Collects the result of every evaluation so the search history can be inspected afterwards.
trials = Trials()

# Run 100 evaluations of `objective` over `space` using the TPE suggestion algorithm.
best_hyperparams = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

In [None]:
# Report the best point returned by the hyperopt search.
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
# Recompute and display the target class counts (same statement as in the
# class-distribution section earlier in the notebook).
diploma_cnts = y.value_counts()
diploma_cnts


In [None]:
from xgboost import cv

# Booster parameters for the native cross-validation run.
params = {
    "objective": "binary:logistic",
    'colsample_bytree': 0.3,
    'learning_rate': 0.3,
    'max_depth': 6,
    'alpha': 10,
}

# 5-fold CV on the training DMatrix: up to 50 boosting rounds, stopping early
# after 10 rounds without improvement in the "error" metric; the result is
# returned as a DataFrame.
xgb_cv = cv(
    params=params,
    dtrain=data_dmatrix,
    num_boost_round=50,
    nfold=5,
    metrics="error",
    early_stopping_rounds=10,
    as_pandas=True,
    seed=123,
)

In [None]:
# Show the per-round cross-validation results.
xgb_cv

In [None]:
# `xgb_clf` was not defined by any cell in this notebook, so this cell failed on
# a fresh kernel. Use the tuned XGBoost model refit by the grid search above.
# NOTE(review): swap in another fitted XGBClassifier here if a different model
# was intended.
xgb_clf = grid_search.best_estimator_

# One row per preprocessed feature (index = transformer output names); the
# single column, labelled 0, holds the model's importance score.
feat_imp = pd.DataFrame(
    xgb_clf.feature_importances_,
    index=preprocessor.get_feature_names_out(),
)

In [None]:
# Horizontal bars sorted by importance (column 0). In a barh chart the scores
# run along the x-axis and the feature names sit on the y-axis, so the score
# label belongs on x.
feat_imp.sort_values(0).plot(kind='barh', title='Feature Importances')
plt.xlabel('Feature Importance Score')