# XGBoost implementation

In [79]:
# Packages and libraries
import pandas as pd
import xgboost as xgb

In [80]:
# XGBoost data prep
#remove white space in columns
#NOTE: GENDER_bin=1 is female and GENDER_bin=0 is males

df = pd.read_csv("df_sum_score_py.csv")
df.replace(' ','_',regex=True,inplace=True)

# there were 14 missing values, which has earlier been set to 0 in STRATUM. 
# Since this is only approximately 0.116% of the entire data, we do not need to do anything further. 
# XGBoost is very well at handling missing data, and we just need to ensure that it's set to 0. 
df.STRATUM.isin(['0']).sum()

14

In [81]:
# Splitting the data 
# X is the data which will be used to make predictions, and y contains the data we want to predict.
# We want to predict the score of the students in college, hence y_training and y_test is the college average grade. 

X = df.drop(['COL_GRADE_AVG','GENDER','Unnamed: 0','CR_S11','CC_S11','ENG_S11','CR_PRO','CC_PRO','ENG_PRO'], axis=1).copy()
y=df['COL_GRADE_AVG'].copy()

# Now we will continue to formatting X to make it suitable for making a model with XGBoost. 

In [82]:
# One-hot encoding 

# Look at the different types of data contained in each variable. 
X.dtypes

# The object columns we need to inspect to ensure that they are what we need them to be and after that we will do one-hot encoding. 
# One hot encoding is used to make the categorical varoiable STRATUM work in the model. 
# What is gonna happen is that the categorical variable is becoming multiple columns of binary values. 
# One hot encoding works great for trees and this is the motivation for using this method. 
X_encoded = pd.get_dummies(X,columns=['STRATUM'])

In [83]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded,y,random_state=24, test_size=0.33)#, stratify=y)

In [84]:
# fra horse tut
# frames = [X_train, y_train]

train_data = pd.concat([X_train, y_train], axis=1)
train_data

Unnamed: 0,HI_GRADE_AVG,GENDER_bin,STRATUM_0,STRATUM_Stratum_1,STRATUM_Stratum_2,STRATUM_Stratum_3,STRATUM_Stratum_4,STRATUM_Stratum_5,STRATUM_Stratum_6,COL_GRADE_AVG
2988,49.333333,0,0,1,0,0,0,0,0,21.000000
5916,59.666667,1,0,0,0,1,0,0,0,64.000000
3435,59.333333,0,0,0,1,0,0,0,0,24.000000
6269,85.666667,0,0,1,0,0,0,0,0,98.333333
3964,43.666667,1,0,0,1,0,0,0,0,10.000000
...,...,...,...,...,...,...,...,...,...,...
5249,43.666667,0,0,1,0,0,0,0,0,9.333333
10385,67.666667,0,0,0,0,0,1,0,0,94.000000
3473,59.666667,0,0,1,0,0,0,0,0,69.333333
8535,54.333333,0,0,1,0,0,0,0,0,76.000000


# TODO 

From now on, we need to figure out how to train and test a model. We cannot follow any found toturial.

In [85]:
# Dette er fra statquest video, men det virker ikke helt. 
groups = train_data.groupby(train_data.index.values).size().to_frame('size')['size'].to_numpy()

clf_xgb = xgb.XGBRanker(objective="rank:map", missing=None, seed=24)
clf_xgb.fit(X_train,
            y_train,
            verbose=True,
            # early_stopping_rounds=10,
            group=groups,
            eval_metric='map')
            # eval_set=[(X_test,y_test)])

XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
          importance_type='gain', interaction_constraints='',
          learning_rate=0.300000012, max_delta_step=0, max_depth=6,
          min_child_weight=1, missing=None, monotone_constraints='()',
          n_estimators=100, n_jobs=4, num_parallel_tree=1, objective='rank:map',
          random_state=24, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
          seed=24, subsample=1, tree_method='exact', validate_parameters=1,
          verbosity=None)

DETTE SER UD TIL AT VIRKE; MEN VI SKAL FINDE UD AF HVORDAN VI KAN DISPLAY RESULTATERNE!

In [86]:
# Maybe useful

# should be in reverse order of relevance score
# print( y_train[gbm.predict_proba(X)[:, 1].argsort()][::-1])

In [87]:
# Dette er fra horse tut
# Groups created 
groups = train_data.groupby(train_data.index.values).size().to_frame('size')['size'].to_numpy()

model = xgb.XGBRanker(  
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:pairwise',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )

model.fit(X_train, y_train, group=groups, verbose=True)

XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=0.9, eta=0.05, gamma=0, gpu_id=0,
          importance_type='gain', interaction_constraints='', learning_rate=0.1,
          max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
          monotone_constraints='()', n_estimators=110, n_jobs=4,
          num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
          scale_pos_weight=None, subsample=0.75, tree_method='gpu_hist',
          validate_parameters=1, verbosity=None)

In [88]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['id'])])
  
predictions = (data.groupby('id')
               .apply(lambda x: predict(model, x)))

KeyError: 'id'

In [71]:
from sklearn.model_selection import GroupShuffleSplit


gss = GroupShuffleSplit(test_size=.40, n_splits=1, random_state = 7).split(X, groups=df.index.values)
X_train_inds, X_test_inds = next(gss)

train_data= X.iloc[X_train_inds]
X_train = train_data.loc[:, ~train_data.columns.isin(['STRATUM','GENDER_bin','HI_GRADE_AVG'])]
y_train = train_data.loc[:, train_data.columns.isin(['COL_GRADE_AVG'])]
y_train

1
3
6
7
10
...
12397
12404
12407
12408
12409


In [72]:
test_data= X.iloc[X_test_inds]
X_test = test_data.loc[:, ~test_data.columns.isin(['STRATUM','GENDER_bin','HI_GRADE_AVG'])]
y_test = test_data.loc[:, test_data.columns.isin(['COL_GRADE_AVG'])]



In [89]:

import xgboost as xgb

model = xgb.XGBRanker(  
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:pairwise',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )

model.fit(X_train, y_train, group=groups, verbose=True)

XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=0.9, eta=0.05, gamma=0, gpu_id=0,
          importance_type='gain', interaction_constraints='', learning_rate=0.1,
          max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
          monotone_constraints='()', n_estimators=110, n_jobs=4,
          num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
          scale_pos_weight=None, subsample=0.75, tree_method='gpu_hist',
          validate_parameters=1, verbosity=None)

In [90]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['STRATUM','GENDER_bin','HI_GRADE_AVG'])])
    
# X_train = train_data.loc[:, ~train_data.columns.isin(['STRATUM','GENDER_bin','HI_GRADE_AVG'])]

test_data

Unnamed: 0,STRATUM,HI_GRADE_AVG,GENDER_bin
0,Stratum_4,74.666667,1
2,Stratum_2,43.000000,0
4,Stratum_4,77.666667,0
5,Stratum_6,66.333333,1
8,Stratum_2,53.666667,0
...,...,...,...
12402,Stratum_2,62.333333,1
12403,Stratum_3,63.000000,1
12405,Stratum_2,69.000000,0
12406,Stratum_2,73.333333,0


In [91]:

data = test_data
predictions = (data.groupby('STRATUM')
               .apply(lambda x: predict(model, x)))

ValueError: Feature shape mismatch, expected: 9, got 7

In [76]:
# https://github.com/foxtrotmike/xgbrank/blob/master/xgbranker.py
   
import xgboost
from xgboost import XGBModel
from xgboost import DMatrix, train
import numpy as np
class XGBRanker(XGBModel):
    __doc__ = """Implementation of sklearn API for XGBoost Ranking
           """ + '\n'.join(XGBModel.__doc__.split('\n')[2:])
    
    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, 
                 silent=True, objective="rank:pairwise", booster='gbtree',
                 n_jobs=-1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0,
                 subsample=1, colsample_bytree=1, colsample_bylevel=1,
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                 base_score=0.5, random_state=0, seed=None, missing=None, **kwargs): 
        
        super(XGBRanker, self).__init__(max_depth, learning_rate,
                                        n_estimators, silent, objective, booster,
                                        n_jobs, nthread, gamma, min_child_weight, max_delta_step, 
                                        subsample, colsample_bytree, colsample_bylevel,
                                        reg_alpha, reg_lambda, scale_pos_weight,
                                        base_score, random_state, seed, missing)


    def fit(self, X, y, group=None, eval_metric=None, sample_weight=None,
            early_stopping_rounds=None, verbose=True):
        """
        Fit the gradient boosting model
        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        group : list, optional
            Group number list. All X and y will be taken as single group when group is not provided. All ranking is valid only in their own group.
        sample_weight : array_like
            instance weights
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for
            early-stopping
        eval_metric : str, callable, optional
            If a str, should be a built-in evaluation metric to use. See
            doc/parameter.md. If callable, a custom evaluation metric. The call
            signature is func(y_predicted, y_true) where y_true will be a
            DMatrix object such that you may need to call the get_label
            method. It must return a str, value pair where the str is a name
            for the evaluation and value is the value of the evaluation
            function. This objective is always minimized.
        early_stopping_rounds : int
            Activates early stopping. Validation error needs to decrease at
            least every <early_stopping_rounds> round(s) to continue training.
            Requires at least one item in evals.  If there's more than one,
            will use the last. Returns the model from the last iteration
            (not the best one). If early stopping occurs, the model will
            have three additional fields: bst.best_score, bst.best_iteration
            and bst.best_ntree_limit.
            (Use bst.best_ntree_limit to get the correct value if num_parallel_tree
            and/or num_class appears in the parameters)
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation
            metric measured on the validation set to stderr.
        xgb_model : str
            file name of stored xgb model or 'Booster' instance Xgb model to be
            loaded before training (allows training continuation).
        """
        if group is None:
            group = [X.shape[0]]
        else:
            idx = np.argsort(group)
            X = X[idx,:]
            y = y[idx]
            group = group[idx]
            unique, counts = np.unique(group, return_counts=True)
            group = counts[np.argsort(unique)]
        
        params = self.get_xgb_params()
 
        if callable(self.objective):
            obj = _objective_decorator(self.objective)
            # Use default value. Is it really not used ?
            xgb_options["objective"] = "rank:pairwise"
        else:
            obj = None
        
        evals_result = {}
        feval = eval_metric if callable(eval_metric) else None
        if eval_metric is not None:
            if callable(eval_metric):
                eval_metric = None
            else:
                params.update({'eval_metric': eval_metric})

        if sample_weight is not None:
            train_dmatrix = DMatrix(X, label=y, weight=sample_weight,
                                    missing=self.missing)
        else:
            train_dmatrix = DMatrix(X, label=y,
                                    missing=self.missing)
        train_dmatrix.set_group(group)
        
        self.objective = params["objective"]

        self._Booster = train(params, train_dmatrix, 
                              self.n_estimators,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, obj=obj, feval=feval,
                              verbose_eval=verbose,
                              xgb_model=None)

        
        if evals_result:
            for val in evals_result.items():
                evals_result_key = list(val[1].keys())[0]
                evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
            self.evals_result = evals_result

        if early_stopping_rounds is not None:
            self.best_score = self._Booster.best_score
            self.best_iteration = self._Booster.best_iteration
            self.best_ntree_limit = self._Booster.best_ntree_limit

        return self

    def predict(self, X, group=None, output_margin=False, ntree_limit=0):
        unsort = (group is not None)
        if group == None:
            group = [X.shape[0]]            
        else:
            idx = np.argsort(group)
            X = X[idx,:]
            group = group[idx]
            unique, counts = np.unique(group, return_counts=True)
            group = counts[np.argsort(unique)]
            
        test_dmatrix = DMatrix(X, missing=self.missing)
        test_dmatrix.set_group(group)
        rank_values = self.get_booster().predict(test_dmatrix,
                                                 output_margin=output_margin,
                                                 ntree_limit=ntree_limit)
        if unsort:
            rank_values=rank_values[np.argsort(idx)]
        return rank_values



In [78]:
XGBModel.fit(X_train, y_train, eval_metric=None, sample_weight=None,early_stopping_rounds=None, verbose=True)

TypeError: fit() missing 1 required positional argument: 'y'

In [52]:
import xgboost as xgb
from xgboost import DMatrix

group_train = []
with open(X_train, "r") as f:
    data = f.readlines()
    for line in data:
        group_train.append(int(line.split("\n")[0]))

group_valid = []
with open("mq2008.vali.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_valid.append(int(line.split("\n")[0]))

group_test = []
with open("mq2008.test.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_test.append(int(line.split("\n")[0]))

train_dmatrix = DMatrix(X_train, y_train)
valid_dmatrix = DMatrix(X_test, y_test)
test_dmatrix = DMatrix(x_test)

train_dmatrix.set_group(group_train)
valid_dmatrix.set_group(group_valid)

params = {'objective': 'rank:ndcg', 'eta': 0.1, 'gamma': 1.0,
          'min_child_weight': 0.1, 'max_depth': 6}
xgb_model = xgb.train(params, train_dmatrix, num_boost_round=4,
                      evals=[(valid_dmatrix, 'validation')])
pred = xgb_model.predict(test_dmatrix)

FileNotFoundError: [Errno 2] No such file or directory: 'mq2008.train.group'