In [3]:
import pandas as pd

In [4]:
# XGBoost data prep
#remove white space in columns
#NOTE: GENDER_bin=1 is female and GENDER_bin=0 is males

df = pd.read_csv("df_sum_score_py.csv")
df.replace(' ','_',regex=True,inplace=True)

# there is 14 0's in STRATUM and since this is only approximately 0.116% of the entire data, we do not need to do anything. 
# XGBoost is very well at handling missing data, and we just need to ensure that it's set to 0. 
df.STRATUM.isin(['0']).sum()

14

In [5]:
# Splitting the data 
#We want to predict the score of the students in college. 

X = df.drop(['COL_GRADE_AVG','GENDER','Unnamed: 0','CR_S11','CC_S11','ENG_S11','CR_PRO','CC_PRO','ENG_PRO'], axis=1).copy()
y=df['COL_GRADE_AVG'].copy()

# X is the data which will be used to make predictions, and y contains the data we want to predict. 
# Now we will continue to formatting X to make it suitable for making a model with XGBoost. 

In [6]:
# One-hot encoding 
X.dtypes
# The object columns we need to inspect to ensure that they are what we need them to be and after that we will do one-hot encoding. 
# One hot encoding is used to make the categorical varoiable STRATUM work in the model. 
# What is gonna happen is that the categorical variable is becoming multiple columns of binary values. 
# One hot encoding works great for trees and this is the motivation for using this method. 
X_encoded = pd.get_dummies(X,columns=['STRATUM'])

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded,y,random_state=24, test_size=0.8)#, stratify=y)

In [8]:
# fra horse tut
# frames = [X_train, y_train]

train_data = pd.concat([X_train, y_train], axis=1)
train_data

Unnamed: 0,HI_GRADE_AVG,GENDER_bin,STRATUM_0,STRATUM_Stratum_1,STRATUM_Stratum_2,STRATUM_Stratum_3,STRATUM_Stratum_4,STRATUM_Stratum_5,STRATUM_Stratum_6,COL_GRADE_AVG
2988,49.333333,0,0,1,0,0,0,0,0,21.000000
5916,59.666667,1,0,0,0,1,0,0,0,64.000000
3435,59.333333,0,0,0,1,0,0,0,0,24.000000
6269,85.666667,0,0,1,0,0,0,0,0,98.333333
3964,43.666667,1,0,0,1,0,0,0,0,10.000000
...,...,...,...,...,...,...,...,...,...,...
5249,43.666667,0,0,1,0,0,0,0,0,9.333333
10385,67.666667,0,0,0,0,0,1,0,0,94.000000
3473,59.666667,0,0,1,0,0,0,0,0,69.333333
8535,54.333333,0,0,1,0,0,0,0,0,76.000000


In [88]:

groups = train_data.groupby(train_data.index.values).size().to_frame('size')['size'].to_numpy()
groups

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [90]:
import xgboost as xgb

model = xgb.XGBRanker(  
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:map',
    random_state=42,
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )

model.fit(X_train, y_train, group=groups,verbose=True)

XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=0.9, eta=0.05, gamma=0, gpu_id=0,
          importance_type='gain', interaction_constraints='', learning_rate=0.1,
          max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
          monotone_constraints='()', n_estimators=110, n_jobs=4,
          num_parallel_tree=1, objective='rank:map', random_state=42,
          reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=0.75,
          tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [91]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['id'])])
  
predictions = (data.groupby('id')
               .apply(lambda x: predict(model, x)))

NameError: name 'data' is not defined

In [53]:
from sklearn.model_selection import GroupShuffleSplit


gss = GroupShuffleSplit(test_size=.40, n_splits=1, random_state = 7).split(X, groups=df.index.values)
X_train_inds, X_test_inds = next(gss)

train_data= X.iloc[X_train_inds]
X_train = train_data.loc[:, ~train_data.columns.isin(['STRATUM','GENDER_bin','HI_GRADE_AVG'])]
y_train = train_data.loc[:, train_data.columns.isin(['COL_GRADE_AVG'])]
y_train

1
3
6
7
10
...
12397
12404
12407
12408
12409


In [50]:

test_data= X.iloc[X_test_inds]
test_data
# X_train = train_data.loc[:, ~train_data.columns.isin(['id','rank'])]
# y_train = train_data.loc[:, train_data.columns.isin(['rank'])]


Unnamed: 0,STRATUM,HI_GRADE_AVG,GENDER_bin
0,Stratum_4,74.666667,1
2,Stratum_2,43.000000,0
4,Stratum_4,77.666667,0
5,Stratum_6,66.333333,1
8,Stratum_2,53.666667,0
...,...,...,...
12402,Stratum_2,62.333333,1
12403,Stratum_3,63.000000,1
12405,Stratum_2,69.000000,0
12406,Stratum_2,73.333333,0


In [None]:

X_train_inds, X_test_inds = next(gss)

train_data= df.iloc[X_train_inds]
X_train = train_data.loc[:, ~train_data.columns.isin(['id','rank'])]
y_train = train_data.loc[:, train_data.columns.isin(['rank'])]

groups = train_data.groupby('id').size().to_frame('size')['size'].to_numpy()

test_data= df.iloc[X_test_inds]

#We need to keep the id for later predictions
X_test = test_data.loc[:, ~test_data.columns.isin(['rank'])]
y_test = test_data.loc[:, test_data.columns.isin(['rank'])]

Umiddelbart det som skal ske her er at vi skal bare fordele data i træning og test. Dette tror jeg hellere at jeg vil gøre selv i stedet for hans kode. 

In [3]:

import xgboost as xgb

model = xgb.XGBRanker(  
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:pairwise',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )

model.fit(X_train, y_train, group=groups, verbose=True)

ModuleNotFoundError: No module named 'xgboost'

In [4]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['id'])])
  
predictions = (data.groupby('id')
               .apply(lambda x: predict(model, x)))

NameError: name 'data' is not defined