# Data mining techniques: Assignment 2

## LambdaMART model training and evaluation

In [1]:
library("gbm")
library("xgboost")

“package ‘gbm’ was built under R version 3.4.4”
Loading required package: survival
Loading required package: lattice
Loading required package: splines
Loading required package: parallel
Loaded gbm 2.1.3


# Scoring and Ranking functions

In [111]:
#' Rank the rows of every search by descending prediction score.
#'
#' @param df Data frame that must contain the columns
#'   `srch_id` (the search IDs) and `prediction` (the score per row,
#'   higher score -> better rank). Other columns, e.g. `prop_id` (needed for
#'   the prediction file) and `relevance` (needed by `ndcg_score`), are
#'   carried along unchanged.
#' @return The rows of `df` sorted by ascending `srch_id` and, within each
#'   search, by descending `prediction`. Rows with equal scores keep their
#'   input order. Row names are reset to 1..n. An empty input yields an empty
#'   data frame with the same columns.
rank_prediction <- function(df) {
    # A single stable two-key sort gives the same ordering as sorting every
    # search separately and stacking the pieces, without the quadratic cost
    # of growing a data frame with rbind() inside a loop.
    # `$prediction` is kept for the score lookup so the column is resolved
    # exactly as it was before this rewrite.
    ranked <- df[order(df[, "srch_id"], -df$prediction), , drop = FALSE]

    rownames(ranked) <- NULL

    ranked
}

#' Average NDCG over a set of already ranked searches.
#'
#' The rows of each search are taken in the order in which they appear in
#' `df`, so `df` is expected to be ranked already (e.g. by
#' `rank_prediction`).
#'
#' @param df Data frame that must contain the columns
#'   `srch_id` (the search IDs) and `relevance` (the relevance score per row).
#' @return The mean of the per-search NDCG values. A search whose ideal DCG
#'   is zero (no relevant row at all) contributes 0 instead of the NaN that
#'   0 / 0 would produce.
ndcg_score <- function(df) {
    # Group by the actual search ID value. Filtering on the loop index
    # (1, 2, ...) only works when the IDs happen to be exactly 1..n.
    relevance_by_search <- split(df[, "relevance"], df[, "srch_id"],
                                 drop = TRUE)

    ndcg_scores <- vapply(relevance_by_search, function(rel) {
        # Position discount: log2(rank + 1) for rank = 1, 2, ...
        discount <- log2(seq_along(rel) + 1)

        dcg <- sum(rel / discount)
        # Ideal DCG: the same relevances in the best possible order.
        ideal_dcg <- sum(rel[order(rel, decreasing = TRUE)] / discount)

        if (isTRUE(ideal_dcg == 0)) 0 else dcg / ideal_dcg
    }, numeric(1))

    mean(ndcg_scores)
}

In [142]:
# Test implementation
#
# Two searches with six properties each. The first search is fixed; the
# second one uses random permutations, so its rows and the resulting score
# change between runs unless a seed is set before this cell is executed.

srch_id <- rep(c(1, 2), each = 6)
prop_id <- 1:12
prediction <- c(4, 3, 5, 6, 1, 2, sample(1:6, replace = FALSE))
relevance <- c(5, 1, 1, 0, 0, 0, sample(c(5, 1, 0, 0, 0, 0), replace = FALSE))

df <- data.frame(srch_id = srch_id, prop_id = prop_id,
                 prediction = prediction, relevance = relevance)

df

# Rank every search by prediction and score the resulting order.
ranking <- rank_prediction(df)

ranking

ndcg_score(ranking)

srch_id,prop_id,prediction,relevance
1,1,4,5
1,2,3,1
1,3,5,1
1,4,6,0
1,5,1,0
1,6,2,0
2,7,3,0
2,8,2,0
2,9,5,5
2,10,6,0


srch_id,prop_id,prediction,relevance
1,4,6,0
1,3,5,1
1,1,4,5
1,2,3,1
1,6,2,0
1,5,1,0
2,10,6,0
2,9,5,5
2,11,4,0
2,7,3,0


# Load the data

In [2]:
# Memory efficient column specific data frame loading functions
#' Build a per-column vector that is "NULL" everywhere except for the
#' selected columns.
#'
#' Presumably meant as the `colClasses` argument of `load_part` / `read.csv`,
#' where "NULL" skips a column - confirm against the callers.
#'
#' @param names Character vector with all column names, in file order.
#' @param selected Character vector with the names of the columns to keep.
#' @param df Lookup table indexed by row name: for a selected column `x` the
#'   value `df[x, 1]` is placed at the position(s) of `x` in `names`.
#' @return Character vector of the same length as `names`.
col_list <- function(names, selected, df) {
    # rep() also yields a correct zero-length result, unlike 1:length(names).
    res <- rep("NULL", length(names))

    wanted <- names %in% selected
    # as.character() keeps the stored text when the lookup column is a
    # factor; assigning a factor directly would write its integer codes.
    res[wanted] <- as.character(df[names[wanted], 1])

    res
}

#' Read a slice of a CSV file, optionally restricted to some columns.
#'
#' The column names are taken from the first line of the file; the data rows
#' are then read without a header so that `skip` and `nrows` can select any
#' window of the file.
#'
#' @param path Path of the CSV file to read.
#' @param nrows Maximum number of data rows to read (-1 reads all rows).
#' @param skip Number of lines to skip before the first row that is read.
#'   The default of 1 skips only the header line.
#' @param colClasses Column classes as accepted by `read.csv`; columns marked
#'   "NULL" are not loaded.
#' @return Data frame with the requested rows and columns, named after the
#'   file header. The strings "NA" and "NULL" are read as missing values.
load_part <- function(path, nrows = -1, skip = 1, colClasses = NA) {
    # For the header pass only the "NULL" (skip this column) markers matter:
    # every other entry is reset to NA so read.csv guesses the type.
    header_classes <- colClasses
    header_classes[is.na(colClasses) | colClasses != "NULL"] <- NA

    # Read from `path`, not from a global variable, so any file can be loaded.
    names <- colnames(read.csv(path, nrows = 1, colClasses = header_classes))
    df <- read.csv(path, header = FALSE, skip = skip, nrows = nrows,
                   colClasses = colClasses, na.strings = c("NA", "NULL"))

    colnames(df) <- names

    df
}

In [None]:
# Load the training, validation and test splits, each from its own CSV file.
trainPath <- "/home/kevin/data_mining/data/assignment2/undertrain.csv"
train <- read.csv(trainPath)

validPath <- "/home/kevin/data_mining/data/assignment2/undervalid.csv"
valid <- read.csv(validPath)

testPath <- "/home/kevin/data_mining/data/assignment2/undertest.csv"
test <- read.csv(testPath)

# Model training

## Train the lambdaMART using XGBoost

In [None]:
# Feature matrix for XGBoost: drop the search ID, the label (relevance) and
# the columns that are removed together with it, then convert the remaining
# columns to a numeric matrix.
# TODO: build the matching test matrix (dtest).
xgb.train <- data.matrix(
    subset(train.train,
           select = -c(srch_id, position, click_bool,
                       gross_bookings_usd, booking_bool, relevance))
)

head(xgb.train)

In [None]:
# Train the ranking model. The pairwise ranking objective compares rows that
# belong to the same query group, so XGBoost has to be told which rows form
# one search. Group sizes refer to consecutive rows, hence the rows are
# ordered by srch_id first and the run lengths of the sorted IDs are used as
# group sizes. local() keeps the helper objects out of the global workspace.
xgb.model <- local({
    by_search <- order(train.train$srch_id)

    dtrain <- xgb.DMatrix(xgb.train[by_search, , drop = FALSE],
                          label = train.train$relevance[by_search],
                          missing = NA)
    setinfo(dtrain, "group", rle(train.train$srch_id[by_search])$lengths)

    xgboost(booster = "gbtree", data = dtrain, nrounds = 100,
            objective = "rank:pairwise", eval_metric = "ndcg")
})

In [None]:
# Score the training rows with the XGBoost model.
predictions <- predict(xgb.model, xgb.train)

# Put the scores next to the identifiers and the true relevance.
predictions.df <- data.frame(
    srch_id = train.train$srch_id,
    prop_id = train.train$prop_id,
    predictions = predictions,
    relevance = train.train$relevance
)

# Inspect the rows of the search with ID 1.
predictions.df[predictions.df[["srch_id"]] == 1, ]

## Train lambdaMART using GBM

In [None]:
# Training frame for gbm: srch_id stays in because it identifies the query
# groups, relevance stays in because it is the response.
gbm.train <- subset(train.train,
                    select = -c(position, click_bool,
                                gross_bookings_usd, booking_bool))

head(gbm.train)

# In an R formula `a:b` is the interaction of a and b, not "all columns from
# a to b". The column range is therefore expanded explicitly into one term
# per column.
# NOTE(review): assumes the intent was to use every column from date_year
# through comp8_rate_percent_diff as a predictor - confirm.
gbm.formula <- local({
    cols <- colnames(gbm.train)
    first <- match("date_year", cols)
    last <- match("comp8_rate_percent_diff", cols)
    stopifnot(!is.na(first), !is.na(last))

    # The response and the group column must not be used as predictors.
    features <- setdiff(cols[first:last], c("relevance", "srch_id"))
    reformulate(features, response = "relevance")
})

gbm.model <- gbm(gbm.formula,
                data=gbm.train,
                distribution=list(   # loss function:
                  name='pairwise',   # pairwise
                  metric="ndcg",     # ranking metric:
                  group='srch_id'),    # column indicating query groups
                n.trees=1000,        # number of trees
                cv.folds = 2,
                keep.data=TRUE,      # store copy of input data in model
                verbose = FALSE,     # don't print progress
                n.cores = 2)         # number of cores used

In [None]:
# The model is fitted with cv.folds = 2 and without a train.fraction hold-out,
# so the cross-validation estimate is used to pick the number of trees;
# method = "test" needs a hold-out set that this fit does not have.
gbm.perf(gbm.model, method = "cv")

In [None]:
# Score the training rows with the GBM model, using all 1000 trees.
predictions <- predict(gbm.model, gbm.train, n.trees = 1000)

# Put the scores next to the identifiers and the true relevance.
predictions.df <- data.frame(
    srch_id = gbm.train$srch_id,
    prop_id = gbm.train$prop_id,
    predictions = predictions,
    relevance = gbm.train$relevance
)

# Inspect the rows of the search with ID 1.
predictions.df[predictions.df[["srch_id"]] == 1, ]

# Model evaluation