In [30]:
library(knitr)
library(rmdformats)
library(data.table)
library(magrittr)
library(lubridate)
library(ggplot2)
library(cowplot)
library(repr)
library(kableExtra)
library(IRdisplay)
print("All packages loaded successfully")

[1] "All packages loaded successfully"


In [31]:
simulate_central_scenario <- function(seed = 1234){
  
  #seed = 1234  
  set.seed(seed)
  
  # Policy data
  #~~~~~~~~~~~~~~~~~
  
  # polices sold between start 2016 to end 2017
  dt_policydates <- data.table(date_UW = seq(as.Date("2016/1/1"), as.Date("2017/12/31"), "day"))
  
  # number of policies per day follows Poisson process with mean 700 (approx 255,500 pols per annum)
  dt_policydates[, ':='(policycount = rpois(.N,700),
                        date_lapse = date_UW %m+% years(1),
                        expodays = as.integer(date_UW %m+% years(1) - date_UW),
                        pol_prefix = year(date_UW)*10000 + month(date_UW)*100 + mday(date_UW))]
  
  
  # Add columns defining Policy Covers   
  dt_policydates[, Cover_B := round(policycount * 0.25)]
  dt_policydates[, Cover_BO := round(policycount * 0.45)]
  dt_policydates[, Cover_BOT := policycount - Cover_B - Cover_BO]
  
  
  # repeat rows for each policy by UW-Date
  dt_policy <- dt_policydates[rep(1:.N, policycount),c("date_UW", "pol_prefix"), with = FALSE][,pol_seq:=1:.N, by=pol_prefix]
  
  # Create a unique policy number 
  dt_policy[, pol_number := as.character(pol_prefix * 10000 + pol_seq)]
  
  # set join keys
  setkey(dt_policy,'date_UW')
  setkey(dt_policydates,'date_UW')  
  
  # remove pol_prefix before join
  dt_policydates[, pol_prefix := NULL]  
  
  # join cover from summary file (dt_policydates)
  dt_policy <- dt_policy[dt_policydates]  
  
  # now create Cover field for each policy row
  dt_policy[,Cover := 'BO']
  dt_policy[pol_seq <= policycount- Cover_BO,Cover := 'BOT']
  dt_policy[pol_seq <= Cover_B,Cover := 'B']  
  
  # remove interim calculation fields
  dt_policy[, ':='(pol_prefix = NULL,
                   policycount = NULL,
                   pol_seq = NULL,
                   Cover_B = NULL,
                   Cover_BOT = NULL,
                   Cover_BO = NULL)]
  
  # Add remaining policy details
  dt_policy[, Brand := rep(rep(c(1,2,3,4), c(9,6,3,2)), length.out = .N)]
  dt_policy[, Base_Price := rep(rep(c(600,550,300,150), c(9,6,3,2)), length.out = .N)]
  
  # models types and model cost multipliers
  for (eachBrand in unique(dt_policy$Brand)) {
    dt_policy[Brand == eachBrand, Model := rep(rep(c(3,2,1,0), c(10, 7, 2, 1)), length.out = .N)]
    dt_policy[Brand == eachBrand, Model_mult := rep(rep(c(1.15^3, 1.15^2, 1.15^1, 1.15^0), c(10, 7, 2, 1)), length.out = .N)]
  }
  
  dt_policy[, Price := ceiling (Base_Price * Model_mult)]
  
  
  # colums to keep
  cols_policy <- c("pol_number",
                   "date_UW",
                   "date_lapse",
                   "Cover",
                   "Brand",
                   "Model",
                   "Price")
  
  dt_policy <- dt_policy[, cols_policy, with = FALSE]
  
  # check output
  head(dt_policy)
  
  #save(dt_policy, file = "./dt_policy.rda")
  
  
  # Claims data
  #~~~~~~~~~~~~~~~~~
  
 # All policies have breakage cover
  # claims uniformly sampled from policies
  claim <- sample(nrow(dt_policy), size = floor(nrow(dt_policy) * 0.15))
  
  # Claim serverity multiplier sampled from beta distn
  dt_claim <- data.table(pol_number = dt_policy[claim, pol_number],
                         claim_type = 'B',
                         claim_count = 1,
                         claim_sev = rbeta(length(claim), 2,5))
  
  # identify all policies with Oxidation cover
  cov <- which(dt_policy$Cover != 'B')
  
  # sample claims from policies with cover
  claim <- sample(cov, size = floor(length(cov) * 0.05))
  
  # add claims to table 
  dt_claim <- rbind(dt_claim,
                    data.table(pol_number = dt_policy[claim, pol_number],
                               claim_type = 'O',
                               claim_count = 1,
                               claim_sev = rbeta(length(claim), 5,3)))
  
  
  # identify all policies with Theft cover
  # for Theft claim frequency varies by Brand
  # So need to consider each in turn...
  
  for(myModel in 0:3) {
    
    cov <- which(dt_policy$Cover == 'BOT' & dt_policy$Model == myModel)
    claim <- sample(cov, size = floor(length(cov) * 0.05*(1 + myModel)))
    
    dt_claim <- rbind(dt_claim,
                      data.table(pol_number = dt_policy[claim, pol_number],
                                 claim_type = 'T',
                                 claim_count = 1,
                                 claim_sev = rbeta(length(claim), 5,.5)))
  }
  
  # set join keys
  setkey(dt_policy, pol_number)
  setkey(dt_claim, pol_number)
  
  #join Brand and Price from policy to claim
  dt_claim[dt_policy,
           on = 'pol_number',
           ':='(date_UW = i.date_UW,
                Price = i.Price,
                Brand = i.Brand)]
  
  # use lubridate %m+% date addition operator 
  dt_claim[, date_lapse := date_UW %m+% years(1)]
  dt_claim[, expodays := as.integer(date_lapse - date_UW)]
  dt_claim[, occ_delay_days := floor(expodays * runif(.N, 0,1))]
  
  dt_claim[ ,delay_report := floor(365 * rbeta(.N, .4, 10))]  
  dt_claim[ ,delay_pay := floor(10 + 40* rbeta(.N, 7,7))]  
  
  dt_claim[, date_occur := date_UW %m+% days(occ_delay_days)]
  dt_claim[, date_report := date_occur %m+% days(delay_report)]
  dt_claim[, date_pay := date_report %m+% days(delay_pay)]
  
  dt_claim[, claim_cost := round(Price * claim_sev)]
  
  dt_claim[, clm_prefix := year(date_report)*10000 + month(date_report)*100 + mday(date_report)]
  
  dt_claim[, clm_seq := seq_len(.N), by = clm_prefix]
  dt_claim[, clm_number := as.character(clm_prefix * 10000 + clm_seq)]
  
  # colums to keep
  cols_claim <- c("clm_number",
                  "pol_number",
                  "claim_type",
                  "claim_count",
                  "claim_sev",
                  "date_occur",
                  "date_report",
                  "date_pay",
                  "claim_cost")
  
  dt_claim <- dt_claim[, cols_claim, with = FALSE]
  
 output <- list()
 output$dt_policy <- dt_policy
 output$dt_claim <- dt_claim

 return(output)

  
  
}

In [32]:

dt_PhoneData <- simulate_central_scenario(1234)

In [33]:

dt_policy <- dt_PhoneData$dt_policy

kable(head(dt_policy), "html") %>% 
  kable_styling("striped") %>% 
  scroll_box(width = "100%") %>%
  as.character() %>%
  display_html()
     

pol_number,date_UW,date_lapse,Cover,Brand,Model,Price
201601010001,2016-01-01,2017-01-01,B,1,3,913
201601010002,2016-01-01,2017-01-01,B,1,3,913
201601010003,2016-01-01,2017-01-01,B,1,3,913
201601010004,2016-01-01,2017-01-01,B,1,3,913
201601010005,2016-01-01,2017-01-01,B,1,3,913
201601010006,2016-01-01,2017-01-01,B,1,3,913


In [34]:
dt_claim <- dt_PhoneData$dt_claim

kable(head(dt_claim), "html") %>% 
  kable_styling(c("striped", "hover", "condensed")) %>% 
  scroll_box(width = "100%") %>%
  as.character() %>%
  display_html()
     

clm_number,pol_number,claim_type,claim_count,claim_sev,date_occur,date_report,date_pay,claim_cost
201606080001,201601010001,B,1,0.3337923,2016-06-08,2016-06-08,2016-07-21,305
201609150001,201601010014,B,1,0.3692034,2016-09-15,2016-09-15,2016-10-17,309
201609090001,201601010025,B,1,0.4496012,2016-09-09,2016-09-09,2016-10-07,357
201602190001,201601010027,B,1,0.4019731,2016-01-25,2016-02-19,2016-03-21,319
201605140001,201601010043,B,1,0.2146653,2016-05-14,2016-05-14,2016-06-15,196
201612110001,201601010045,B,1,0.2783313,2016-12-11,2016-12-11,2017-01-06,254


In [35]:
setnames(dt_policy, c('date_UW', 'date_lapse'), c('date_pol_start', 'date_pol_end'))
  
# set policy start and end dates in foverlap friendly format
dt_policy[, date_pol_start:= floor_date(date_pol_start, unit= "second")]
dt_policy[, date_pol_end:= floor_date(date_pol_end, unit= "second") - 1]
  
# create a dummy end claim occurrence date for foverlap
dt_claim[, date_occur:= floor_date(date_occur, unit= "second")]
dt_claim[, date_occur_end:= date_occur]
dt_claim[, date_report:= floor_date(date_report, unit= "second")]
dt_claim[, date_pay:= floor_date(date_pay, unit= "second")]
  
# set keys for claim join (by policy and dates)
setkey(dt_claim, pol_number, date_occur, date_occur_end)
setkey(dt_policy, pol_number, date_pol_start, date_pol_end)
  
# use foverlaps to attach claim to right occurrence period and policy
dt_polclaim <- foverlaps(dt_policy, dt_claim, type="any") ## return overlap indices
dt_polclaim[, date_occur_end := NULL]

In [36]:

head(dt_polclaim)

pol_number,clm_number,claim_type,claim_count,claim_sev,date_occur,date_report,date_pay,claim_cost,date_pol_start,date_pol_end,Cover,Brand,Model,Price
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dttm>,<dttm>,<dttm>,<dbl>,<dttm>,<dttm>,<chr>,<dbl>,<dbl>,<dbl>
201601010001,201606080001.0,B,1.0,0.3337923,2016-06-08,2016-06-08,2016-07-21,305.0,2016-01-01,2016-12-31 23:59:59,B,1,3,913
201601010002,,,,,,,,,2016-01-01,2016-12-31 23:59:59,B,1,3,913
201601010003,,,,,,,,,2016-01-01,2016-12-31 23:59:59,B,1,3,913
201601010004,,,,,,,,,2016-01-01,2016-12-31 23:59:59,B,1,3,913
201601010005,,,,,,,,,2016-01-01,2016-12-31 23:59:59,B,1,3,913
201601010006,,,,,,,,,2016-01-01,2016-12-31 23:59:59,B,1,3,913


In [37]:


setkey(dt_polclaim, pol_number, date_pol_start)
  
# create 2 new cols that count how many claims against each policy
dt_polclaim[,
            ':='(pol_seq = seq_len(.N),
                 pol_seq_max = .N),
            by = c('pol_number', 'date_pol_start') ]
  
table(dt_polclaim[, pol_seq_max])


     1      2      3 
504688  14750    549 

In [38]:
#set NA dates to 31/12/2999
lst_datefields <- grep(names(dt_polclaim),pattern = "date", value = TRUE)
  
for (datefield in lst_datefields)
  set(dt_polclaim,which(is.na(dt_polclaim[[datefield]])),datefield,as_datetime("2199-12-31 23:59:59 UTC"))
 
#set other NAs to zero (claim counts and costs)
for (field in c("claim_count", "claim_sev", "claim_cost"))
  set(dt_polclaim,which(is.na(dt_polclaim[[field]])),field,0)

In [39]:
#Clean up ----
dt_polclaim[, ExpoDays:= ceiling((as.numeric(date_pol_end) - as.numeric(date_pol_start))/(24*60*60*365))]
dt_polclaim <- dt_polclaim[ExpoDays > 0]
  
rm(dt_claim)
rm(dt_policy)
  
gc()

     

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,1537255,82.1,3359594,179.5,3359594,179.5
Vcells,17362398,132.5,62957724,480.4,78546699,599.3


In [40]:

head(dt_polclaim)

pol_number,clm_number,claim_type,claim_count,claim_sev,date_occur,date_report,date_pay,claim_cost,date_pol_start,date_pol_end,Cover,Brand,Model,Price,pol_seq,pol_seq_max,ExpoDays
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dttm>,<dttm>,<dttm>,<dbl>,<dttm>,<dttm>,<chr>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>
201601010001,201606080001.0,B,1,0.3337923,2016-06-08 00:00:00,2016-06-08 00:00:00,2016-07-21 00:00:00,305,2016-01-01,2016-12-31 23:59:59,B,1,3,913,1,1,2
201601010002,,,0,0.0,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,0,2016-01-01,2016-12-31 23:59:59,B,1,3,913,1,1,2
201601010003,,,0,0.0,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,0,2016-01-01,2016-12-31 23:59:59,B,1,3,913,1,1,2
201601010004,,,0,0.0,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,0,2016-01-01,2016-12-31 23:59:59,B,1,3,913,1,1,2
201601010005,,,0,0.0,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,0,2016-01-01,2016-12-31 23:59:59,B,1,3,913,1,1,2
201601010006,,,0,0.0,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,0,2016-01-01,2016-12-31 23:59:59,B,1,3,913,1,1,2


In [41]:

lst_Date_slice <- floor_date(seq(as.Date("2016/1/1"), as.Date("2019/06/30"), by = 30), unit= "second") 

# Time slice Policy & claims 
 
for (i in 1:length(lst_Date_slice)){
  dt_polclaim[date_pay<= lst_Date_slice[i], paste0('P_t_', format(lst_Date_slice[i], "%Y%m%d")):= claim_cost]
  set(dt_polclaim,which(is.na(dt_polclaim[[paste0('P_t_', format(lst_Date_slice[i], "%Y%m%d"))]])),paste0('P_t_', format(lst_Date_slice[i], "%Y%m%d")),0)
}
  
# sort data by policynumber
setkey(dt_polclaim, pol_number)

In [42]:

head(dt_polclaim)

pol_number,clm_number,claim_type,claim_count,claim_sev,date_occur,date_report,date_pay,claim_cost,date_pol_start,...,P_t_20180917,P_t_20181017,P_t_20181116,P_t_20181216,P_t_20190115,P_t_20190214,P_t_20190316,P_t_20190415,P_t_20190515,P_t_20190614
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dttm>,<dttm>,<dttm>,<dbl>,<dttm>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
201601010001,201606080001.0,B,1,0.3337923,2016-06-08 00:00:00,2016-06-08 00:00:00,2016-07-21 00:00:00,305,2016-01-01,...,305,305,305,305,305,305,305,305,305,305
201601010002,,,0,0.0,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,0,2016-01-01,...,0,0,0,0,0,0,0,0,0,0
201601010003,,,0,0.0,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,0,2016-01-01,...,0,0,0,0,0,0,0,0,0,0
201601010004,,,0,0.0,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,0,2016-01-01,...,0,0,0,0,0,0,0,0,0,0
201601010005,,,0,0.0,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,0,2016-01-01,...,0,0,0,0,0,0,0,0,0,0
201601010006,,,0,0.0,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,0,2016-01-01,...,0,0,0,0,0,0,0,0,0,0


In [43]:

#_ 2.1 Set initial variables ----
#~~~~~~~~~~~~~~~~~~~~~~~
    
i <- valuation <- 10
t_i <- lst_Date_slice[i] 
delta <- min(i, length(lst_Date_slice) - i + 1)
     

In [44]:

RBNS_Train_ijk <- function(dt_policy_claim, date_i, j_dev_period, k, reserving_dates, model_vars) {
  
  # # debugging
  # #~~~~~~~~~~~~~
  # dt_policy_claim = dt_polclaim
  # date_i = t_i
  # j_dev_period = 1
  # k = 1
  # reserving_dates = lst_Date_slice
  # #~~~~~~~~~~~~~
  #   
  
  date_i <- as.Date(date_i)
  date_k <- (reserving_dates[which(reserving_dates == date_i) - k + 1])
  date_j <- (reserving_dates[which(reserving_dates == date_k) - j_dev_period])
  
  #i - j - k + 1 (predictor as at date)
  date_lookup <- (reserving_dates[which(reserving_dates == (date_i)) - j_dev_period -k + 1]) 
  
  #i - k to calculate target incremental paid
  target_lookup <- (reserving_dates[which(reserving_dates == (date_i)) - k]) 
  
  #i -k + 1 to calculate target incremental paid
  target_lookup_next <- (reserving_dates[which(reserving_dates == (date_i)) - k + 1]) 
  
  #definition of reported but not settled
  dt_policy_claim <- dt_policy_claim[(date_report <= date_lookup) & (date_pay > date_lookup)] 
  
  #simulated data assumes one payment so just need to check date paid in target calc
  dt_policy_claim[, ':='(date_lookup = date_lookup,
                         delay_train = as.numeric(date_lookup - date_pol_start), #extra feature
                         j = j_dev_period,
                         k = k,
                         target = ifelse(date_pay<=target_lookup,0,ifelse(date_pay<=target_lookup_next,claim_cost,0)))]
  
  return(dt_policy_claim[, model_vars, with = FALSE])
}


In [45]:
RBNS_Test_ijk <- function(dt_policy_claim, date_i,j_dev_period, k, reserving_dates, model_vars) {
  
  # # debugging
  # #~~~~~~~~~~~~~
  # dt_policy_claim = dt_polclaim
  # date_i = t_i
  # j_dev_period = 1
  # k = 1
  # reserving_dates = lst_Date_slice
  # #~~~~~~~~~~~~~
  #   
  date_i <- as.Date(date_i)
  
  #i - j - k + 1 (predictor as at date)
  date_lookup <- (reserving_dates[which(reserving_dates == (date_i))]) 
  
  #i - k to calculate target incremental paid
  target_lookup <- (reserving_dates[which(reserving_dates == (date_i)) +j_dev_period - 1]) 
  
  #i -k + 1 to calculate target incremental paid  
  target_lookup_next <- (reserving_dates[which(reserving_dates == (date_i)) + j_dev_period]) 
  
  #definition of reported but not settled
  # P_te_RBNS rowids of policies needing an RBNS reserve
  dt_policy_claim <- dt_policy_claim[date_report <= date_lookup & date_lookup < date_pay] 
  
  #model assumes one payment so just need to check date paid
  dt_policy_claim[, ':='(date_lookup = date_lookup,
                         delay_train = as.numeric(date_lookup - date_pol_start), #extra feature
                         j = j_dev_period,
                         k = k,
                         target = ifelse(date_pay<=target_lookup,0,ifelse(date_pay<=target_lookup_next,claim_cost,0)))] 
  
return(dt_policy_claim[, model_vars, with = FALSE])
  
}


In [46]:

RBNS_Train <- function(dt_policy_claim, date_i, i, k, reserving_dates, model_vars) {
# Create a combined TRAIN dataset across all k and j combos
  for (k in 1:k){
    if (k==1) dt_train <- NULL
    for (j in 1:(i - k + 1)){
      dt_train <- rbind(dt_train, RBNS_Train_ijk(dt_polclaim, date_i, j, k,reserving_dates, model_vars))
    }
  }  
  return(dt_train)
}

In [47]:


RBNS_Test <- function(dt_policy_claim, date_i, delta, k, reserving_dates, model_vars) {
  
  # Create a combined TEST dataset across all k and j combos
  for (k in 1:k){
    if (k==1) dt_test <- NULL
    for (j in 1:(delta - k + 1)){
      dt_test <- rbind(dt_test, RBNS_Test_ijk(dt_polclaim, date_i, j, k,reserving_dates, model_vars))
    }
  }
  
  return(dt_test)
}


In [48]:
#define modelVars
RBNS_model_vars <- c("clm_number",
                     "pol_number",
                     "j",
                     "k",
                     "date_pol_start",
                     "date_occur",
                     "date_report",
                     "date_pay",
                     "Cover",
                     "claim_type",
                     "Brand",
                     "Model",
                     "Price",
                     "target"
    )


# Create a combined TRAIN dataset for k = 1 and all valid j delay values
dt_RBNS_train <- RBNS_Train(dt_polclaim, t_i, i, k = 1, lst_Date_slice, RBNS_model_vars)

# Create a combined TEST dataset for k = 1 and all valid j delay values
dt_RBNS_test <- RBNS_Test(dt_polclaim, t_i, delta, k = 1, lst_Date_slice, RBNS_model_vars)


In [49]:
head(dt_RBNS_train)
head(dt_RBNS_test)


clm_number,pol_number,j,k,date_pol_start,date_occur,date_report,date_pay,Cover,claim_type,Brand,Model,Price,target
<chr>,<chr>,<int>,<int>,<dttm>,<dttm>,<dttm>,<dttm>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
201608040001,201601010103,1,1,2016-01-01,2016-07-15,2016-08-04,2016-09-02,B,B,1,3,913,201
201608110001,201601010130,1,1,2016-01-01,2016-05-11,2016-08-11,2016-09-14,B,B,2,2,728,448
201608170001,201601010141,1,1,2016-01-01,2016-08-02,2016-08-17,2016-09-16,B,B,1,3,913,15
201608280001,201601010145,1,1,2016-01-01,2016-08-28,2016-08-28,2016-09-24,B,B,1,3,913,437
201608190001,201601010190,1,1,2016-01-01,2016-08-19,2016-08-19,2016-09-21,BOT,T,2,2,728,725
201608130001,201601010194,1,1,2016-01-01,2016-06-23,2016-08-13,2016-09-11,BOT,B,2,1,633,118


clm_number,pol_number,j,k,date_pol_start,date_occur,date_report,date_pay,Cover,claim_type,Brand,Model,Price,target
<chr>,<chr>,<int>,<int>,<dttm>,<dttm>,<dttm>,<dttm>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
201609150001,201601010014,1,1,2016-01-01,2016-09-15,2016-09-15,2016-10-17,B,B,2,3,837,309
201609090001,201601010025,1,1,2016-01-01,2016-09-09,2016-09-09,2016-10-07,B,B,1,2,794,357
201609220001,201601010212,1,1,2016-01-01,2016-09-02,2016-09-22,2016-10-20,BOT,T,2,3,837,837
201609020001,201601010261,1,1,2016-01-01,2016-06-06,2016-09-02,2016-10-01,BOT,T,1,1,690,675
201609060001,201601010284,1,1,2016-01-01,2016-08-25,2016-09-06,2016-10-09,BOT,T,1,3,913,908
201609200001,201601010327,1,1,2016-01-01,2016-09-20,2016-09-20,2016-10-26,BOT,T,1,2,794,666


In [50]:


# Add a flag to determine which rows are from the trainset and which from the test set
dt_RBNS_train[, flgTrain := 1]
dt_RBNS_test[, flgTrain := 0]

# combine into a single RBNS dataset   
dt_All_RBNS <- rbind(dt_RBNS_train, dt_RBNS_test)
#write.csv(dt_All_RBNS,"dt_All_RBNS.csv", row.names = F)
    


In [52]:
head(dt_RBNS_train)
head(dt_RBNS_test)


clm_number,pol_number,j,k,date_pol_start,date_occur,date_report,date_pay,Cover,claim_type,Brand,Model,Price,target,flgTrain
<chr>,<chr>,<int>,<int>,<dttm>,<dttm>,<dttm>,<dttm>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
201608040001,201601010103,1,1,2016-01-01,2016-07-15,2016-08-04,2016-09-02,B,B,1,3,913,201,1
201608110001,201601010130,1,1,2016-01-01,2016-05-11,2016-08-11,2016-09-14,B,B,2,2,728,448,1
201608170001,201601010141,1,1,2016-01-01,2016-08-02,2016-08-17,2016-09-16,B,B,1,3,913,15,1
201608280001,201601010145,1,1,2016-01-01,2016-08-28,2016-08-28,2016-09-24,B,B,1,3,913,437,1
201608190001,201601010190,1,1,2016-01-01,2016-08-19,2016-08-19,2016-09-21,BOT,T,2,2,728,725,1
201608130001,201601010194,1,1,2016-01-01,2016-06-23,2016-08-13,2016-09-11,BOT,B,2,1,633,118,1


clm_number,pol_number,j,k,date_pol_start,date_occur,date_report,date_pay,Cover,claim_type,Brand,Model,Price,target,flgTrain
<chr>,<chr>,<int>,<int>,<dttm>,<dttm>,<dttm>,<dttm>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
201609150001,201601010014,1,1,2016-01-01,2016-09-15,2016-09-15,2016-10-17,B,B,2,3,837,309,0
201609090001,201601010025,1,1,2016-01-01,2016-09-09,2016-09-09,2016-10-07,B,B,1,2,794,357,0
201609220001,201601010212,1,1,2016-01-01,2016-09-02,2016-09-22,2016-10-20,BOT,T,2,3,837,837,0
201609020001,201601010261,1,1,2016-01-01,2016-06-06,2016-09-02,2016-10-01,BOT,T,1,1,690,675,0
201609060001,201601010284,1,1,2016-01-01,2016-08-25,2016-09-06,2016-10-09,BOT,T,1,3,913,908,0
201609200001,201601010327,1,1,2016-01-01,2016-09-20,2016-09-20,2016-10-26,BOT,T,1,2,794,666,0


In [53]:
write.csv(dt_RBNS_train, "dt_RBNS_train.csv", row.names = FALSE)
write.csv(dt_RBNS_test, "dt_RBNS_test.csv", row.names = FALSE)


In [51]:

head(dt_All_RBNS)

clm_number,pol_number,j,k,date_pol_start,date_occur,date_report,date_pay,Cover,claim_type,Brand,Model,Price,target,flgTrain
<chr>,<chr>,<int>,<int>,<dttm>,<dttm>,<dttm>,<dttm>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
201608040001,201601010103,1,1,2016-01-01,2016-07-15,2016-08-04,2016-09-02,B,B,1,3,913,201,1
201608110001,201601010130,1,1,2016-01-01,2016-05-11,2016-08-11,2016-09-14,B,B,2,2,728,448,1
201608170001,201601010141,1,1,2016-01-01,2016-08-02,2016-08-17,2016-09-16,B,B,1,3,913,15,1
201608280001,201601010145,1,1,2016-01-01,2016-08-28,2016-08-28,2016-09-24,B,B,1,3,913,437,1
201608190001,201601010190,1,1,2016-01-01,2016-08-19,2016-08-19,2016-09-21,BOT,T,2,2,728,725,1
201608130001,201601010194,1,1,2016-01-01,2016-06-23,2016-08-13,2016-09-11,BOT,B,2,1,633,118,1


In [54]:
IBNR_Freq_Train_ijk <- function(dt_policy_claim, date_i, j_dev_period, k, reserving_dates, model_vars, verbose = FALSE) {
  
  # # debugging
  # #~~~~~~~~~~~~~
  # dt_policy_claim = dt_polclaim
  # date_i = t_i
  # j_dev_period = 1
  # k = 1
  # reserving_dates = lst_Date_slice
  # model_vars <- IBNR_model_vars
  # #~~~~~~~~~~~~~
  
  date_i <- as.Date(date_i)
  date_k <- (reserving_dates[which(reserving_dates == date_i) - k + 1])
  date_j <- (reserving_dates[which(reserving_dates == date_k) - j_dev_period])
  date_lookup <- (reserving_dates[which(reserving_dates == (date_i)) - j_dev_period -k + 1]) #i - j - k + 1 (predictor as at date)
  target_lookup <- (reserving_dates[which(reserving_dates == (date_i)) - k]) #i - k to calculate target incremental paid
  target_lookup_next <- (reserving_dates[which(reserving_dates == (date_i)) - k + 1]) #i -k + 1 to calculate targte incremental paid
  
  if(verbose) cat(paste("Valn date", date_i, ", j = ", j_dev_period, ", k =", k, "\n"))
  
  dt_policy_claim <- dt_policy_claim[date_pol_start < date_lookup  & date_lookup < date_report] #definition of IBNR
  
  dt_policy_claim[, ':='(date_lookup = date_lookup,
                         delay_train = as.numeric(date_lookup - date_pol_start), #extra feature
                         j = j_dev_period,
                         k = k,
                         exposure = round((pmin(as.numeric(as.numeric(date_pol_end)), as.numeric(floor_date(date_i, unit= "second")))
                                             - as.numeric(date_pol_start))/(24*60*60*365), 3),
                         target = ifelse(target_lookup <= date_pay &  date_pay< target_lookup_next & date_occur <= date_lookup ,1,0))]
  
  dt_policy_claim <- dt_policy_claim [,.(exposure = sum(exposure)), by= c(setdiff(model_vars, 'exposure')) ]
  
  return(dt_policy_claim[, model_vars, with = FALSE])
  
}

In [55]:

IBNR_Loss_Train_ijk <- function(dt_policy_claim, date_i, j_dev_period, k, reserving_dates, model_vars, verbose = FALSE) {
  
  
  # # debugging
  # #~~~~~~~~~~~~~
  # dt_policy_claim = dt_polclaim
  # date_i = t_i
  # j_dev_period = 1
  # k = 1
  # reserving_dates = lst_Date_slice
  # model_vars <- IBNR_model_vars
  # #~~~~~~~~~~~~~
  
  date_i <- as.Date(date_i)
  date_k <- (reserving_dates[which(reserving_dates == date_i) - k + 1])
  date_j <- (reserving_dates[which(reserving_dates == date_k) - j_dev_period])
  date_lookup <- (reserving_dates[which(reserving_dates == (date_i)) - j_dev_period -k + 1]) #i - j - k + 1 (predictor as at date)
  target_lookup <- (reserving_dates[which(reserving_dates == (date_i)) - k]) #i - k to calculate target incremental paid
  target_lookup_next <- (reserving_dates[which(reserving_dates == (date_i)) - k + 1]) #i -k + 1 to calculate targte incremental paid
  
  if(verbose) cat(paste("Valn date", date_i, ", j = ", j_dev_period, ", k =", k, "\n"))
  
  dt_policy_claim <- dt_policy_claim[(date_lookup < date_report) & (date_occur < date_lookup) & (target_lookup >= date_pay  & date_pay < target_lookup_next)] #definition of reported but not settled
  dt_policy_claim[, ':='(date_lookup = date_lookup,
                         delay_train = as.numeric(date_lookup - date_pol_start), #extra feature
                         j = j_dev_period,
                         k = k,
                         exposure = 1, #all claims trated equal
                         
                         target = ifelse(target_lookup >= date_pay & date_pay < target_lookup_next,claim_cost,0) #model assumes one payment so just need to check date paid
                         
  )]
  
  return(dt_policy_claim[, model_vars, with = FALSE])
}


In [56]:
IBNR_Test_ijk <- function(dt_policy_claim, date_i,j_dev_period, k, reserving_dates, model_vars, verbose = FALSE) {
  
  ## debugging
  ##~~~~~~~~~~~~~
  #dt_policy_claim = dt_polclaim
  #date_i = t_i
  #j_dev_period = 8
  #k = 1
  #reserving_dates = lst_Date_slice
  #model_vars <- IBNR_model_vars
  ##~~~~~~~~~~~~~
  
  date_i <- as.Date(date_i)
  date_lookup <- (reserving_dates[which(reserving_dates == (date_i))]) #i - j - k + 1 (predictor as at date)
  target_lookup <- (reserving_dates[which(reserving_dates == (date_i)) +j_dev_period - 1]) #i - k to calculate target incremental paid
  target_lookup_next <- (reserving_dates[which(reserving_dates == (date_i)) + j_dev_period]) #i -k + 1 to calculate targte incremental paid  
  
  if(verbose) cat(paste("Valn date", date_i, ", j = ", j_dev_period, ", k =", k, "\n"))
  
  # P_te_IBNR rowids of policies needing an RBNS reserve
  dt_policy_claim <- dt_policy_claim[date_pol_start <= date_lookup & date_lookup < date_report] #IBNR
  
  dt_policy_claim[, ':='(date_lookup = date_lookup,
                         delay_train = as.numeric(date_lookup - date_pol_start), #extra feature
                         j = j_dev_period,
                         k = k,
                         exposure = round((pmin(as.numeric(as.numeric(date_pol_end)), as.numeric(floor_date(date_i, unit= "second")))
                                             - as.numeric(date_pol_start))/(24*60*60*365),3),
                         target = ifelse(target_lookup <= date_pay &  date_pay < target_lookup_next & date_occur <= date_lookup ,claim_cost,0))]  #model assumes one payment so just need to check date paid
  
  dt_policy_claim <- dt_policy_claim [,.(exposure = sum(exposure)), by= c(setdiff(model_vars, 'exposure')) ]
  
  return(dt_policy_claim[, model_vars, with = FALSE])
  
}

In [66]:
# Initialize storage lists
freq_list <- list()
loss_list <- list()
test_list <- list()

# Set k and loop over valid j values
k <- 1
for (j in 1:(i - k + 1)) {
  
  # Freq
  freq_out <- IBNR_Freq_Train_ijk(dt_polclaim, t_i, j, k, lst_Date_slice, IBNR_model_vars)
  freq_out[, j_used := j]
  freq_list[[j]] <- freq_out
  
  # Loss
  loss_out <- IBNR_Loss_Train_ijk(dt_polclaim, t_i, j, k, lst_Date_slice, IBNR_model_vars)
  loss_out[, j_used := j]
  loss_list[[j]] <- loss_out
  
  # Test
  if (j <= (delta - k + 1)) {
    test_out <- IBNR_Test_ijk(dt_polclaim, t_i, j, k, lst_Date_slice, IBNR_model_vars)
    test_out[, j_used := j]
    test_list[[j]] <- test_out
  }
}

# Combine outputs
all_freq <- rbindlist(freq_list, fill = TRUE)
all_loss <- rbindlist(loss_list, fill = TRUE)
all_test <- rbindlist(test_list, fill = TRUE)

all_freq[, flgTrain := 1]   # Frequency training data
all_loss[, flgTrain := 2]   # Loss training data
all_test[, flgTrain := 0]  

# Display top rows (first 10 each)

head(all_freq)

head(all_loss)


head(all_test)

# Write to CSV
#write.csv(all_freq, "IBNR_Freq_Train_ijk_all.csv", row.names = FALSE)
#write.csv(all_loss, "IBNR_Loss_Train_ijk_all.csv", row.names = FALSE)
#write.csv(all_test, "IBNR_Test_ijk_all.csv", row.names = FALSE)


clm_number,pol_number,j,k,exposure,date_pol_start,date_occur,date_report,date_pay,Cover,Brand,Model,Price,target,j_used,flgTrain
<chr>,<chr>,<int>,<dbl>,<dbl>,<dttm>,<dttm>,<dttm>,<dttm>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
,201601010002,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,1,3,913,0,1,1
,201601010003,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,1,3,913,0,1,1
,201601010004,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,1,3,913,0,1,1
,201601010005,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,1,3,913,0,1,1
,201601010006,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,1,3,913,0,1,1
,201601010007,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,1,3,913,0,1,1


clm_number,pol_number,j,k,exposure,date_pol_start,date_occur,date_report,date_pay,Cover,Brand,Model,Price,target,j_used,flgTrain
<chr>,<chr>,<int>,<dbl>,<dbl>,<dttm>,<dttm>,<dttm>,<dttm>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
201608030002,201601060667,2,1,1,2016-01-06,2016-07-23,2016-08-03,2016-08-27,BO,1,3,913,244,2,2
201607300002,201601070086,2,1,1,2016-01-07,2016-07-23,2016-07-30,2016-08-28,B,1,2,794,162,2,2
201608030003,201601080133,2,1,1,2016-01-08,2016-04-25,2016-08-03,2016-08-24,B,1,0,600,96,2,2
201607300004,201601120717,2,1,1,2016-01-12,2016-07-27,2016-07-30,2016-08-24,BO,1,2,794,368,2,2
201607300006,201601160494,2,1,1,2016-01-16,2016-07-28,2016-07-30,2016-08-24,BO,1,3,913,453,2,2
201607310003,201601180368,2,1,1,2016-01-18,2016-07-25,2016-07-31,2016-08-26,BOT,1,2,794,386,2,2


clm_number,pol_number,j,k,exposure,date_pol_start,date_occur,date_report,date_pay,Cover,Brand,Model,Price,target,j_used,flgTrain
<chr>,<chr>,<int>,<dbl>,<dbl>,<dttm>,<dttm>,<dttm>,<dttm>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
,201601010002,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,1,3,913,0,1,0
,201601010003,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,1,3,913,0,1,0
,201601010004,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,1,3,913,0,1,0
,201601010005,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,1,3,913,0,1,0
,201601010006,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,1,3,913,0,1,0
,201601010007,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,1,3,913,0,1,0


In [67]:
# Load data.table functions (if not already loaded)
library(data.table)
library(lubridate)

# Set key and calculate delays for FREQ
setkey(all_freq, clm_number, k, j)
all_freq[, Count := .N , by = clm_number]
all_freq[, ':='(
  delay_uw_occ = ifelse(year(date_occur) == 2199, -1,
                        ceiling((as.numeric(date_occur) - as.numeric(date_pol_start)) / (24*60*60))),
  
  delay_occ_rep = ifelse(year(date_occur) == 2199, -1,
                         ceiling((as.numeric(date_report) - as.numeric(date_occur)) / (24*60*60))),
  
  delay_rep_pay = ifelse(year(date_occur) == 2199, -1,
                         ceiling((as.numeric(date_pay) - as.numeric(date_report)) / (24*60*60))),
  
  delay_uw_val = ceiling((as.numeric(t_i) - as.numeric(date_pol_start)) / (24*60*60)),
  date_uw = ceiling(as.numeric(date_pol_start) / (24*60*60)),
  Cover = as.factor(Cover)
)]

# Repeat for LOSS
setkey(all_loss, clm_number, k, j)
all_loss[, Count := .N , by = clm_number]
all_loss[, ':='(
  delay_uw_occ = ifelse(year(date_occur) == 2199, -1,
                        ceiling((as.numeric(date_occur) - as.numeric(date_pol_start)) / (24*60*60))),
  
  delay_occ_rep = ifelse(year(date_occur) == 2199, -1,
                         ceiling((as.numeric(date_report) - as.numeric(date_occur)) / (24*60*60))),
  
  delay_rep_pay = ifelse(year(date_occur) == 2199, -1,
                         ceiling((as.numeric(date_pay) - as.numeric(date_report)) / (24*60*60))),
  
  delay_uw_val = ceiling((as.numeric(t_i) - as.numeric(date_pol_start)) / (24*60*60)),
  date_uw = ceiling(as.numeric(date_pol_start) / (24*60*60)),
  Cover = as.factor(Cover)
)]

# Repeat for TEST
setkey(all_test, clm_number, k, j)
all_test[, Count := .N , by = clm_number]
all_test[, ':='(
  delay_uw_occ = ifelse(year(date_occur) == 2199, -1,
                        ceiling((as.numeric(date_occur) - as.numeric(date_pol_start)) / (24*60*60))),
  
  delay_occ_rep = ifelse(year(date_occur) == 2199, -1,
                         ceiling((as.numeric(date_report) - as.numeric(date_occur)) / (24*60*60))),
  
  delay_rep_pay = ifelse(year(date_occur) == 2199, -1,
                         ceiling((as.numeric(date_pay) - as.numeric(date_report)) / (24*60*60))),
  
  delay_uw_val = ceiling((as.numeric(t_i) - as.numeric(date_pol_start)) / (24*60*60)),
  date_uw = ceiling(as.numeric(date_pol_start) / (24*60*60)),
  Cover = as.factor(Cover)
)]


In [68]:
head(all_freq)

head(all_loss)

head(all_test)

clm_number,pol_number,j,k,exposure,date_pol_start,date_occur,date_report,date_pay,Cover,...,Price,target,j_used,flgTrain,Count,delay_uw_occ,delay_occ_rep,delay_rep_pay,delay_uw_val,date_uw
<chr>,<chr>,<int>,<dbl>,<dbl>,<dttm>,<dttm>,<dttm>,<dttm>,<fct>,...,<dbl>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
,201601010002,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,913,0,1,1,588668,-1,-1,-1,270,16801
,201601010003,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,913,0,1,1,588668,-1,-1,-1,270,16801
,201601010004,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,913,0,1,1,588668,-1,-1,-1,270,16801
,201601010005,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,913,0,1,1,588668,-1,-1,-1,270,16801
,201601010006,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,913,0,1,1,588668,-1,-1,-1,270,16801
,201601010007,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,913,0,1,1,588668,-1,-1,-1,270,16801


clm_number,pol_number,j,k,exposure,date_pol_start,date_occur,date_report,date_pay,Cover,...,Price,target,j_used,flgTrain,Count,delay_uw_occ,delay_occ_rep,delay_rep_pay,delay_uw_val,date_uw
<chr>,<chr>,<int>,<dbl>,<dbl>,<dttm>,<dttm>,<dttm>,<dttm>,<fct>,...,<dbl>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
201602010001,201601050247,8,1,1,2016-01-05,2016-01-28,2016-02-01,2016-03-04,BOT,...,837,371,8,2,1,23,4,32,266,16805
201602010002,201601090232,8,1,1,2016-01-09,2016-01-29,2016-02-01,2016-03-01,BOT,...,794,774,8,2,1,20,3,29,262,16809
201602010005,201601140432,8,1,1,2016-01-14,2016-01-22,2016-02-01,2016-03-07,BO,...,913,703,8,2,1,8,10,35,257,16814
201602010007,201601220474,8,1,1,2016-01-22,2016-01-25,2016-02-01,2016-03-08,BO,...,457,32,8,2,1,3,7,36,249,16822
201602020004,201601090567,8,1,1,2016-01-09,2016-01-24,2016-02-02,2016-02-29,BO,...,457,195,8,2,1,15,9,27,262,16809
201602020007,201601240318,8,1,1,2016-01-24,2016-01-26,2016-02-02,2016-03-04,BOT,...,457,97,8,2,1,2,7,31,247,16824


clm_number,pol_number,j,k,exposure,date_pol_start,date_occur,date_report,date_pay,Cover,...,Price,target,j_used,flgTrain,Count,delay_uw_occ,delay_occ_rep,delay_rep_pay,delay_uw_val,date_uw
<chr>,<chr>,<int>,<dbl>,<dbl>,<dttm>,<dttm>,<dttm>,<dttm>,<fct>,...,<dbl>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
,201601010002,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,913,0,1,0,1479120,-1,-1,-1,270,16801
,201601010003,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,913,0,1,0,1479120,-1,-1,-1,270,16801
,201601010004,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,913,0,1,0,1479120,-1,-1,-1,270,16801
,201601010005,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,913,0,1,0,1479120,-1,-1,-1,270,16801
,201601010006,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,913,0,1,0,1479120,-1,-1,-1,270,16801
,201601010007,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,913,0,1,0,1479120,-1,-1,-1,270,16801


In [70]:
data.table(
  Dataset = c("Freq Train", "Loss Train", "Test"),
  Rows = c(nrow(all_freq), nrow(all_loss), nrow(all_test))
)


Dataset,Rows
<chr>,<int>
Freq Train,732120
Loss Train,2755
Test,1778580


In [71]:
data.table(
  Dataset = c("Freq Train", "Loss Train", "Test"),
  Rows_with_NA_clm_number = c(
    sum(is.na(all_freq$clm_number)),
    sum(is.na(all_loss$clm_number)),
    sum(is.na(all_test$clm_number))
  )
)



Dataset,Rows_with_NA_clm_number
<chr>,<int>
Freq Train,588668
Loss Train,0
Test,1479120


In [69]:
write.csv(all_freq, "IBNR_Freq_Train_ijk_all.csv", row.names = FALSE)
write.csv(all_loss, "IBNR_Loss_Train_ijk_all.csv", row.names = FALSE)
write.csv(all_test, "IBNR_Test_ijk_all.csv", row.names = FALSE)


In [57]:


IBNR_Train <- function(dt_policy_claim, date_i, i, k, reserving_dates, model_vars, verbose = FALSE) {

  # Create a combined TRAIN dataset across all k and j combos
    for (k in 1:k){
      if (k==1){
        dt_train_Freq <- NULL
        dt_train_Loss <- NULL
      }
      
      for (j in 1:(i - k + 1)){
        dt_train_Freq <- rbind(dt_train_Freq, IBNR_Freq_Train_ijk(dt_policy_claim, date_i, j, k,reserving_dates, model_vars, verbose))
        dt_train_Loss <- rbind(dt_train_Loss, IBNR_Loss_Train_ijk(dt_policy_claim, date_i, j, k,reserving_dates, model_vars, verbose))
      }
    }

  return(list(Freq = dt_train_Freq, Loss = dt_train_Loss))
}


In [58]:
IBNR_Test <- function(dt_policy_claim, date_i, delta, k, reserving_dates, model_vars, verbose = FALSE) {
 
  # Create a combined TEST dataset across all k and j combos
  for (k in 1:k){
    if (k==1) dt_test <- NULL
    for (j in 1:(delta - k + 1)){
      dt_test <- rbind(dt_test, IBNR_Test_ijk(dt_policy_claim, date_i, j, k,reserving_dates, model_vars, verbose))
    }
  }
  return(dt_test)
}


In [59]:
#define IBNR modelVars
IBNR_model_vars <- c("clm_number",
                     "pol_number",
                     "j",
                     "k",
                     "exposure",
                     "date_pol_start",
                     "date_occur",
                     "date_report",
                     "date_pay",
                     "Cover",
                     "Brand",
                     "Model",
                     "Price",
                     "target")
    
# Create a combined TRAIN dataset for k = 1 and all valid j delay values
lst_IBNR_train <- IBNR_Train(dt_polclaim, t_i, i, k = 1,lst_Date_slice, IBNR_model_vars)

# Create a combined TEST dataset for k = 1 and all valid j delay values
dt_IBNR_test <- IBNR_Test(dt_polclaim, t_i, delta, k = 1,lst_Date_slice, IBNR_model_vars)

In [60]:
# Set flags
lst_IBNR_train$Freq[, flgTrain := 1]
lst_IBNR_train$Loss[, flgTrain := 2]
dt_IBNR_test[, flgTrain := 0]

# Combine datasets
dt_All_IBNR <- rbind(
  lst_IBNR_train$Freq,
  lst_IBNR_train$Loss,
  dt_IBNR_test
)

# Optional: Write to CSV
# write.csv(dt_All_IBNR, "dt_ALL_IBNR.csv", row.names = FALSE)


In [61]:
# order and create some delay fields
setkey(dt_All_IBNR, clm_number, k, j)
    
dt_All_IBNR[, Count := .N , by =clm_number]
dt_All_IBNR[,':='( delay_uw_occ = ifelse(year(date_occur) == 2199,
                                        -1,
                                        ceiling((as.numeric(date_occur) - as.numeric(date_pol_start))
                                                  /(24*60*60))
                                          ),
                   delay_occ_rep = ifelse(year(date_occur) == 2199,
                                          -1,
                                          ceiling((as.numeric(date_report) - as.numeric(date_occur))
                                                  /(24*60*60))
                                          ),
                   delay_rep_pay = ifelse(year(date_occur) == 2199,
                                          -1,
                                          ceiling((as.numeric(date_pay) - as.numeric(date_report))
                                                  /(24*60*60))
                                          ),
                   delay_uw_val = ceiling((as.numeric(t_i) - as.numeric(date_pol_start))/(24*60*60)),
                   date_uw = ceiling(as.numeric(date_pol_start)/(24*60*60)),
                   Cover = as.factor(Cover))]

In [62]:
head(dt_All_IBNR)

clm_number,pol_number,j,k,exposure,date_pol_start,date_occur,date_report,date_pay,Cover,...,Model,Price,target,flgTrain,Count,delay_uw_occ,delay_occ_rep,delay_rep_pay,delay_uw_val,date_uw
<chr>,<chr>,<int>,<int>,<dbl>,<dttm>,<dttm>,<dttm>,<dttm>,<fct>,...,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
,201601010002,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,3,913,0,1,2067788,-1,-1,-1,270,16801
,201601010003,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,3,913,0,1,2067788,-1,-1,-1,270,16801
,201601010004,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,3,913,0,1,2067788,-1,-1,-1,270,16801
,201601010005,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,3,913,0,1,2067788,-1,-1,-1,270,16801
,201601010006,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,3,913,0,1,2067788,-1,-1,-1,270,16801
,201601010007,1,1,0.74,2016-01-01,2199-12-31 23:59:59,2199-12-31 23:59:59,2199-12-31 23:59:59,B,...,3,913,0,1,2067788,-1,-1,-1,270,16801
