# Load Library

In [13]:
suppressMessages(library(data.table))
suppressMessages(library(tidyverse))  
suppressMessages(library(RcppRoll))

# Read Data

In [14]:
# Read one raw CSV from ../raw. The strings "NA", "NaN" and "" are parsed as
# missing values and the file is read as UTF-8.
#   table_name: file name without the ".csv" extension, e.g. "train_label".
read_raw <- function(table_name) {
  fread(
    paste0("../raw/", table_name, ".csv"),
    na.strings = c("NA", "NaN", ""),
    encoding = "UTF-8"
  )
}

# Label
train_label <- read_raw("train_label")

# Activity
train_activity <- read_raw("train_activity")
test1_activity <- read_raw("test1_activity")
test2_activity <- read_raw("test2_activity")

# Combat
train_combat <- read_raw("train_combat")
test1_combat <- read_raw("test1_combat")
test2_combat <- read_raw("test2_combat")

# Pledge
train_pledge <- read_raw("train_pledge")
test1_pledge <- read_raw("test1_pledge")
test2_pledge <- read_raw("test2_pledge")

# Trade
train_trade <- read_raw("train_trade")
test1_trade <- read_raw("test1_trade")
test2_trade <- read_raw("test2_trade")

# Payment
train_payment <- read_raw("train_payment")
test1_payment <- read_raw("test1_payment")
test2_payment <- read_raw("test2_payment")

# gc must be *called*; `invisible(gc)` only returns the function object.
invisible(gc())

# Feature Engineering

## Activity

- playtime이 0인데 fishing이 0이 아닌 경우, playtime을 fishing 값으로 대체함.  
- game_money_change는 절대값으로 변경함.

In [15]:
# Clean one raw activity table and tag every row with its week.
#   - playtime: when it is 0 although fishing is non-zero, the fishing value
#     is used instead (fixes inconsistent records).
#   - game_money_change: replaced by its absolute value.
#   - week: day <= 7 -> 1, day <= 14 -> 2, day <= 21 -> 3, otherwise 4.
prepare_activity <- function(activity) {
  activity %>%
    mutate(
      playtime = ifelse(playtime == 0 & fishing != 0, fishing, playtime),
      game_money_change = abs(game_money_change),
      # Each threshold that `day` exceeds moves the row one week further.
      week = 1 + (day > 7) + (day > 14) + (day > 21)
    )
}

train_activity <- prepare_activity(train_activity)

# test1---------------------------------------------------------------

test1_activity <- prepare_activity(test1_activity)

# test2---------------------------------------------------------------

test2_activity <- prepare_activity(test2_activity)

### 유저별, 주차별 groupby 후 sum
- 주차간 이동 평균, 이동 표준편차 변수 생성.

In [16]:
# Build one wide row of weekly activity features per account.
#
# Steps:
#   1. Sum every activity measure per (acc_id, week).
#   2. Within each account, compute a rolling standard deviation and a rolling
#      mean over a right-aligned window of 2 weekly rows.
#   3. Spread the three tables to wide form (columns "<week>_<measure>",
#      "<week>_roll_sd2_<measure>", "<week>_roll_mean2_<measure>") and
#      left-join them on acc_id.
#
# `activity` must contain acc_id, week, day, char_id and server; every other
# column is treated as a measure.
# NOTE(review): the window runs over the weekly rows that exist for an
# account, so it spans non-adjacent weeks when an account has no rows for a
# week in between -- confirm this is intended.
weekly_activity_features <- function(activity) {
  weekly <- activity %>%
    select(-c("day", "char_id", "server")) %>%
    group_by(acc_id, week) %>%
    summarise_all(sum, na.rm = TRUE) %>%
    ungroup()

  measures <- setdiff(colnames(weekly), c("acc_id", "week"))

  # Long -> wide: one column per week/measure pair.
  widen <- function(tbl) {
    tbl %>%
      gather(variable, value, -(acc_id:week)) %>%
      unite(temp, week, variable) %>%
      spread(temp, value)
  }

  # Per-account rolling statistic; measures are renamed "<prefix>_<measure>".
  # mutate_at() replaces the deprecated mutate_each(); the measures are
  # selected by name so the grouping column is never touched.
  rolled_wide <- function(roll_fn, prefix) {
    weekly %>%
      group_by(acc_id) %>%
      mutate_at(measures, roll_fn, n = 2, align = "right", fill = NA) %>%
      ungroup() %>%
      rename_at(measures, function(nm) paste(prefix, nm, sep = "_")) %>%
      widen()
  }

  # Week-1 rolling columns are dropped: a week-1 row is always the first row
  # of its account, so its 2-row window is never complete.
  widen(weekly) %>%
    left_join(rolled_wide(roll_sd, "roll_sd2"), by = "acc_id") %>%
    left_join(rolled_wide(roll_mean, "roll_mean2"), by = "acc_id") %>%
    select(-starts_with("1_roll"))
}

train_activity <- weekly_activity_features(train_activity)

# gc must be *called*; `invisible(gc)` only returns the function object.
invisible(gc())

In [17]:
test1_activity <- weekly_activity_features(test1_activity)

invisible(gc())

In [18]:
test2_activity <- weekly_activity_features(test2_activity)

invisible(gc())

## Combat

In [19]:
# Tag every row with its week derived from `day`:
# day <= 7 -> 1, day <= 14 -> 2, day <= 21 -> 3, otherwise 4.
add_week <- function(tbl) {
  # Each threshold that `day` exceeds moves the row one week further.
  tbl %>% mutate(week = 1 + (day > 7) + (day > 14) + (day > 21))
}

train_combat <- add_week(train_combat)

# test1---------------------------------------------------------------

test1_combat <- add_week(test1_combat)

# test2---------------------------------------------------------------

test2_combat <- add_week(test2_combat)

### 유저별, 주차별 groupby 후 sum
- 주차간 이동 평균, 이동 표준편차 변수 생성.

In [20]:
# Build one wide row of weekly combat features per account.
#
# Steps:
#   1. Sum every combat measure per (acc_id, week).
#   2. Within each account, compute a rolling standard deviation and a rolling
#      mean over a right-aligned window of 2 weekly rows.
#   3. Spread the three tables to wide form (columns "<week>_<measure>",
#      "<week>_roll_sd2_<measure>", "<week>_roll_mean2_<measure>") and
#      left-join them on acc_id.
#
# `combat` must contain acc_id, week, day, char_id, server and class; every
# other column is treated as a measure.
# NOTE(review): the window runs over the weekly rows that exist for an
# account, so it spans non-adjacent weeks when an account has no rows for a
# week in between -- confirm this is intended.
weekly_combat_features <- function(combat) {
  weekly <- combat %>%
    select(-c("day", "char_id", "server", "class")) %>%
    group_by(acc_id, week) %>%
    summarise_all(sum, na.rm = TRUE) %>%
    ungroup()

  measures <- setdiff(colnames(weekly), c("acc_id", "week"))

  # Long -> wide: one column per week/measure pair.
  widen <- function(tbl) {
    tbl %>%
      gather(variable, value, -(acc_id:week)) %>%
      unite(temp, week, variable) %>%
      spread(temp, value)
  }

  # Per-account rolling statistic; measures are renamed "<prefix>_<measure>".
  # mutate_at() replaces the deprecated mutate_each(); the measures are
  # selected by name so the grouping column is never touched.
  rolled_wide <- function(roll_fn, prefix) {
    weekly %>%
      group_by(acc_id) %>%
      mutate_at(measures, roll_fn, n = 2, align = "right", fill = NA) %>%
      ungroup() %>%
      rename_at(measures, function(nm) paste(prefix, nm, sep = "_")) %>%
      widen()
  }

  # Week-1 rolling columns are dropped: a week-1 row is always the first row
  # of its account, so its 2-row window is never complete.
  widen(weekly) %>%
    left_join(rolled_wide(roll_sd, "roll_sd2"), by = "acc_id") %>%
    left_join(rolled_wide(roll_mean, "roll_mean2"), by = "acc_id") %>%
    select(-starts_with("1_roll"))
}

train_combat <- weekly_combat_features(train_combat)

# gc must be *called*; `invisible(gc)` only returns the function object.
invisible(gc())

In [21]:
test1_combat <- weekly_combat_features(test1_combat)

invisible(gc())

In [22]:
test2_combat <- weekly_combat_features(test2_combat)

invisible(gc())

## Payment

In [23]:
# Tag every row with its week derived from `day`:
# day <= 7 -> 1, day <= 14 -> 2, day <= 21 -> 3, otherwise 4.
add_week <- function(tbl) {
  # Each threshold that `day` exceeds moves the row one week further.
  tbl %>% mutate(week = 1 + (day > 7) + (day > 14) + (day > 21))
}

train_payment <- add_week(train_payment)


# test1---------------------------------------------------------------


test1_payment <- add_week(test1_payment)


# test2---------------------------------------------------------------


test2_payment <- add_week(test2_payment)

### 유저별, 주차별 groupby 후 sum, n_distinct
- 한 유저의 주차별 결제 횟수.
- 주차간 이동 평균, 이동 표준편차 변수 생성.

In [24]:
# Build one wide row of weekly payment features per account.
#
# Steps:
#   1. Count the distinct paying days per (acc_id, week) -> "<week>_payment_cnt".
#   2. Sum every remaining payment column per (acc_id, week).
#   3. Within each account, compute a rolling standard deviation and a rolling
#      mean of those sums over a right-aligned window of 2 weekly rows.
#   4. Spread everything to wide form and left-join on acc_id, in the order
#      counts, sums, rolling sd, rolling mean.
#
# `payment` must contain acc_id, week and day; every other column is treated
# as a measure.
# NOTE(review): the window runs over the weekly rows that exist for an
# account, so it spans non-adjacent weeks when an account has no rows for a
# week in between -- confirm this is intended.
weekly_payment_features <- function(payment) {
  # Long -> wide: one column per week/measure pair.
  widen <- function(tbl) {
    tbl %>%
      gather(variable, value, -(acc_id:week)) %>%
      unite(temp, week, variable) %>%
      spread(temp, value)
  }

  # On how many distinct days did the account pay in each week?
  paying_days <- payment %>%
    group_by(acc_id, week) %>%
    summarise(payment_cnt = n_distinct(day)) %>%
    ungroup() %>%
    widen()

  weekly <- payment %>%
    select(-day) %>%
    group_by(acc_id, week) %>%
    summarise_all(sum, na.rm = TRUE) %>%
    ungroup()

  measures <- setdiff(colnames(weekly), c("acc_id", "week"))

  # Per-account rolling statistic; measures are renamed "<prefix>_<measure>".
  # mutate_at() replaces the deprecated mutate_each(); the measures are
  # selected by name so the grouping column is never touched.
  rolled_wide <- function(roll_fn, prefix) {
    weekly %>%
      group_by(acc_id) %>%
      mutate_at(measures, roll_fn, n = 2, align = "right", fill = NA) %>%
      ungroup() %>%
      rename_at(measures, function(nm) paste(prefix, nm, sep = "_")) %>%
      widen()
  }

  # Week-1 rolling columns are dropped: a week-1 row is always the first row
  # of its account, so its 2-row window is never complete.
  paying_days %>%
    left_join(widen(weekly), by = "acc_id") %>%
    left_join(rolled_wide(roll_sd, "roll_sd2"), by = "acc_id") %>%
    left_join(rolled_wide(roll_mean, "roll_mean2"), by = "acc_id") %>%
    select(-starts_with("1_roll"))
}

train_payment <- weekly_payment_features(train_payment)

# gc must be *called*; `invisible(gc)` only returns the function object.
invisible(gc())

In [25]:
test1_payment <- weekly_payment_features(test1_payment)

invisible(gc())

In [26]:
test2_payment <- weekly_payment_features(test2_payment)

invisible(gc())

## Trade

### 유저별 판매/구매 일수, 거래 상대 유저 수, 거래 수량·가격 통계(mean, sd, max, min)

In [27]:
# Statistics applied to the per-day trade averages of each account.
fn <- list(mean = mean, sd = sd, max = max, min = min)

# Build per-account trade features for one trade table.
#
# Returns a list with two tables keyed by acc_id:
#   $target - seller side (rows grouped by source_acc_id):
#             soruce_cnt = number of distinct days with a sale,
#             sell_cnt   = number of distinct accounts sold to,
#             plus `stats` of the daily mean sell_item_amount / sell_item_price.
#   $source - buyer side (rows grouped by target_acc_id):
#             target_cnt = number of distinct days with a purchase,
#             buy_cnt    = number of distinct accounts bought from,
#             plus `stats` of the daily mean buy_item_amount / buy_item_price.
#
# The "target"/"source" labels and the misspelled column `soruce_cnt` are
# kept as they are: later cells join on these table names and the column name
# ends up in the written CSV.
build_trade_features <- function(trade, stats) {
  sell_counts <- trade %>%
    group_by(source_acc_id) %>%
    summarise(
      soruce_cnt = n_distinct(day),
      sell_cnt = n_distinct(target_acc_id)
    ) %>%
    ungroup() %>%
    rename(acc_id = source_acc_id)

  # Daily means first, then the statistics over an account's trading days.
  sell_stats <- trade %>%
    group_by(source_acc_id, day) %>%
    summarise(
      sell_item_amount = mean(item_amount, na.rm = TRUE),
      sell_item_price = mean(item_price, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    rename(acc_id = source_acc_id) %>%
    select(-day) %>%
    group_by(acc_id) %>%
    summarise_all(stats, na.rm = TRUE) %>%
    ungroup()

  buy_counts <- trade %>%
    group_by(target_acc_id) %>%
    summarise(
      target_cnt = n_distinct(day),
      buy_cnt = n_distinct(source_acc_id)
    ) %>%
    ungroup() %>%
    rename(acc_id = target_acc_id)

  buy_stats <- trade %>%
    group_by(target_acc_id, day) %>%
    summarise(
      buy_item_amount = mean(item_amount, na.rm = TRUE),
      buy_item_price = mean(item_price, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    rename(acc_id = target_acc_id) %>%
    select(-day) %>%
    group_by(acc_id) %>%
    summarise_all(stats, na.rm = TRUE) %>%
    ungroup()

  list(
    target = left_join(sell_counts, sell_stats, by = "acc_id"),
    source = left_join(buy_counts, buy_stats, by = "acc_id")
  )
}

trade_features <- build_trade_features(train_trade, fn)
train_target_trade <- trade_features$target
train_source_trade <- trade_features$source

rm(trade_features)
# gc must be *called*; `invisible(gc)` only returns the function object.
invisible(gc())

In [28]:
trade_features <- build_trade_features(test1_trade, fn)
test1_target_trade <- trade_features$target
test1_source_trade <- trade_features$source

rm(trade_features)
invisible(gc())

In [29]:
trade_features <- build_trade_features(test2_trade, fn)
test2_target_trade <- trade_features$target
test2_source_trade <- trade_features$source

rm(trade_features)
invisible(gc())

# Join
- 위에서 전처리한 데이터들을 acc_id를 기준으로 병합

In [30]:
# Attach the payment, combat and both trade feature tables to the train
# activity features. Left joins on acc_id keep every account that is present
# in the activity table. The pledge table is not joined.
train_activity <- train_activity %>%
  left_join(train_payment, by = "acc_id") %>%
  left_join(train_combat, by = "acc_id") %>%
  left_join(train_source_trade, by = "acc_id") %>%
  left_join(train_target_trade, by = "acc_id")

rm(train_payment, train_combat, train_pledge, train_source_trade, train_target_trade)
# gc must be *called*; `invisible(gc)` only returns the function object.
invisible(gc())

# Join with label: one row per labelled account, features attached by acc_id.
label <- left_join(train_label, train_activity, by = "acc_id")

In [31]:
# Attach the payment, combat and both trade feature tables to the test1
# activity features. Left joins on acc_id keep every account that is present
# in the activity table. The pledge table is not joined.
test1_activity <- test1_activity %>%
  left_join(test1_payment, by = "acc_id") %>%
  left_join(test1_combat, by = "acc_id") %>%
  left_join(test1_source_trade, by = "acc_id") %>%
  left_join(test1_target_trade, by = "acc_id")

rm(test1_payment, test1_combat, test1_pledge, test1_source_trade, test1_target_trade)
# gc must be *called*; `invisible(gc)` only returns the function object.
invisible(gc())

In [32]:
# Attach the payment, combat and both trade feature tables to the test2
# activity features. Left joins on acc_id keep every account that is present
# in the activity table. The pledge table is not joined.
test2_activity <- test2_activity %>%
  left_join(test2_payment, by = "acc_id") %>%
  left_join(test2_combat, by = "acc_id") %>%
  left_join(test2_source_trade, by = "acc_id") %>%
  left_join(test2_target_trade, by = "acc_id")

rm(test2_payment, test2_combat, test2_pledge, test2_source_trade, test2_target_trade)
# gc must be *called*; `invisible(gc)` only returns the function object.
invisible(gc())

# Handle Missing Value
- 결측값 처리

In [33]:
# Accounts without records for a given week or table end up with missing
# values after the joins, and some aggregates are infinite; both are set to 0.
#
# Replace NA/NaN and +/-Inf in a numeric column with 0. Non-numeric columns
# are returned unchanged.
zero_fill_non_finite <- function(col) {
  if (!is.numeric(col)) {
    return(col)
  }
  replace(col, !is.finite(col), 0)
}

train_activity <- train_activity %>% mutate_all(zero_fill_non_finite)

# `label` was joined from train_activity before this step and is the table
# that gets written out as the train set, so its feature columns need the same
# treatment. Columns that come from train_label (acc_id and the targets) are
# left untouched.
label_feature_cols <- setdiff(names(label), names(train_label))
label <- label %>% mutate_at(label_feature_cols, zero_fill_non_finite)

# test1---------------------------------------------------------------

test1_activity <- test1_activity %>% mutate_all(zero_fill_non_finite)

# test2---------------------------------------------------------------

test2_activity <- test2_activity %>% mutate_all(zero_fill_non_finite)

# Save the Dataset

In [35]:
# Write the preprocessed train/test tables as UTF-8 CSV files without row
# names. The output directory is created first so that a missing
# ../preprocess folder does not abort the script at its very last step.
dir.create("../preprocess", showWarnings = FALSE, recursive = TRUE)

write.csv(label, file = "../preprocess/train_preprocess.csv",
          row.names = FALSE, fileEncoding = "UTF-8")
write.csv(test1_activity, file = "../preprocess/test1_preprocess.csv",
          row.names = FALSE, fileEncoding = "UTF-8")
write.csv(test2_activity, file = "../preprocess/test2_preprocess.csv",
          row.names = FALSE, fileEncoding = "UTF-8")