In [1]:
train <- read.csv('./train_cleaned.csv')
test <- read.csv('./test_cleaned.csv')
test$label <- 0
# label the dataset 
train$dataset <- 0
test$dataset <- 1

# group train and test
mixed <- rbind(train, test)
mixed$dataset <- factor(mixed$dataset)
character_vars <- lapply(mixed, class) == "character"
mixed[, character_vars] <- lapply(mixed[, character_vars], factor)

# remove single factor variables
single_factor_vars <- lapply(mixed, function(x) {length(levels(x))}) == 1
single_factor_vars = single_factor_vars[single_factor_vars]
mixed <- mixed[, !names(mixed) %in% names(single_factor_vars)]

# split mixed into train/valid
train_size <- floor(0.7 * nrow(mixed))
train_ind <- sample(seq_len(nrow(mixed)), size = train_size, replace = FALSE)
train_df <- mixed[train_ind, ]
valid_df <- mixed[-train_ind, ]

show_level_diff <- function(df1, df2){
    for (i in colnames(df2)){
    n1 = levels(df2[, i])
    n2 = levels(df1[, i])
    if (length(n1) != length(n2) || sum(n1 != n2)){
        print(i)
        print(levels(df1[, i]))
        print(levels(df2[, i]))
        }
    }
}

In [3]:
library(randomForest)
rf.clf <- randomForest(dataset ~ . - protein - label, data=train_df)
p <- predict(rf.clf, valid_df)
l <- p == valid_df$dataset
sum(l) / length(l)

In [107]:
var.imp <- importance(rf.clf)
var.imp <- var.imp[order(var.imp, decreasing = TRUE),, drop=FALSE]
var.imp <- var.imp[20 > var.imp & var.imp > 1, , drop=FALSE]
selected_vars <- rownames(var.imp)
selected_vars

In [115]:
train_selected <- train
test_selected <- test
train_selected$chromosome_interact <- NULL
train_selected$type <- NULL

test_selected$protein <- test$protein
test_selected$X82 = NULL
test_selected$chromosome_interact <- NULL
test_selected$type <- NULL
test_selected$essential[test_selected$essential == 'Ambiguous-Non-Essential'] = 'Non-Essential'
test_selected$essential_interact[test_selected$essential_interact == 'Ambiguous-Essential'] = 'Essential'

character_vars_s <- lapply(train_selected, class) == "character"
train_selected[, character_vars_s] <- lapply(train_selected[, character_vars_s], factor)
character_vars_t <- lapply(test_selected, class) == "character"
test_selected[, character_vars_t] <- lapply(test_selected[, character_vars_t], factor)

for (i in colnames(test_selected)){
    if (i == 'protein'){
        next
    }
    n1 = length(levels(test_selected[, i]))
    n2 = length(levels(test_selected[, i]))
    if (n1 == 1 || n1 != n2){
        test_selected[, i] = NULL
        train_selected[, i] = NULL
    } 
}

In [44]:
rf.clf.label <- randomForest(label ~ . - protein, data=train_selected)

In [116]:
var.imp <- importance(rf.clf.label)
var.imp <- var.imp[order(var.imp, decreasing = TRUE),, drop=FALSE]
var.imp <- var.imp[var.imp > 10, , drop=FALSE]
train_selected <- train_selected[, names(train_selected) %in% c(rownames(var.imp), 'label', 'protein')]
test_selected <- test_selected[, names(test_selected) %in% c(rownames(var.imp), 'protein')]
test_selected$X51 <- NULL
train_selected$X51 <- NULL

character_vars_s <- lapply(train_selected, class) == "character"
train_selected[, character_vars_s] <- lapply(train_selected[, character_vars_s], factor)
character_vars_t <- lapply(test_selected, class) == "character"
test_selected[, character_vars_t] <- lapply(test_selected[, character_vars_t], factor)
train_selected$label <- factor(train_selected$label)

train_size <- floor(0.7 * nrow(train_selected))
train_ind <- sample(seq_len(nrow(train_selected)), size = train_size, replace = FALSE)
valid_ind <- seq_len(nrow(train_selected))[-train_ind]
train_ind <- sample(seq_len(length(train_ind)), size = floor(1.5 * train_size), replace = TRUE)
train_selected_df <- train_selected[train_ind, ]
valid_selected_df <- train_selected[valid_ind, ]
train_selected_df$X82 <- NULL




In [117]:
library(e1071)
C <- 20 * seq(0.1, 1, 0.1)
epsilon <- seq(0.001, 0.01, 0.001)
g <- seq(0.01, 0.1, 0.01)
max_acc <- 0
max_C <- C[1]
max_epsilon <- epsilon[1]
max_pred <- c()
max_g <- g[1]
for (cost in C){
  for (eps in epsilon){
    model <- svm(label ~ . - protein - correlation - chromosome, data = train_selected_df, kernel="radial", cost = cost, epsilon = eps)
    predicted <- predict(model, valid_selected_df)
    predicted <- factor(predicted, levels=levels(valid_selected_df$label))
    accu_l <- predicted == valid_selected_df$label
    accuracy <- sum(accu_l) / length(accu_l)
    if (accuracy > max_acc){
      max_acc <- accuracy
      max_C <- cost
      max_epsilon <- eps
      max_model <- model
      max_pred <- predicted
    }
  }
}
c(max_C, max_epsilon)
for (gm in g){
    model <- svm(label ~ . - protein - correlation - chromosome, data = train_selected_df, kernel="radial", cost = max_C, epsilon = max_epsilon, gamma=gm)
    predicted <- predict(model, valid_selected_df)
    predicted <- factor(predicted, levels=levels(valid_selected_df$label))
    accu_l <- predicted == valid_selected_df$label
    accuracy <- sum(accu_l) / length(accu_l)
    if (accuracy > max_acc){
      max_acc <- accuracy
      max_model <- model
      max_pred <- predicted
      max_g <- gm
    }
}

In [118]:
table(max_pred) / length(max_pred)
c(max_C, max_epsilon, max_g, max_acc)

max_pred
          0           1           2           3           4           5 
0.467181467 0.270270270 0.046332046 0.038610039 0.042471042 0.019305019 
          6           7           8           9          10          11 
0.069498069 0.015444015 0.003861004 0.023166023 0.000000000 0.003861004 
         12          13          14 
0.000000000 0.000000000 0.000000000 

In [125]:
accuracy_per_label <- function(label){
    accu_l_0 <- valid_selected_df$label == label
    accu_l_0 <- predicted[accu_l_0] == label
    print(sum(accu_l_0) / length(accu_l_0))
}

In [126]:
for (i in levels(valid_selected_df$label)){
    accuracy_per_label(i)
}

[1] 0.9230769
[1] 0.7592593
[1] 0.4444444
[1] 0.5333333
[1] 0.5714286
[1] 0.625
[1] 0.6875
[1] 0.5
[1] 0
[1] 0.8
[1] 0
[1] 1
[1] NaN
[1] NaN
[1] 0


In [119]:
test_preds <- predict(max_model, test_selected)
test_selected$label <- test_preds
table(test_preds) / length(test_preds)

test_preds
          0           1           2           3           4           5 
0.443569554 0.296587927 0.044619423 0.049868766 0.031496063 0.052493438 
          6           7           8           9          10          11 
0.028871391 0.018372703 0.020997375 0.007874016 0.000000000 0.005249344 
         12          13          14 
0.000000000 0.000000000 0.000000000 

In [93]:
result_agg <- data.frame(Key=test_selected$protein, Label=test_selected$label)
write.csv(result_agg,'./result/result_Apr4.csv', row.names = FALSE, quote=FALSE)