In [20]:
train <- read.csv('./train_cleaned.csv')
test <- read.csv('./test_cleaned.csv')
train_selected <- train
test_selected <- test
train_selected$chromosome_interact <- NULL


test_selected$protein <- test$protein
test_selected$X82 = NULL
test_selected$chromosome_interact <- NULL

test_selected$essential[test_selected$essential == 'Ambiguous-Non-Essential'] = 'Non-Essential'
test_selected$essential_interact[test_selected$essential_interact == 'Ambiguous-Essential'] = 'Essential'

character_vars_s <- lapply(train_selected, class) == "character"
train_selected[, character_vars_s] <- lapply(train_selected[, character_vars_s], factor)
character_vars_t <- lapply(test_selected, class) == "character"
test_selected[, character_vars_t] <- lapply(test_selected[, character_vars_t], factor)

for (i in colnames(test_selected)){
    if (i == 'protein'){
        next
    }
    n1 = length(levels(test_selected[, i]))
    n2 = length(levels(test_selected[, i]))
    if (n1 == 1 || n1 != n2){
        test_selected[, i] = NULL
        train_selected[, i] = NULL
    } 
}

In [21]:
library(randomForest)
rf.clf.label <- randomForest(label ~ . - protein, data=train_selected)

In [42]:
var.imp <- importance(rf.clf.label)
var.imp <- var.imp[order(var.imp, decreasing = TRUE),, drop=FALSE]
var.imp <- var.imp[var.imp > 10, , drop=FALSE]
train_selected <- train_selected[, names(train_selected) %in% c(rownames(var.imp), 'label', 'protein')]
test_selected <- test_selected[, names(test_selected) %in% c(rownames(var.imp), 'protein')]
test_selected$X51 <- NULL
train_selected$X51 <- NULL

character_vars_s <- lapply(train_selected, class) == "character"
train_selected[, character_vars_s] <- lapply(train_selected[, character_vars_s], factor)
character_vars_t <- lapply(test_selected, class) == "character"
test_selected[, character_vars_t] <- lapply(test_selected[, character_vars_t], factor)
train_selected$label <- factor(train_selected$label)

train_selected$chromosome <- factor(train_selected$chromosome)
test_selected$chromosome <- factor(test_selected$chromosome)

train_size <- floor(0.7 * nrow(train_selected))
train_ind <- sample(seq_len(nrow(train_selected)), size = train_size, replace = FALSE)
valid_ind <- seq_len(nrow(train_selected))[-train_ind]
train_ind <- sample(seq_len(length(train_ind)), size = floor(2 * train_size), replace = TRUE)
# train_selected_b <- train_selected[train_ind, ]
train_selected_df <- train_selected[train_ind, ]
valid_selected_df <- train_selected[valid_ind, ]

# train_ind_0 <- sample(which(train_selected_b$label == 0), size = 1000, replace= TRUE)
# train_selected_df <- train_selected_b[train_ind_0, ]
# train_ind_1 <- sample(which(train_selected_b$label == 1), size = 600, replace = TRUE)
# train_selected_df <- rbind(train_selected_df, train_selected_b[train_ind_1, ])
# train_ind_2 <- sample(which(train_selected_b$label != 1 & train_selected_b$label != 0), size = 800, replace = TRUE)
# train_selected_df <- rbind(train_selected_df, train_selected_b[train_ind_2, ])

train_selected_df$X82 <- NULL
valid_selected_df$X82 <- NULL
test_selected$X82 <- NULL

train_selected_df$X51 <- NULL
valid_selected_df$X51 <- NULL
test_selected$X51 <- NULL


In [43]:
table(train_selected_df$label) / length(train_selected_df$label)
table(valid_selected_df$label) / length(valid_selected_df$label)


           0            1            2            3            4            5 
0.4200236967 0.2458530806 0.0663507109 0.0580568720 0.0639810427 0.0639810427 
           6            7            8            9           10           11 
0.0432464455 0.0130331754 0.0088862559 0.0082938389 0.0005924171 0.0053317536 
          13 
0.0023696682 


          0           1           2           3           4           5 
0.447513812 0.229281768 0.077348066 0.058011050 0.080110497 0.044198895 
          6           7           8           9          10          11 
0.027624309 0.011049724 0.013812155 0.002762431 0.005524862 0.000000000 
         13 
0.002762431 

In [46]:
library(e1071)
C <-  10 * seq(0.1, 1, 0.1)
epsilon <- seq(0.1, 1, 0.1)
g <- seq(0.1, 1, 0.1)
max_acc <- 0
max_C <- C[1]
max_epsilon <- epsilon[1]
max_g <- g[1]
max_pred <- c()
for (cost in C){
  for (eps in epsilon){
    model <- svm(label ~ . - protein, data = train_selected_df, kernel="radial", cost = cost, epsilon = eps, gamma=g)
    predicted <- predict(model, valid_selected_df)
    predicted <- factor(predicted, levels=levels(valid_selected_df$label))
    accu_l <- predicted == valid_selected_df$label
    accuracy <- sum(accu_l) / length(accu_l)
    if (accuracy > max_acc){
      max_acc <- accuracy
      max_C <- cost
      max_epsilon <- eps
      max_model <- model
      max_g <- g
      max_pred <- predicted
    }
  }
}
for (gamma in g){
    model <- svm(label ~ . - protein, data = train_selected_df, kernel="radial", cost = max_C, epsilon = max_epsilon, gamma=g)
    predicted <- predict(model, valid_selected_df)
    predicted <- factor(predicted, levels=levels(valid_selected_df$label))
    accu_l <- predicted == valid_selected_df$label
    accuracy <- sum(accu_l) / length(accu_l)
    if (accuracy > max_acc){
      max_acc <- accuracy
      max_model <- model
      max_g <- g
      max_pred <- predicted
    }
}

In [47]:
library(caret)
confusionMatrix(max_pred, valid_selected_df$label)

Confusion Matrix and Statistics

          Reference
Prediction   0   1   2   3   4   5   6   7   8   9  10  11  13
        0  155   3   0   2   0   3   0   0   0   0   0   0   0
        1    4  74   3   0   1   0   1   1   0   0   0   0   0
        2    0   0  25   0   1   0   0   0   0   0   0   0   0
        3    0   1   0  19   0   0   0   0   0   0   0   0   0
        4    3   4   0   0  27   1   0   0   1   0   0   0   0
        5    0   0   0   0   0  12   0   0   0   0   0   0   0
        6    0   1   0   0   0   0   9   0   0   0   0   0   0
        7    0   0   0   0   0   0   0   3   0   0   0   0   0
        8    0   0   0   0   0   0   0   0   4   0   0   0   0
        9    0   0   0   0   0   0   0   0   0   1   0   0   0
        10   0   0   0   0   0   0   0   0   0   0   2   0   0
        11   0   0   0   0   0   0   0   0   0   0   0   0   0
        13   0   0   0   0   0   0   0   0   0   0   0   0   1

Overall Statistics
                                          
  

In [50]:
for (i in colnames(test)){
    n1 = levels(test[, i])
    n2 = levels(train[, i])
    if (length(n1) != length(n2) || sum(n1 != n2)){
#         print(i)
#         print(levels(test[, i]))
#         print(levels(train[, i]))
        test[, i] <- factor(test[, i], levels = levels(train[, i]))
    }
}

test_preds <- predict(max_model, test_selected)
test_selected$label <- test_preds
table(test_preds) / length(test_preds)

test_preds
          0           1           2           3           4           5 
0.509186352 0.246719160 0.094488189 0.039370079 0.034120735 0.031496063 
          6           7           8           9          10          11 
0.026246719 0.002624672 0.013123360 0.002624672 0.000000000 0.000000000 
         13 
0.000000000 

In [222]:
result <- data.frame("protein" = test$protein, "label" = test$label)
result_agg <- aggregate(result,
                by = list(result$protein),
                FUN = function(x) tail(names(sort(table(x))), 1))
result_agg <- data.frame("protein" = result_agg$protein, "label" = result_agg$label)
table(result_agg$label) / length(result_agg$label)


          0           1          10           2           3           4 
0.422572178 0.304461942 0.002624672 0.057742782 0.057742782 0.020997375 
          5           6           7           8           9 
0.060367454 0.026246719 0.020997375 0.023622047 0.002624672 

In [223]:
colnames(result_agg) <- c('Key', 'Label')
write.csv(result_agg,'./result/result_Apr3_2.csv', row.names = FALSE, quote=FALSE)