In [80]:
source("./aux_2.R")

In [81]:
dataset <- read.csv("two_label_dataset.csv", col.names = c("ID", "Title", "Author", "Text", "Label"))
classes <- as.integer(sort(unique(dataset$Label)))

In [82]:
dataset$Text <- clean(dataset$Text)
dataset <- clean_empty_rows(dataset)

In [83]:
eighty_percent <- as.integer(length(dataset$Text) * 0.8)

training_set <- dataset[1:eighty_percent, ]
test_set <- dataset[(eighty_percent + 1):length(dataset$Text), ]

In [84]:
# Assuming you have defined train_multinomial_nb and apply_multinomial_nb functions

# Define k-fold cross-validation function
kfold_cross_validation <- function(dataset, k = 5, occ_thresholds = c(1, 2, 3)) {
  set.seed(123)  # Set seed for reproducibility
  
  n <- nrow(dataset)
  fold_size <- floor(n / k)
  
  accuracies <- matrix(0, nrow = k, ncol = length(occ_thresholds))
  
  for (fold in 1:k) {
    # Determine indices for train and validation sets
    validation_indices <- ((fold - 1) * fold_size + 1):(fold * fold_size)
    train_indices <- setdiff(1:n, validation_indices)
    
    # Split dataset into train and validation sets
    training_set <- dataset[train_indices, ]
    validation_set <- dataset[validation_indices, ]
    
    # Iterate over different occ_threshold values
    for (i in seq_along(occ_thresholds)) {
      occ_threshold <- occ_thresholds[i]
      
      # Train Naive Bayes model
      model <- train_multinomial_nb_new_two_label(classes, training_set, occ_threshold)
      
      # Predict on validation set
      pred_labels <- sapply(validation_set$Text, function(doc) {
        apply_multinomial_nb(classes, model$vocab, model$prior, model$condprob, doc)
      })
      
      # Calculate accuracy
      correct_predictions <- sum(validation_set$Label == pred_labels)
      total_predictions <- length(validation_set$Label)
      accuracy <- correct_predictions / total_predictions
      
      # Store accuracy for this fold and occ_threshold
      accuracies[fold, i] <- accuracy
    }
  }
  
  # Return mean accuracy across folds for each occ_threshold
  mean_accuracies <- colMeans(accuracies)
  return(data.frame(occ_threshold = occ_thresholds, mean_accuracy = mean_accuracies))
}

In [85]:
crossval_results <- kfold_cross_validation(training_set, k = 5, occ_thresholds = c(0.00001, 0.000016, 0.00002, 0.00005))
print(crossval_results)

[1] 23357
[1] 4.281372e-05
[1] 0.0002176096
[1] 23357
[1] 4.281372e-05
[1] 0.0002176096
[1] 23357
[1] 4.281372e-05
[1] 0.0002176096
[1] 23357
[1] 4.281372e-05
[1] 0.0002176096
[1] 23317
[1] 4.288716e-05
[1] 0.0002175217
[1] 23317
[1] 4.288716e-05
[1] 0.0002175217
[1] 23317
[1] 4.288716e-05
[1] 0.0002175217
[1] 23317
[1] 4.288716e-05
[1] 0.0002175217
[1] 23255
[1] 4.300151e-05
[1] 0.0002183356
[1] 23255
[1] 4.300151e-05
[1] 0.0002183356
[1] 23255
[1] 4.300151e-05
[1] 0.0002183356
[1] 23255
[1] 4.300151e-05
[1] 0.0002183356
[1] 23233
[1] 4.304222e-05
[1] 0.0002180041
[1] 23233
[1] 4.304222e-05
[1] 0.0002180041
[1] 23233
[1] 4.304222e-05
[1] 0.0002180041
[1] 23233
[1] 4.304222e-05
[1] 0.0002180041
[1] 23252
[1] 4.300705e-05
[1] 0.0002181425
[1] 23252
[1] 4.300705e-05
[1] 0.0002181425
[1] 23252
[1] 4.300705e-05
[1] 0.0002181425
[1] 23252
[1] 4.300705e-05
[1] 0.0002181425
  occ_threshold mean_accuracy
1       1.0e-05     0.8615618
2       1.6e-05     0.8586448
3       2.0e-05     0.8576724


In [87]:
crossval_results

occ_threshold,mean_accuracy
<dbl>,<dbl>
1e-05,0.8615618
1.6e-05,0.8586448
2e-05,0.8576724
5e-05,0.8497113


In [88]:
model <- train_multinomial_nb_new_two_label(classes, training_set, threshold = 0.00001)

[1] 24163
[1] 4.138559e-05
[1] 0.000214054


In [89]:
docs <- test_set$Text

pred_labels <- sapply(docs, function(doc) {
  apply_multinomial_nb(classes, model$vocab, model$prior, model$condprob, doc)
})

In [92]:
correct_predictions <- sum(test_set$Label == pred_labels)
total_predictions <- length(test_set$Label)
accuracy <- correct_predictions / total_predictions

cat("Accuracy:", accuracy)

Accuracy: 0.8668287

In [93]:
confusion_matrix <- table(True = test_set$Label, Predicted = pred_labels)
cat("Confusion Matrix:\n")
print(confusion_matrix)

Confusion Matrix:
    Predicted
True    0    1
   0 1795  247
   1  301 1772


In [None]:
false_negative <- table(True = test_docs$Label, Predicted = pred_labels)[2,1]
false_negative

__________________________________________________________________________________________________

In [2]:
test_set <- read.csv("two_label_dataset.csv", col.names = c("ID", "Title", "Author", "Text", "Label"))
classes2 <- as.integer(sort(unique(test_set$Label)))

In [3]:
test_set$Text <- clean(test_set$Text)
test_set <- clean_empty_rows(test_set)

In [60]:
result <- get_vocabulary_two_label(test_set$Text, threshold = 0.000016)
cat(length(result$voc))

[1] 25022
[1] 3.996483e-05
[1] 0.0002103047
5112

In [33]:
dataframe <- result$df
dataframe <- dataframe[order(-dataframe$occurrencies), ]

In [71]:
new_dataset <- test_set[1:15600, ]
test_docs <- test_set[15600:length(test_set$Text), ]

In [72]:
model <- train_multinomial_nb_new_two_label(classes2, new_dataset, threshold = 0.000016)
#print(model$vocab)
#print(model$prior)
#print(model$post)

[1] 23910
[1] 4.18235e-05
[1] 0.0002150433


In [73]:
docs <- test_docs$Text

pred_labels <- sapply(docs, function(doc) {
  apply_multinomial_nb(classes2, model$vocab, model$prior, model$condprob, doc)
})

In [75]:
correct_predictions <- sum(test_docs$Label == pred_labels)
total_predictions <- length(test_docs$Label)
accuracy <- correct_predictions / total_predictions

cat("Accuracy:", accuracy)

Accuracy: 0.864642

In [76]:
confusion_matrix <- table(True = test_docs$Label, Predicted = pred_labels)
cat("Confusion Matrix:\n")
print(confusion_matrix)

Confusion Matrix:
    Predicted
True    0    1
   0 2181  313
   1  360 2118


In [79]:
false_negative <- table(True = test_docs$Label, Predicted = pred_labels)[2,1]
false_negative