<a href="https://colab.research.google.com/github/stephenfrein/csc8491/blob/main/AssociationRulesSourceCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install special packages only when they are missing, so this cell can be
# re-run as-is without commenting anything out
for (pkg in c("tidyr", "xlsx")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# load packages for use
library(tidyr)
library(xlsx)

In [None]:
# fetch the Tomcat spreadsheet from the web into a temporary .xlsx file,
# written in binary mode (mode = "wb")
tmp <- tempfile(fileext = ".xlsx")
URL <- "https://zenodo.org/records/268481/files/Tomcat.xlsx?download=1"
download.file(URL, tmp, mode = "wb")

In [None]:
# load the "tomcat" worksheet of the downloaded workbook into a data frame,
# keeping text columns as plain character strings rather than factors
rawdata <- read.xlsx(
  file = tmp,
  sheetName = "tomcat",
  stringsAsFactors = FALSE
)

# open the raw table in the data viewer for a visual check
View(rawdata)

In [None]:
# keep only the three columns of interest (positions 2, 3 and 10)
readydata <- rawdata[c(2, 3, 10)]

# open the trimmed table in the data viewer for a visual check
View(readydata)

In [None]:
# the files column holds many space-separated file names in a single cell
# split it into one column per file, leaving room for up to 100 files per bug
# most bugs have far fewer files than that, so fill = "right" pads the unused
# trailing columns with NA instead of raising a "too few values" warning
# (a bug with more than 100 files would still warn about the extra pieces)
# paste0() gives the new columns space-free names: V1, V2, ..., V100
readydata <- readydata %>%
  separate(files, into = paste0("V", 1:100), sep = " ", fill = "right")

In [None]:
# discard the first two columns, which are not files
readydata <- readydata[-(1:2)]

# open the result in the data viewer for a visual check
View(readydata)

In [None]:
# save the file columns as a CSV that can be read by the arules package
# row.names = FALSE keeps the row numbers out of the file, and na = "" writes
# the NA padding cells as empty fields; with the write.csv defaults every line
# would also carry its row number and the literal text NA next to the real
# file names
# the column-name header is still written as the first line of the file
newtemp <- tempfile(fileext = ".csv")
write.csv(readydata, file = newtemp, row.names = FALSE, na = "")

In [None]:
# install the association rules package only when it is missing,
# so this cell can be re-run as-is without commenting anything out
if (!requireNamespace("arules", quietly = TRUE)) {
  install.packages("arules")
}
library(arules)

In [None]:
# load the CSV as basket-format transactions;
# skip = 1 drops the header line at the top of the file
checkins <- read.transactions(
  file = newtemp,
  format = "basket",
  sep = ",",
  skip = 1
)
summary(checkins)

# show the first five check-ins
inspect(checkins[1:5])

In [None]:
# plot the frequency of the 20 most frequent items
# windows() only exists on Windows builds of R, so a separate graphics window
# is opened only there; on other platforms the plot goes to the default
# graphics device instead of the cell failing
if (.Platform$OS.type == "windows") {
  windows()
}
itemFrequencyPlot(checkins, topN = 20)

In [None]:
# first attempt at training a model: run apriori with its default settings
# (the defaults result in zero rules learned)
apriori(data = checkins)

In [None]:
# mine rules again with explicit thresholds:
# minimum support of 0.005 and minimum confidence of 0.50
filerules <- apriori(checkins, parameter = list(
  support = 0.005, confidence = 0.50))

# look at the first three rules
# head() returns however many rules exist when there are fewer than three,
# whereas indexing with [1:3] would fail on a shorter rule set
inspect(head(filerules, n = 3))

In [None]:
# sort the rules by lift and show the first five of the sorted set
# head() also works when fewer than five rules were mined,
# whereas indexing with [1:5] would fail on a shorter rule set
inspect(head(sort(filerules, by = "lift"), n = 5))

In [None]:
# pick out the rules whose items include one specific file
some_file_rules <- subset(
  filerules,
  subset = items %in% "java/org/apache/coyote/http11/Http11Processor.java"
)
inspect(some_file_rules)

In [None]:
# export the mined rules to filerules.csv as a quoted, comma-separated file
# without row names
write(
  filerules,
  file = "filerules.csv",
  sep = ",",
  quote = TRUE,
  row.names = FALSE
)