# Connect to Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so the
# dataset folder used by the later cells is reachable as a normal path.
from google.colab import drive
# force_remount=True asks for the mount to be redone even when one is already present.
drive.mount("/content/gdrive", force_remount=True)

# Dependencies

In [None]:
# Swap whatever OpenJDK packages are installed for the headless JDK 21 and
# print the Java version that ends up on the PATH.
# (presumably required because spmf.jar is executed by the JVM — confirm)
!apt-get remove openjdk-* -y
!apt-get install openjdk-21-jdk-headless -qq > /dev/null
!java -version
# Install the `spmf` Python package that provides the Spmf class imported below.
!pip install spmf

from spmf import Spmf
import numpy as np
from pathlib import Path
import logging as lg
import os
import time

# Download the SPMF jar into the working directory only when it is not there yet,
# so re-running the cell does not fetch it again.
if not os.path.exists("spmf.jar"):
    !wget http://www.philippe-fournier-viger.com/spmf/spmf.jar

In [None]:
# Configure root logging at INFO level; force=True replaces any handlers that
# were already installed on the root logger before this cell ran.
lg.basicConfig(level=lg.INFO, force=True)
# Module-level logger used by the code in the cells below.
logger = lg.getLogger(__name__)

# Prepare Dataset

In [None]:
# Download the dataset, unpack it, and prepare the dataset structure. First check whether the data folder already exists and contains the correct input files.

# Parameters
Possible entries in the data folder:
['test.txt', 'output', '.ipynb_checkpoints', 'kosorak.txt', 'fifa.txt', 'e_shop.txt', 'BMS1.txt', 'sign.txt', 'microblogPCU.txt']


In [None]:
# Dataset base names (without the ".txt" extension) that can be selected below.
possible_names = ['test', 'kosorak', 'fifa', 'e_shop', 'BMS1', 'sign', 'microblogPCU']
# File name of the sequence database to mine: chosen base name plus ".txt".
input_sequence_db_name = "BMS1" + ".txt"

# Minimum support as an absolute number of sequences: PrefixSpan compares it
# directly against per-item sequence counts.
min_support = 5

## Prepare Paths

In [None]:
# Resolve the dataset folder on the mounted Drive. Strict resolution fails
# immediately when the Drive is not mounted or the folder does not exist.
try:
    google_base_path = Path("/content/gdrive/MyDrive/data/PrefixSpan").resolve(strict=True)
except (OSError, RuntimeError) as err:
    # Only path-resolution failures are translated; the cause is chained explicitly.
    raise Exception("You aren't connected to google drive or you don't have mounted folder with data") from err

# Names of everything found in the data folder; used to make a missing-input
# error actionable.
possible_sequences_names = [entry.name for entry in google_base_path.iterdir()]

try:
    input_sequence_filepath = (google_base_path / input_sequence_db_name).resolve(strict=True)
except FileNotFoundError as err:
    # Same exception type as before, but it now says which entries are available.
    raise FileNotFoundError(
        f"Input file {input_sequence_db_name!r} not found in {google_base_path}; "
        f"available entries: {sorted(possible_sequences_names)}"
    ) from err

# Results are written next to the data, in an "output" sub-folder that is
# created on first use.
output_base = google_base_path / "output"
output_base.mkdir(exist_ok=True)
# Output of the custom implementation and of the SPMF reference run, respectively.
output_filepath = output_base / input_sequence_db_name
spmf_output_filepath = output_base / f"spmf_{input_sequence_db_name}"

# Algorithm

## utils

In [None]:
def calculate_percentage_min_support(min_support: int, input_filepath: Path) -> float:
    """Express an absolute minimum support as a percentage of the database size.

    Args:
        min_support: Minimum number of sequences a pattern must occur in.
        input_filepath: Sequence database file holding one sequence per line.

    Returns:
        ``min_support`` as a percentage (0-100 scale) of the number of
        sequence lines in the file, or ``0.0`` when the file has none.
    """
    with open(input_filepath, 'r') as file:
        # A file object has no len(); count the sequence lines by iterating.
        # Blank lines and lines starting with '#', '%' or '@' are left out.
        # NOTE(review): this is meant to mirror how SPMF counts sequences
        # (it is understood to skip such lines) — confirm against SPMF docs.
        seq_length = sum(
            1
            for line in file
            if line.strip() and line.lstrip()[0] not in "#%@"
        )
    if seq_length == 0:
        return 0.0
    return (min_support * 100) / seq_length

In [None]:
class PrefixSpan:
    """Mine frequent sequential patterns from an SPMF-style sequence file.

    Every input line is one sequence: space-separated items, with ``-1``
    closing an itemset and ``-2`` closing the sequence. Patterns are grown one
    item at a time, and each added item is matched in an itemset strictly
    after the one that matched the previous item (items are never added to an
    existing itemset of the pattern).
    """

    def __init__(self, input_filepath: Path, output_filepath: Path, min_support: int):
        """Load the sequence database and remember the mining parameters.

        Args:
            input_filepath: Sequence database to read.
            output_filepath: File that ``write_frequent_patterns_to_file`` writes.
            min_support: Minimum number of sequences a pattern must occur in.
        """
        self.input_filepath = input_filepath
        self.output_filepath = output_filepath
        self.min_support = min_support
        self.sequences = self.read_data(input_filepath.as_posix())
        self.frequent_patterns = []

    @staticmethod
    def read_data(filename: str) -> list:
        """Parse a sequence file into a list of sequences.

        Args:
            filename: Path of the file to read.

        Returns:
            A list of sequences; each sequence is a list of itemsets and each
            itemset is a list of item strings.
        """
        sequences = []
        with open(filename, 'r') as file:
            for line in file:
                stripped = line.strip()
                # Skip blank lines and metadata/comment lines so they do not
                # turn into empty or bogus sequences.
                if not stripped or stripped[0] in "#%@":
                    continue
                sequence = []
                itemset = []
                # Work on whole tokens rather than on the substring '-1', so
                # only an actual separator token ends an itemset.
                for token in stripped.split():
                    if token == '-2':
                        break
                    if token == '-1':
                        sequence.append(itemset)
                        itemset = []
                    else:
                        itemset.append(token)
                # Items after the last '-1' (an unterminated itemset) are dropped.
                sequences.append(sequence)
        return sequences

    def prefix_span(self, prefix, projected_db):
        """Record every frequent extension of ``prefix`` and recurse into it.

        Args:
            prefix: Items of the pattern grown so far (empty at the root).
            projected_db: Sequences (suffixes) in which ``prefix`` has matched.

        Results are appended to ``self.frequent_patterns`` as
        ``(pattern, support)`` tuples in depth-first order.
        """
        # Support = number of sequences containing the item, so each item is
        # counted at most once per sequence. First-seen order is kept on
        # purpose: it decides the output order of items with equal support.
        item_counts = {}
        for sequence in projected_db:
            seen = set()
            for itemset in sequence:
                for item in itemset:
                    if item not in seen:
                        seen.add(item)
                        item_counts[item] = item_counts.get(item, 0) + 1

        frequent_items = [
            (item, count)
            for item, count in item_counts.items()
            if count >= self.min_support
        ]
        # Most frequent first; the sort is stable, so ties keep first-seen order.
        frequent_items.sort(key=lambda pair: pair[1], reverse=True)

        for item, count in frequent_items:
            new_prefix = prefix + [item]
            self.frequent_patterns.append((new_prefix, count))
            suffix_db = self._build_suffix_db(new_prefix, projected_db)
            self.prefix_span(new_prefix, suffix_db)

    def _build_suffix_db(self, prefix, projected_db):
        """Project ``projected_db`` on the last item of ``prefix``.

        For each sequence, the itemsets after the first itemset containing
        that item are kept; sequences without the item, or with nothing after
        it, contribute no entry.
        """
        last_item = prefix[-1]
        suffix_db = []
        for sequence in projected_db:
            for index, itemset in enumerate(sequence):
                if last_item in itemset:
                    suffix = sequence[index + 1:]
                    if suffix:
                        suffix_db.append(suffix)
                    # Only the first occurrence in a sequence is projected.
                    break
        return suffix_db

    def run(self):
        """Mine all frequent patterns and return them.

        Returns:
            List of ``(pattern, support)`` tuples, also stored in
            ``self.frequent_patterns``.
        """
        # Start from a clean result list so repeated runs do not accumulate
        # duplicates of the same patterns.
        self.frequent_patterns = []
        self.prefix_span([], list(self.sequences))
        return self.frequent_patterns

    def write_frequent_patterns_to_file(self):
        """Write the mined patterns to ``self.output_filepath``, one per line.

        Nothing is written (a warning is logged instead) when there are no
        patterns to save.
        """
        if not self.frequent_patterns:
            logger.warning("Trying to save outputfile if there is no frequent found, probably algorithm was not run")
            return
        # NOTE(review): all items of a pattern are written before a single
        # '-1'; confirm this matches the format the analysis step expects when
        # comparing against the SPMF output.
        with open(self.output_filepath.as_posix(), 'w') as file:
            file.writelines(
                ' '.join(pattern) + ' -1 #SUP: ' + str(support) + '\n'
                for pattern, support in self.frequent_patterns
            )

# Experiments

In [None]:
# Build the miner first: the constructor reads the input file, so parsing is
# kept out of the timed section and only the mining itself is measured.
prefix_span = PrefixSpan(input_sequence_filepath, output_filepath, min_support)
start_time = time.perf_counter()
prefix_span.run()
end_time = time.perf_counter()
print(f"Time of custom prefix time: {end_time - start_time} with min support: {min_support}")
# Save the mined patterns to output_filepath.
prefix_span.write_frequent_patterns_to_file()

# SPMF version

In [None]:
# Run the reference SPMF implementation on the same input with the same
# threshold, so its runtime and output can be compared with the custom miner.
# The absolute min_support is converted to a share of the database first.
# NOTE(review): the value passed below is on a 0-100 scale, while the spmf
# wrapper's examples pass fractional thresholds such as 0.7 — confirm which
# scale SPMF expects for a numeric argument without a '%' sign.
min_support_perc = calculate_percentage_min_support(min_support, input_sequence_filepath)
spmf_prefix_span = Spmf(
    "PrefixSpan",
    input_filename=input_sequence_filepath.as_posix(),
    output_filename=spmf_output_filepath.as_posix(),
    arguments=[min_support_perc],
)
start_time = time.perf_counter()
spmf_prefix_span.run()
end_time = time.perf_counter()
# This timing belongs to the SPMF run; the message previously called it "custom".
print(f"Time of SPMF PrefixSpan: {end_time - start_time} with min support: {min_support}")

# Analyse