# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.fpm import FPGrowth
import time


# Reuse the session a notebook/shell already created, or start one when this
# script is run standalone, so that `sc` is defined and a SparkSession is
# active before `toDF()` is called below.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

data = sc.textFile("hdfs://vm1:9000/user/azureuser/data/mushroom.dat")

# Each input line is one transaction of whitespace-separated item tokens.
# str.split() with no argument already ignores leading/trailing whitespace.
# Blank lines would otherwise turn into empty transactions that inflate the
# transaction count (and the denominator used for support), so drop them.
transactions = (
    data.map(lambda line: line.split())
    .filter(lambda items: items)  # keep only non-empty item lists
    .repartition(16)
)

transaction_count = transactions.count()
print(f"Total number of transactions: {transaction_count}")


# Convert to a DataFrame with a single array column named "items",
# which is the column FPGrowth is pointed at via itemsCol.
df = transactions.map(lambda items: Row(items=items)).toDF()

# Mining thresholds passed to FPGrowth.
min_support = 0.2
min_confidence = 0.6

# perf_counter() is a monotonic clock intended for measuring intervals, so the
# reported duration cannot be skewed by system wall-clock adjustments.
start_time = time.perf_counter()

fpGrowth = FPGrowth(
    itemsCol="items",
    minSupport=min_support,
    minConfidence=min_confidence,
)

model = fpGrowth.fit(df)

freq_itemsets = model.freqItemsets

# Histogram of frequent itemsets by itemset length.
# countByValue() returns a {length: count} dict on the driver; it is turned
# into a sorted list of (length, count) tuples for reporting.
freq_itemsets_count = sorted(
    freq_itemsets.rdd.map(lambda row: len(row["items"])).countByValue().items()
)

print("Frequent Itemsets Count by Length:")
for length, count in freq_itemsets_count:
    print(f"Length {length}: {count}")

end_time = time.perf_counter()

# The timed span covers model fitting, the length histogram job and the
# printing above; it does not include loading/counting the input data.
print(f"Total execution time: {end_time - start_time} seconds")