# Harnessing the Power of Python with Apache Spark

### Imports

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list
from itertools import combinations
import os

### Load the Dataset

In [None]:
# location of the Online Retail II extract (relative to the working directory)
file_path = 'online_retail_II_2010-2011.csv'

# obtain the Spark session for this analysis (getOrCreate reuses an existing one)
spark = SparkSession.builder.appName('OnlineRetailAnalysis').getOrCreate()

# load the CSV: first row is the header, column types are inferred from the data
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(file_path)
)

# preview the loaded rows
df.show()

### Market Basket Analysis

In [None]:
# Market basket analysis: count how often two stock codes appear on the same invoice.

# keep only the invoice id and stock code, dropping rows where either is null
df_filtered = df.select('Invoice', 'StockCode').na.drop()

# one row per invoice, with all of its stock codes gathered into an 'Items' list
df_grouped = df_filtered.groupBy('Invoice').agg(collect_list('StockCode').alias('Items'))

# generate item pairs per invoice and tag each with a count of 1.
# collect_list keeps duplicates and does not guarantee element order, so each
# basket is de-duplicated and sorted before pairing. This way:
#   * a stock code listed twice on one invoice does not produce (A, A) pairs
#     or double-count its pairings, and
#   * the same two items always yield the same key (A, B), never (B, A),
#     so their occurrences are summed under a single pair.
item_pairs = (df_grouped.rdd
              .flatMap(lambda row: combinations(sorted(set(row['Items'])), 2))
              .map(lambda pair: (pair, 1)))

# sum the per-invoice 1s to get the number of invoices containing each pair
pair_frequencies = item_pairs.reduceByKey(lambda a, b: a + b)

# sort pairs by frequency, most frequent first
sorted_pairs = pair_frequencies.sortBy(lambda x: x[1], ascending=False)

# display top 10 pairs as ((item_a, item_b), count) tuples
sorted_pairs.take(10)