<a href="https://colab.research.google.com/github/riccardotenuta/market_basket_analysis/blob/main/Market_Basket_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import dataset from Kaggle

In [None]:
from google.colab import files

# Bind the returned {filename: bytes} mapping instead of leaving it as the
# cell's last expression: for kaggle.json it holds the API key, which would
# otherwise be rendered in the output and saved into the notebook.
uploaded = files.upload()
print(f'Uploaded: {sorted(uploaded)}')

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"riccardotenuta0023","key":"<REDACTED>"}'}

In [None]:
!ls -lha kaggle.json
!pip install -q kaggle # installing the kaggle package
!mkdir -p ~/.kaggle # creating .kaggle folder where the key should be placed
!cp kaggle.json ~/.kaggle/ # move the key to the folder
!pwd # checking the present working directory

-rw-r--r-- 1 root root 74 Apr 17 19:34 kaggle.json
/content


In [None]:
# restrict the key file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# download the dataset archive into /content/drive/MyDrive/
# NOTE(review): no Google Drive mount call is visible in this notebook —
# confirm the target folder is the mounted Drive (or a plain local directory) as intended
!kaggle datasets download -d asaniczka/1-3m-linkedin-jobs-and-skills-2024 -p /content/drive/MyDrive/

Downloading 1-3m-linkedin-jobs-and-skills-2024.zip to /content/drive/MyDrive
100% 1.88G/1.88G [00:20<00:00, 125MB/s]
100% 1.88G/1.88G [00:20<00:00, 97.0MB/s]


In [None]:
!unzip /content/drive/MyDrive/1-3m-linkedin-jobs-and-skills-2024.zip -d /content/drive/MyDrive/

Archive:  /content/drive/MyDrive/1-3m-linkedin-jobs-and-skills-2024.zip
  inflating: /content/drive/MyDrive/job_skills.csv  
  inflating: /content/drive/MyDrive/job_summary.csv  
  inflating: /content/drive/MyDrive/linkedin_job_postings.csv  


In [None]:
import pandas as pd
import numpy as np
import os
!pip install pyspark
from pyspark.sql import SparkSession
import pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=2660d8c2df03a4099359b7cdafb542ea40e873b0e1925b7e9de5a956c5fe799e
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


## Data preparation

In [None]:
# create (or reuse) the Spark session for this project
spark = (
    SparkSession.builder
    .appName("linkedin_project_SON")
    .getOrCreate()
)

In [None]:
# load the CSV, drop rows with missing values, then keep only the
# job_skills column and expose it as an RDD of Rows
rdd_from_csv = (
    spark.read.csv('./drive/MyDrive/job_skills.csv', header=True)
    .dropna()
    .select('job_skills')
    .rdd
)

In [None]:
# turn each row's ', '-separated skills string into a list of items (one basket per row)
# NOTE: this rebinds rdd_from_csv, so the cell is meant to run once per CSV load
rdd_from_csv = rdd_from_csv.map(
    lambda row: row['job_skills'].split(', ')
)

In [None]:
# peek at the first two baskets
rdd_from_csv.take(num=2)

[['Building Custodial Services',
  'Cleaning',
  'Janitorial Services',
  'Materials Handling',
  'Housekeeping',
  'Sanitation',
  'Waste Management',
  'Floor Maintenance',
  'Equipment Maintenance',
  'Safety Protocols',
  'Communication Skills',
  'Attention to Detail',
  'Physical Strength',
  'Experience in Housekeeping'],
 ['Customer service',
  'Restaurant management',
  'Food safety',
  'Training',
  'Supervision',
  'Scheduling',
  'Inventory',
  'Cost control',
  'Sales',
  'Communication',
  'Problemsolving',
  'Leadership',
  'Motivation',
  'Teamwork',
  'High School Diploma',
  "Bachelor's Degree",
  'ServSafe Certification',
  "Valid Driver's License",
  'Physical ability to perform job duties']]

In [None]:
# sample 10% of the whole dataset so the SON algorithm is cheap to compute;
# a fixed seed makes the sample (and every count derived from it) reproducible
rdd_son = rdd_from_csv.sample(withReplacement=False, fraction=0.10, seed=42)
rdd_son.count()


130062

In [None]:
from multiprocessing import cpu_count

# Spark suggests 2-4 partitions for each CPU on the machine. Since the algorithm
# runs on a single machine, the CPU count is taken from multiprocessing.cpu_count().
# https://spark.apache.org/docs/latest/rdd-programming-guide.html#parallelized-collections
initial_partitions = rdd_son.getNumPartitions()
optimal_partitions = 4*cpu_count()

# RDDs are immutable: repartition() returns a new RDD, so the result must be
# bound back to rdd_son for the new partitioning to take effect downstream.
rdd_son = rdd_son.repartition(numPartitions=optimal_partitions)

print(f'Partitions before -> {initial_partitions}')
print(f'Optimal partitions -> {optimal_partitions}')
print(f'Partitions after -> {rdd_son.getNumPartitions()}')

Partitions before -> 6
Optimal partitions -> 8


In [None]:
# per-partition support: 2% of the sampled baskets, divided by the number of partitions
basket_total = rdd_son.count()
partition_total = rdd_son.getNumPartitions()

support_threshold = round(0.02 * basket_total)
st_partition = round(support_threshold / partition_total)

print(f'Support threshold for each partition/chunk of data is {st_partition}')

Support threshold for each partition/chunk of data is 434


In [None]:
def first_pass(partition, support: int) -> list:
  """Find the items that are frequent within one partition of baskets.

  Args:
    partition: iterable of baskets, each basket being an iterable of hashable items.
    support: minimum number of baskets an item must appear in to be kept.

  Returns:
    List of (item, count) tuples with count >= support, sorted by count in
    descending order (ties keep first-seen order).
  """
  item_count = {}

  for basket in partition:
    # dict.fromkeys de-duplicates while keeping first-seen order, so an item
    # repeated inside one basket adds 1 (not several) to its support
    for item in dict.fromkeys(basket):
      item_count[item] = item_count.get(item, 0) + 1

  frequent_singleton = [(item, count) for item, count in item_count.items() if count >= support]

  return sorted(frequent_singleton, key=lambda x: x[1], reverse=True)

# run first_pass on every partition with the per-partition support threshold
first_pass_rdd = rdd_son.mapPartitions(
    lambda chunk: first_pass(chunk, st_partition)
)

In [None]:
def second_pass(partition, support: int) -> list:
  """Placeholder for the second pass over a partition; not implemented yet.

  Args:
    partition: iterable of baskets (same input shape as first_pass).
    support: support threshold to apply within the partition.

  Raises:
    NotImplementedError: always, until the pass is written.
  """
  # TODO: implement the second pass; the stub keeps the cell syntactically valid
  raise NotImplementedError('second_pass is not implemented yet')


In [None]:
# show the first ten (item, count) pairs produced by the first pass
first_pass_rdd.take(num=10)

[('Communication', 6757),
 ('Teamwork', 4170),
 ('Leadership', 3284),
 ('Customer service', 2992),
 ('Communication skills', 2277),
 ('Customer Service', 1997),
 ('Problem Solving', 1896),
 ('Sales', 1709),
 ('Problemsolving', 1703),
 ('Project Management', 1618)]