<a href="https://colab.research.google.com/github/riccardotenuta/market_basket_analysis/blob/main/Market_Basket_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import dataset from Kaggle

In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"riccardotenuta0023","key":"9fe14c68d229c6d70a59ac7ce2172410"}'}

In [2]:
!ls -lha kaggle.json
!pip install -q kaggle # installing the kaggle package
!mkdir -p ~/.kaggle # creating .kaggle folder where the key should be placed
!cp kaggle.json ~/.kaggle/ # move the key to the folder
!pwd # checking the present working directory

-rw-r--r-- 1 root root 74 Apr 30 16:15 kaggle.json
/content


In [3]:
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle datasets download -d asaniczka/1-3m-linkedin-jobs-and-skills-2024 -p /content/drive/MyDrive/

Downloading 1-3m-linkedin-jobs-and-skills-2024.zip to /content/drive/MyDrive
 99% 1.87G/1.88G [00:20<00:00, 143MB/s]
100% 1.88G/1.88G [00:20<00:00, 98.4MB/s]


In [5]:
!unzip /content/drive/MyDrive/1-3m-linkedin-jobs-and-skills-2024.zip -d /content/drive/MyDrive/

Archive:  /content/drive/MyDrive/1-3m-linkedin-jobs-and-skills-2024.zip
  inflating: /content/drive/MyDrive/job_skills.csv  
  inflating: /content/drive/MyDrive/job_summary.csv  
  inflating: /content/drive/MyDrive/linkedin_job_postings.csv  


In [6]:
import pandas as pd
import numpy as np
import os
!pip install pyspark
from pyspark.sql import SparkSession
import pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=68a4ebb17c9c35c521d402887a427383e337358cd41366f60c49049de88282f6
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


## Data preparation

In [8]:
spark = SparkSession.builder.appName("linkedin_project_SON").getOrCreate()

In [9]:
# import of the csv and selecting only the job_skills column
rdd_from_csv = spark.read.csv('./drive/MyDrive/job_skills.csv', header=True).dropna()
rdd_from_csv = rdd_from_csv.select(rdd_from_csv['job_skills']).rdd

In [10]:
# splitting every basket to create the item list
rdd_from_csv = rdd_from_csv.map(lambda basket: basket['job_skills'].split(', '))

In [11]:
# example of the first two baskets
rdd_from_csv.take(2)

[['Building Custodial Services',
  'Cleaning',
  'Janitorial Services',
  'Materials Handling',
  'Housekeeping',
  'Sanitation',
  'Waste Management',
  'Floor Maintenance',
  'Equipment Maintenance',
  'Safety Protocols',
  'Communication Skills',
  'Attention to Detail',
  'Physical Strength',
  'Experience in Housekeeping'],
 ['Customer service',
  'Restaurant management',
  'Food safety',
  'Training',
  'Supervision',
  'Scheduling',
  'Inventory',
  'Cost control',
  'Sales',
  'Communication',
  'Problemsolving',
  'Leadership',
  'Motivation',
  'Teamwork',
  'High School Diploma',
  "Bachelor's Degree",
  'ServSafe Certification',
  "Valid Driver's License",
  'Physical ability to perform job duties']]

In [12]:
# sampling the 10% of the whole dataset to compute easily the SON algorithm
rdd_son = rdd_from_csv.sample(withReplacement=False, fraction=0.01)
num_baskets = rdd_son.count()
num_baskets

12918

In [13]:
initial_partitions = rdd_son.getNumPartitions()
"""
Spark suggests to use 2-4 partitions for each CPU on the machine, since running the algorithm only on a single machine
I'll take this value from the cpu_count() function inside the multiprocessing library

https://spark.apache.org/docs/latest/rdd-programming-guide.html#parallelized-collections
"""
from multiprocessing import cpu_count
optimal_partitions = 4*cpu_count()

rdd_son.repartition(numPartitions=optimal_partitions)

print(f'Partitions before -> {initial_partitions}')
print(f'Optimal partitions -> {optimal_partitions}')

Partitions before -> 6
Optimal partitions -> 8


In [14]:
# define the support for each partition

support_threshold = round(0.02*num_baskets)
st_partition = round(support_threshold / rdd_son.getNumPartitions())

print(f'Support threshold for each partition/chunk of data is {st_partition}')

Support threshold for each partition/chunk of data is 43


In [15]:
def first_pass(partition, support: int) -> list:

  item_count = {}

  for basket in partition:
    for item in basket:
      item_count[item] = item_count.get(item, 0) + 1

  frequent_singleton = [(item, count) for item, count in item_count.items() if count >= support]

  return sorted(frequent_singleton, key=lambda x: x[1], reverse=True)

first_pass_rdd = rdd_son.mapPartitions(lambda partition: first_pass(partition, st_partition))

In [16]:
first_pass_rdd.take(10)

[('Communication', 676),
 ('Teamwork', 445),
 ('Leadership', 312),
 ('Customer service', 292),
 ('Communication skills', 233),
 ('Problemsolving', 190),
 ('Problem Solving', 183),
 ('Collaboration', 181),
 ('Customer Service', 180),
 ('Project Management', 165)]

In [17]:
frequent_singleton = first_pass_rdd.map(lambda item: item[0]).collect()
frequent_singleton[:10]

['Communication',
 'Teamwork',
 'Leadership',
 'Customer service',
 'Communication skills',
 'Problemsolving',
 'Problem Solving',
 'Collaboration',
 'Customer Service',
 'Project Management']

In [18]:
from itertools import combinations

def second_pass(partition, support: int) -> list:

  couples_count = {}

  for basket in partition:
    candidate_couples = list(combinations(basket, 2))
    for c in candidate_couples:

      if all(x in frequent_singleton for x in c):
          couples_count[c] = couples_count.get(c, 0) + 1

  frequent_couples = [(couple, count) for couple, count in couples_count.items() if count >= support]
  return sorted(frequent_couples, reverse=True, key=lambda x: x[1])

second_pass_rdd = rdd_son.mapPartitions(lambda partition: second_pass(partition, st_partition))

In [19]:
frequent_couples = second_pass_rdd.collect()

In [20]:
second_pass_rdd.take(10)

[(('Communication', 'Teamwork'), 203),
 (('Leadership', 'Communication'), 109),
 (('Communication', 'Problemsolving'), 104),
 (('Communication', 'Problem Solving'), 100),
 (('Communication', 'Leadership'), 92),
 (('Customer service', 'Communication'), 89),
 (('Customer service', 'Teamwork'), 80),
 (('Communication', 'Collaboration'), 76),
 (('Communication', 'Time Management'), 76),
 (('Leadership', 'Teamwork'), 75)]

In [21]:
def get_frequent_itemset(partition, support, frequent_itemset, n_pass):

  item_set_count = {}
  for basket in partition:

    candidate_itemset = list(combinations(basket, n_pass))
    for c in candidate_itemset:

      if n_pass > 2: subset = list(combinations(c, n_pass-1))
      else: subset = c

      # check if all the item of the candidate set are in the basket and if its subsets are contained in the previous frequent itemsets
      if all(x in frequent_itemset for x in subset):
        item_set_count[c] = item_set_count.get(c, 0) + 1

  new_frequent_itemset = [(itemset, count) for itemset, count in item_set_count.items() if count >= support]

  return sorted(new_frequent_itemset, reverse=True, key=lambda x: x[1])


In [22]:
# get frequent singleton

rdd_iter = []

n_pass = 1

rdd_frequent_itemset = rdd_son.mapPartitions(lambda partition: first_pass(partition, st_partition))
print(f'Pass {n_pass}')
print(rdd_frequent_itemset.take(10))
rdd_iter.append(rdd_frequent_itemset)

frequent_itemset = rdd_frequent_itemset.map(lambda item: item[0]).collect()
frequent_itemset_len = rdd_frequent_itemset.count()
n_pass = 2

checkEmpty = True

while len(frequent_itemset) > 0:

  rdd_frequent_itemset = rdd_son.mapPartitions(lambda partition: get_frequent_itemset(partition, st_partition, set(frequent_itemset), n_pass))
  rdd_iter.append(rdd_frequent_itemset)
  print(f'Pass {n_pass}')
  print(rdd_frequent_itemset.take(10))

  frequent_itemset = [itemset[0] for itemset in rdd_frequent_itemset.collect()]

  n_pass += 1



Pass 1
[('Communication', 676), ('Teamwork', 445), ('Leadership', 312), ('Customer service', 292), ('Communication skills', 233), ('Problemsolving', 190), ('Problem Solving', 183), ('Collaboration', 181), ('Customer Service', 180), ('Project Management', 165)]
Pass 2
[(('Communication', 'Teamwork'), 203), (('Leadership', 'Communication'), 109), (('Communication', 'Problemsolving'), 104), (('Communication', 'Problem Solving'), 100), (('Communication', 'Leadership'), 92), (('Customer service', 'Communication'), 89), (('Customer service', 'Teamwork'), 80), (('Communication', 'Collaboration'), 76), (('Communication', 'Time Management'), 76), (('Leadership', 'Teamwork'), 75)]
Pass 3
[(('Customer service', 'Communication', 'Teamwork'), 57), (('Customer service', 'Communication', 'Problemsolving'), 56), (('Customer service', 'Communication', 'Problemsolving'), 48), (('Customer service', 'Communication', 'Teamwork'), 48)]
Pass 4
[]


In [23]:
rdd_iter[2]

PythonRDD[35] at collect at <ipython-input-22-ad2c723ee76d>:25

In [24]:
def get_couples_support(couple, frequent_couples):
  for c in frequent_couples:
    if list(c[0]) == couple:
      return c[1]

In [25]:
def compute_association_rules(frequent_itemset_support, frequent_couples):
  for frequent_itemset in frequent_itemset_support[:5]:
    for item in frequent_itemset[0]:
      fi = list(frequent_itemset[0])

      fi.remove(item)
      couple_support = get_couples_support(fi, frequent_couples)
      support_with_item = frequent_itemset[1]

      confidence = round((support_with_item / couple_support)*100, 1)

      print(f'{fi} --> {item} with {confidence}%')


In [26]:
# TODO reduce to all frquent couples from all the partitions (maintain unique frequent couples)
frequent_triplets = rdd_iter[2].collect()
compute_association_rules(frequent_triplets, frequent_couples)

['Communication', 'Teamwork'] --> Customer service with 28.1%
['Customer service', 'Teamwork'] --> Communication with 71.2%
['Customer service', 'Communication'] --> Teamwork with 64.0%
['Communication', 'Problemsolving'] --> Customer service with 53.8%
['Customer service', 'Problemsolving'] --> Communication with 76.7%
['Customer service', 'Communication'] --> Problemsolving with 62.9%
['Communication', 'Problemsolving'] --> Customer service with 46.2%
['Customer service', 'Problemsolving'] --> Communication with 65.8%
['Customer service', 'Communication'] --> Problemsolving with 53.9%
['Communication', 'Teamwork'] --> Customer service with 23.6%
['Customer service', 'Teamwork'] --> Communication with 60.0%
['Customer service', 'Communication'] --> Teamwork with 53.9%
