<a href="https://colab.research.google.com/github/prometheus404/AMD_project/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
##### DOWNLOAD DATASET #####
from IPython.display import clear_output
from google.colab import files
files.upload()
!ls -lha kaggle.json
!pip install -q kaggle # installing the kaggle package
!mkdir -p ~/.kaggle # creating .kaggle folder where the key should be placed
!cp kaggle.json ~/.kaggle/ # move the key to the folder
!pwd # checking the present working directory
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d gsimonx37/letterboxd
!unzip /content/letterboxd.zip -d /content/letterbox/
clear_output()
############################

In [3]:
########### SPARK CONTEXT #####################
import pandas as pd
import itertools
from tqdm import tqdm

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better

sc = spark.sparkContext
###############################################

In [4]:
############## BASKET CREATION ################
#TODO do it using SPARK directly -> cars = spark.read.csv('cars.csv', header=True, sep=";")
actors = pd.read_csv("letterbox/actors.csv")
actors = actors
baskets = actors.groupby("id")["name"].apply(list)
baskets = baskets.sample(30000)

print("number of baskets: " + str(len(baskets)))
print("biggest basket: " + str(baskets.map(len).max()))
print(baskets)
baskets_RDD = sc.parallelize(baskets).cache()
##############################################

number of baskets: 30000
biggest basket: 348
id
1123605    [Jacques Mercier, Jeremy Paris, Jean-Marie Jua...
1107285    [Reiko Yasuhara, Hōsei Komatsu, Hidemi Maruyam...
1526765       [Paulo Cruz, Mónica Monteiro, Amílcar Bonança]
1742527    [Michael Barrow Smith, Monica West, Claire Wel...
1062107    [Erik Frey, Otto Wögerer, Franz Messner, Ernst...
                                 ...                        
1423604                      [Joshua Womble, Marija Velkova]
1317548    [Marcela Franco, Jacqueline Lustig, Cristian B...
1265976                                     [Bettany Hughes]
1081469                        [Tijn Winters, Jetske Lieber]
1343689    [Georg Schmieter, Mia Pankau, Harry Liedtke, A...
Name: name, Length: 30000, dtype: object


In [5]:
test_basket = [
    ['a','b','c','d','e','f','g','h','i','l'],
    ['n','o'],
    ['o','n'],
    ['a','b','c','d','e','f',],
    ['a','b','c'],
    ['f','q'],
    ['f','q','z'],
    ['f','u','q'],
    ['n','o','i'],
    ['a','b','c','d','e','f','q','p','o','n'],
    ['a','b','c','d','e','f','r','s','t','u'],
]
test_RDD = sc.parallelize(test_basket).cache()

In [6]:
## APRIORI
def apriori(chunk, s, tot_bsk):

  n_bsk = 0 # number of baskets in chunk used to calculate support
#  trsl = dict() # using a translation could be better for the combination step
  count = dict() # should be relatively efficient since is a hash table

#
#  def translate(e, dic):
#    try:
#      return dic[e]
#    except KeyError:
#      dic[e] = len(dic)+1
#      return dic[e]
  def incr(e,dic):
    try:
      dic[e] += 1;
    except KeyError:
      dic[e] = 1

  chunk = list(chunk) # to avoid consuming after first pass
  ### first pass
  for bsk in chunk:
    n_bsk += 1;
    #elements = [translate(x,trsl) for x in bsk]
    for b in bsk:
      incr(b,count)

  ### filter
  to_del = []
  frequent = []
  for c in count:
    if count[c] >= s* n_bsk/tot_bsk:
      frequent.append(c)

  k = 2
  while len(frequent) > 0:
    count = dict()

    for bsk in tqdm(chunk):
      #filter only members of C_k
      filtered = set()
      for b in bsk:
        for f in frequent:
          if b in f:
            filtered.add(b)
            break;
      bsk = list(filtered)
      bsk.sort()
      for tpl in itertools.combinations(bsk, k):
          incr(tuple(tpl), count)

    frequent = []
    for c in count:
      if count[c] >= s * n_bsk/tot_bsk:
        frequent.append(c)
        yield(c,1)
    k+=1

#SON second map
def second_map(chunk,itemset,s):
  def incr(e,dic):
    try:
      dic[e] += 1;
    except KeyError:
      dic[e] = 1

  count = dict()
  for bsk in tqdm(chunk):
    for i in itemset:
      flag = True
      for e in i:
        if e not in bsk:
          flag = False
          break
      if flag:
        incr(i,count)
  for c in count:
    yield (c,count[c])

In [7]:
S =  5; #support threshold (maybe a fraction is better)
TOT = len(test_basket)#len(baskets)

real = []
for a in apriori(test_basket,S,TOT):
  real.append(a[0])
print(real)

100%|██████████| 11/11 [00:00<00:00, 59608.97it/s]
100%|██████████| 11/11 [00:00<00:00, 26607.46it/s]
100%|██████████| 11/11 [00:00<00:00, 93206.76it/s]

[('a', 'b'), ('a', 'c'), ('b', 'c'), ('a', 'b', 'c')]





In [8]:
#SON first step
candidates = test_RDD.mapPartitions(lambda x: apriori(x,S,TOT)).reduceByKey(lambda a,b: a).keys().collect()
print(candidates)

[('b', 'c'), ('a', 'b', 'c'), ('f', 'q'), ('a', 'b'), ('a', 'c')]


In [9]:
#SON second step
real_from_candidates = test_RDD.mapPartitions(lambda x: second_map(x, candidates,S)).reduceByKey(lambda a,b: a+b).filter(lambda a: a[1] >= S).keys().collect()
print(real_from_candidates)

[('b', 'c'), ('a', 'b', 'c'), ('a', 'b'), ('a', 'c')]


In [10]:
not_real = [x for x in candidates if x not in real]
not_present = [x for x in real if x not in candidates]
print("candidates not in real set:" + str(not_real))
print("real itemset not present in candidates:" + str(not_present))
f1 = [x for x in real_from_candidates if x not in real]
f2 = [x for x in real if x not in real_from_candidates]
print("diff between final and real:"+ str(f1) + str(f2))


candidates not in real set:[('f', 'q')]
real itemset not present in candidates:[]
diff between final and real:[][]


In [11]:
def son(rdd,s,tot):
  #FIRST STEP
  candidates = rdd.mapPartitions(lambda x: apriori(x,s,tot)).reduceByKey(lambda a,b: a).keys().collect()
  #SECOND STEP
  return rdd.mapPartitions(lambda x: second_map(x, candidates,s)).reduceByKey(lambda a,b: a+b).filter(lambda a: a[1] >= S).keys().collect()

son(test_RDD,S,TOT)

[('b', 'c'), ('a', 'b', 'c'), ('a', 'b'), ('a', 'c')]

In [12]:
son(baskets_RDD, 6, len(baskets))

[('Jaroslav Tomsa', 'Zdeněk Srstka'),
 ('Karel Engel', 'Zdeněk Srstka'),
 ('Kusuo Abe', 'Yoshio Yoshida'),
 ('Innocent', 'Sankaradi'),
 ('Dal McKennon', 'Grace Stafford'),
 ('Manorama', 'Nagesh'),
 ('Jacques-Yves Cousteau', 'Philippe Cousteau Jr.'),
 ("Allen 'Farina' Hoskins", "Bobby 'Wheezer' Hutchins"),
 ("Allen 'Farina' Hoskins", 'Mary Ann Jackson'),
 ("Bobby 'Wheezer' Hutchins", 'Mary Ann Jackson'),
 ("Allen 'Farina' Hoskins", "Bobby 'Wheezer' Hutchins", 'Mary Ann Jackson'),
 ('Anton Soini', 'Arvo Kuusla'),
 ('Brad Heller', 'David A.R. White'),
 ('Andreas Kern', 'Egon Biscan'),
 ('Hans Stadlbauer', 'Kathi Leitner'),
 ('Andreas Kern', 'Egon Biscan', 'Hans Stadlbauer'),
 ('Dharmendra', 'Hema Malini'),
 ('George Harrison', 'Paul McCartney'),
 ('George Harrison', 'Paul McCartney', 'Ringo Starr'),
 ('Erich Seyfried', 'Peter Steiner'),
 ('Gerda Steiner', 'Gerda Steiner-Paltzer'),
 ('Gerda Steiner', 'Petra Auer'),
 ('Gerda Steiner-Paltzer', 'Petra Auer'),
 ('Erich Seyfried', 'Gerda Steine

In [13]:
for a in apriori(baskets,6,len(baskets)):
  print(a)

100%|██████████| 30000/30000 [02:18<00:00, 215.97it/s]


(('Jaroslav Tomsa', 'Zdeněk Srstka'), 1)
(('Innocent', 'Sukumari'), 1)
(('Nedumudi Venu', 'Sukumari'), 1)
(('Sankaradi', 'Sukumari'), 1)
(('Oskar Sima', 'Paul Hörbiger'), 1)
(('Dal McKennon', 'Grace Stafford'), 1)
(('Manorama', 'Nagesh'), 1)
(('Jacques-Yves Cousteau', 'Philippe Cousteau Jr.'), 1)
(("Allen 'Farina' Hoskins", "Bobby 'Wheezer' Hutchins"), 1)
(('Anton Soini', 'Arvo Kuusla'), 1)
(('Andreas Kern', 'Egon Biscan'), 1)
(('Andreas Kern', 'Hans Stadlbauer'), 1)
(('Andreas Kern', 'Kathi Leitner'), 1)
(('Hans Stadlbauer', 'Kathi Leitner'), 1)
(('Dharmendra', 'Hema Malini'), 1)
(('Daws Butler', 'Don Messick'), 1)
(('George Harrison', 'Paul McCartney'), 1)
(('George Harrison', 'Ringo Starr'), 1)
(('Paul McCartney', 'Ringo Starr'), 1)
(('Bill Gaither', 'Gloria Gaither'), 1)
(('Erich Seyfried', 'Gerda Steiner'), 1)
(('Erich Seyfried', 'Gerda Steiner-Paltzer'), 1)
(('Erich Seyfried', 'Peter Steiner'), 1)
(('Erich Seyfried', 'Petra Auer'), 1)
(('Gerda Steiner', 'Gerda Steiner-Paltzer'), 

100%|██████████| 30000/30000 [00:03<00:00, 8545.56it/s]


(('Andreas Kern', 'Hans Stadlbauer', 'Kathi Leitner'), 1)
(('George Harrison', 'Paul McCartney', 'Ringo Starr'), 1)
(('Erich Seyfried', 'Gerda Steiner', 'Gerda Steiner-Paltzer'), 1)
(('Erich Seyfried', 'Gerda Steiner', 'Peter Steiner'), 1)
(('Erich Seyfried', 'Gerda Steiner', 'Petra Auer'), 1)
(('Erich Seyfried', 'Gerda Steiner-Paltzer', 'Peter Steiner'), 1)
(('Erich Seyfried', 'Gerda Steiner-Paltzer', 'Petra Auer'), 1)
(('Erich Seyfried', 'Peter Steiner', 'Petra Auer'), 1)
(('Gerda Steiner', 'Gerda Steiner-Paltzer', 'Peter Steiner'), 1)
(('Gerda Steiner', 'Gerda Steiner-Paltzer', 'Petra Auer'), 1)
(('Gerda Steiner', 'Peter Steiner', 'Petra Auer'), 1)
(('Gerda Steiner-Paltzer', 'Peter Steiner', 'Petra Auer'), 1)
(('Eddie Vedder', 'Jeff Ament', 'Matt Cameron'), 1)
(('Eddie Vedder', 'Jeff Ament', 'Mike McCready'), 1)
(('Eddie Vedder', 'Jeff Ament', 'Stone Gossard'), 1)
(('Eddie Vedder', 'Matt Cameron', 'Mike McCready'), 1)
(('Eddie Vedder', 'Matt Cameron', 'Stone Gossard'), 1)
(('Eddie V

100%|██████████| 30000/30000 [00:01<00:00, 18038.72it/s]


(('Erich Seyfried', 'Gerda Steiner', 'Gerda Steiner-Paltzer', 'Peter Steiner'), 1)
(('Erich Seyfried', 'Gerda Steiner', 'Gerda Steiner-Paltzer', 'Petra Auer'), 1)
(('Erich Seyfried', 'Gerda Steiner', 'Peter Steiner', 'Petra Auer'), 1)
(('Erich Seyfried', 'Gerda Steiner-Paltzer', 'Peter Steiner', 'Petra Auer'), 1)
(('Gerda Steiner', 'Gerda Steiner-Paltzer', 'Peter Steiner', 'Petra Auer'), 1)
(('Eddie Vedder', 'Jeff Ament', 'Matt Cameron', 'Mike McCready'), 1)
(('Eddie Vedder', 'Jeff Ament', 'Matt Cameron', 'Stone Gossard'), 1)
(('Eddie Vedder', 'Jeff Ament', 'Mike McCready', 'Stone Gossard'), 1)
(('Eddie Vedder', 'Matt Cameron', 'Mike McCready', 'Stone Gossard'), 1)
(('Jeff Ament', 'Matt Cameron', 'Mike McCready', 'Stone Gossard'), 1)
(('George Harrison', 'John Lennon', 'Paul McCartney', 'Ringo Starr'), 1)
(('Boom Gaspar', 'Eddie Vedder', 'Jeff Ament', 'Matt Cameron'), 1)
(('Boom Gaspar', 'Eddie Vedder', 'Jeff Ament', 'Mike McCready'), 1)
(('Boom Gaspar', 'Eddie Vedder', 'Jeff Ament', '

100%|██████████| 30000/30000 [00:01<00:00, 27326.36it/s]


(('Erich Seyfried', 'Gerda Steiner', 'Gerda Steiner-Paltzer', 'Peter Steiner', 'Petra Auer'), 1)
(('Eddie Vedder', 'Jeff Ament', 'Matt Cameron', 'Mike McCready', 'Stone Gossard'), 1)
(('Boom Gaspar', 'Eddie Vedder', 'Jeff Ament', 'Matt Cameron', 'Mike McCready'), 1)
(('Boom Gaspar', 'Eddie Vedder', 'Jeff Ament', 'Matt Cameron', 'Stone Gossard'), 1)
(('Boom Gaspar', 'Eddie Vedder', 'Jeff Ament', 'Mike McCready', 'Stone Gossard'), 1)
(('Boom Gaspar', 'Eddie Vedder', 'Matt Cameron', 'Mike McCready', 'Stone Gossard'), 1)
(('Boom Gaspar', 'Jeff Ament', 'Matt Cameron', 'Mike McCready', 'Stone Gossard'), 1)


100%|██████████| 30000/30000 [00:00<00:00, 57867.61it/s]


(('Boom Gaspar', 'Eddie Vedder', 'Jeff Ament', 'Matt Cameron', 'Mike McCready', 'Stone Gossard'), 1)


100%|██████████| 30000/30000 [00:00<00:00, 139228.16it/s]


In [104]:
baskets

id
1195602                       [Bernardo Rocha, Luix Gabriel]
1109331    [William Hauber, Phillip Tyron, Glen Cavender,...
1562029                               [Christopher Sherwood]
1093613                         [Otto Schily, Ursula Lefkes]
1449148    [Kenne Duncan, Tex Palmer, Carl Mathews, Chick...
                                 ...                        
1320411    [Saeed Jaffrey, Dinsdale Landen, David Max Vau...
1170424    [Han Yoo-jeong, Gi So-you, Kim Jae-chul, Baek ...
1743383    [Jan Anderson, Anya O'Callaghan, Jake Sawyers,...
1559554    [Parvateesam, Swati Dixit, Rocket Raghava, Lir...
1673766           [Jeong Jun-ho, Kim Soo-mi, Shin Hyun-joon]
Name: name, Length: 30000, dtype: object

In [115]:
sum([1 for x in baskets if 'Angelina Jolie' in x])

5