<a href="https://colab.research.google.com/github/ranieri-unimi/faces-malchiodi-2022/blob/main/faces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### run once

In [None]:
!pip install pyspark
!pip install findspark

In [None]:
import os
# NOTE(review): an API key is embedded in the notebook (only obfuscated as an
# integer). It should be rotated and supplied from outside the notebook.
# Credentials already exported in the environment take precedence; the embedded
# pair is used only as a fallback when a complete pair is not provided.
if not (os.environ.get("KAGGLE_USERNAME") and os.environ.get("KAGGLE_KEY")):
    os.environ["KAGGLE_USERNAME"] = 'ranieriunimi'
    # hex digits of the integer, without the leading "0x"
    os.environ["KAGGLE_KEY"] = format(232307088475198570779809482024044346960, 'x')

In [None]:
ref = 'bwandowando/ukraine-russian-crisis-twitter-dataset-1-2-m-rows'
!mkdir datasets
!kaggle datasets download $ref --unzip -p ./datasets

## ukraine

In [None]:
# Number of tweets randomly drawn from the English subset (see the random.sample cell below).
SAMPLE_SIZE = 510

In [None]:
import numpy as np
import pandas as pd
import csv
import re
import string
import random

In [None]:
import pyspark
import findspark

In [None]:
# Do not truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Read the gzip-compressed CSV from the download folder; the first column is the index.
filename = r"./datasets/UkraineCombinedTweetsDeduped20220227-131611.csv.gzip"
df = pd.read_csv(
    filename,
    index_col=0,
    encoding='utf-8',
    compression='gzip',
    quoting=csv.QUOTE_ALL,
)

## data cleaning

In [None]:
# Keep the text of English-language tweets only; from here on `df` is a plain list of strings.
df = df.loc[df.language == 'en', 'text'].tolist()

In [None]:
# Draw SAMPLE_SIZE tweets at random. random.sample raises ValueError when the
# requested size is larger than the population (or negative); in that case the
# full list is kept. Any other error is no longer silently swallowed.
try:
    df = random.sample(df, SAMPLE_SIZE)
except ValueError:
    pass

In [None]:
# Prepend two identical synthetic documents, so the first two positions of the
# list hold a known pair of duplicates.
df[:0] = ['lorem ipsum dolor sit amet'] * 2

In [None]:
# Peek at the first five documents of the list.
df[:5]

In [None]:
def preprocess_tweet_text(tweet):
    """Normalise the text of one indexed tweet.

    Parameters
    ----------
    tweet : tuple
        A ``(index, text)`` pair.

    Returns
    -------
    tuple
        ``(index, cleaned_text)`` where the text is lower-cased, stripped of
        URL-like tokens, has punctuation replaced by spaces, and has runs of
        whitespace collapsed to single spaces.
    """
    doc_id, text = tweet
    lowered = text.lower()
    # drop anything that looks like a link
    without_urls = re.sub(r"http\S+|www\S+|https\S+", '', lowered, flags=re.MULTILINE)
    # ASCII punctuation plus a few typographic characters, each mapped to a space
    to_blank = string.punctuation + '…’”“'
    spaced = without_urls.translate(str.maketrans(to_blank, ' ' * len(to_blank)))
    # split() with no argument also discards leading/trailing whitespace
    return (doc_id, ' '.join(spaced.split()))

## spark instance

In [None]:
from pyspark.sql import SparkSession

# Get (or reuse) a Spark session with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
# Handle used below to build RDDs.
sc = spark.sparkContext

In [None]:
# Distribute the documents as (position in list, text) pairs; the position serves as document id.
rdd = sc.parallelize(enumerate(df))

In [None]:
# Show the first seven (id, text) pairs.
rdd.take(7)

In [None]:
# Apply the text normalisation to every (id, text) pair.
dataPipe = rdd.map(preprocess_tweet_text)

In [None]:
# Show the first seven pairs after cleaning.
dataPipe.take(7)

In [None]:
# indexing = dataset.flatMap(lambda _, v : [(e,e) for e in v]).reduceByKey(lambda k, v : k)

### foos

In [None]:
!pip install crc16
import crc16
import binascii

### shingles

In [None]:
# Shingle length, in characters.
SH_LEN = 5

In [None]:
def _shingles_of(doc):
    """Return the (doc_id, shingle) pairs for every SH_LEN-character window of the text.

    `doc` is a tuple whose first item is the document id and whose last item is
    the cleaned text. A text of length n has n - SH_LEN + 1 windows; the
    previous range stopped one short and dropped the final shingle. Texts
    shorter than SH_LEN yield no shingles.
    """
    doc_id, text = doc[0], doc[-1]
    return [(doc_id, text[i:i+SH_LEN]) for i in range(len(text) - SH_LEN + 1)]

# Distinct (doc_id, shingle) pairs across the whole corpus.
shigles = dataPipe.flatMap(_shingles_of).distinct()

In [None]:
# Inspect the shingles of the documents with id 0 and 1.
shigles.filter(lambda x : x[0]<2).collect()

In [None]:
# def shingling(x):
#   k, v = x
#   shingles = []
#   for i in range(len(v) - SH_LEN):
#     shingle = v[i:i+SH_LEN]
#     bshingle = bytes(shingle, 'utf-8') 
#     hshingle = binascii.crc32(bshingle) # 4 bytes
#     # hshingle = crc16.crc16xmodem(bshingle) # 2 bytes
#     shingles.append( (hshingle, k) )
#   return shingles

In [None]:
def hashval(x):
    """Return the CRC-32 checksum of the UTF-8 encoding of string `x`.

    The result is an unsigned integer that fits in 4 bytes.
    """
    return binascii.crc32(bytes(x, 'utf-8'))

In [None]:
# Replace every shingle by its CRC-32 hash: distinct (doc_id, hashed shingle) pairs.
charactMx = shigles.mapValues(hashval).distinct()

In [None]:
# charactMx = dataPipe.flatMap(shingling).distinct()

In [None]:
# Inspect the hashed shingles of the documents with id 0 and 1.
charactMx.filter(lambda x : x[0] < 2).collect()

### MinHashing

In [None]:
def RN(stop):
    """Return a random integer in ``range(stop)``."""
    return random.randrange(stop)


def RB(B = 4):
    """Return a random odd integer below ``2**(8*B)`` (B bytes wide)."""
    half_range = 2**(8*B-1)
    # doubling and adding one forces the result to be odd
    return RN(half_range)*2 + 1


def PF(x, scale, shift, base):
    """Return the affine map ``(scale * x + shift) mod base``."""
    affine = scale * x + shift
    return affine % base

In [None]:
import random
# n_permfoo: how many hash functions are generated for the signature;
# b_bands: modulus used to assign each hash function to a band.
n_permfoo, b_bands = 4, 2

# (1/b)^(1/r), with r = n_permfoo // b_bands hash functions per band
(1 / b_bands)**(1 / (n_permfoo//b_bands)) # threshold

In [None]:
def _make_perm_foo(base = 2**(8*4)):
    """Build one hash function x -> (scale * x + shift) mod base.

    The random odd coefficients are drawn once, here, and captured by the
    returned closure. Previously RB() was called inside the lambda, so every
    single call used new coefficients and the same shingle could map to
    different values for the same hash-function index.
    """
    scale, shift = RB(), RB()
    return lambda x : PF(x, scale, shift, base)

# (index, hash function) pairs; each function has its own fixed coefficients.
perm_foos = [(i, _make_perm_foo()) for i in range(n_permfoo)]

In [None]:
def gen_perm(x):
    """Apply every hash function in `perm_foos` to one (doc, shingle) pair.

    Returns a list with one ``((doc, h, shingle), value)`` entry per hash
    function, where `h` is the function's index and `value` is the function
    applied to the shingle.
    """
    doc, shingle = x
    permuted = []
    for h, pf in perm_foos:
        permuted.append(((doc, h, shingle), pf(shingle)))
    return permuted

In [None]:
# MinHash signature: for every (doc, hash function) keep the one shingle whose
# permuted value is smallest. The key must not contain the shingle itself,
# otherwise every key is unique, reduceByKey(min) reduces nothing and all
# shingles survive.
minHashSignMx = (
    charactMx
    .flatMap(gen_perm)                                        # ((doc, h, shingle), permuted)
    .map(lambda x : ((x[0][0], x[0][1]), (x[1], x[0][2])))    # ((doc, h), (permuted, shingle))
    .reduceByKey(min)                                         # smallest permuted value wins
    .mapValues(lambda pv : pv[1])                             # keep the winning shingle
)
# (doc, hash) , shingle

In [None]:
# Inspect the entries for documents 0 and 1, restricted to hash functions 0 and 1.
minHashSignMx.filter(lambda x : x[0][0] < 2 and x[0][1] < 2).collect()

### LSH

In [None]:
# Two random draws in range(b_bands); only `shift` is used by bandmap below.
scale = RN(b_bands)
shift = RN(b_bands)

def bandmap(x):
    """Map a hash-function index to its band: (x + shift) mod b_bands."""
    return PF(x, 1, shift, b_bands)

In [None]:
# Show the shift that was drawn for the band mapping.
shift

In [None]:
def band_expand(x):
    """Re-key a signature entry by band.

    Takes ``((doc, h), v)`` and returns ``((doc, band), (h, v))`` where
    ``band = bandmap(h)``.
    """
    key, v = x
    doc, h = key
    band = bandmap(h)
    return ((doc, band), (h, v))

def band_reduct(x):
    """Turn one document band into a bucket key.

    Takes ``((doc, band), values)`` and returns ``((bucket, band), doc)`` where
    ``bucket`` is the hash of the values taken as a tuple.
    """
    key, a = x
    doc, band = key
    bucket = hash(tuple(a))
    return ((bucket, band), doc)

In [None]:
# Intermediate check for documents 0 and 1: per (doc, band), the (h, value) pairs sorted by h.
minHashSignMx.map(band_expand).groupByKey().mapValues(list).mapValues(lambda a : sorted(a, key=lambda x: x[0])).filter(lambda x : x[0][0]<2).collect()

In [None]:
# Same check, with the h index dropped after sorting so only the values remain per (doc, band).
minHashSignMx.map(band_expand).groupByKey().mapValues(list).mapValues(lambda a : sorted(a, key=lambda x: x[0])).mapValues(lambda a : [v for _, v in a] ).filter(lambda x : x[0][0]<2).collect()

  # .groupByKey().mapValues(list)

In [None]:
# Bucket documents band by band:
#   1. re-key every signature entry by (doc, band)
#   2. per (doc, band), order the entries by hash-function index and keep the values
#   3. hash that value list into a bucket id -> ((bucket, band), doc)
#   4. collect the documents that share a (bucket, band)
bandMx = (
    minHashSignMx
    .map(band_expand)
    .groupByKey()
    .mapValues(lambda hv : [v for _, v in sorted(hv, key=lambda p: p[0])])
    .map(band_reduct)
    .groupByKey()
    .mapValues(list)
)

In [None]:
# Buckets holding more than one document, i.e. documents that collide in at least one band.
bandMx.filter(lambda x : len(x[1])>1).collect()