<a href="https://colab.research.google.com/github/ranieri-unimi/faces-malchiodi-2022/blob/main/faces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup (run once per environment)

In [None]:
!pip install pyspark
!pip install findspark

In [None]:
import os

# SECURITY NOTE(review): the Kaggle API key below is only obfuscated (an
# integer rendered as hex), not protected. Anyone with this notebook can
# recover it. Prefer supplying KAGGLE_USERNAME / KAGGLE_KEY from the
# environment (or Colab secrets) and rotating this key.
#
# `setdefault` lets credentials already present in the environment take
# precedence instead of being silently overwritten by the embedded ones.
os.environ.setdefault("KAGGLE_USERNAME", 'ranieriunimi')
# format(..., "032x") yields the same 32 hex digits as hex(...)[2:] here, but
# would also keep leading zeros if the key happened to start with them.
os.environ.setdefault("KAGGLE_KEY", format(232307088475198570779809482024044346960, "032x"))

In [None]:
ref = 'bwandowando/ukraine-russian-crisis-twitter-dataset-1-2-m-rows'
!mkdir datasets
!kaggle datasets download $ref --unzip -p ./datasets

## Ukraine tweets dataset

In [None]:
# Number of tweets randomly drawn from the English subset before the Spark
# pipeline runs (used by the `random.sample` cell below).
SAMPLE_SIZE = 2048

In [None]:
import numpy as np
import pandas as pd
import csv
import re
import string
import random

In [None]:
import pyspark
import findspark

In [None]:
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Load one daily dump of the tweet dataset. The file extension is `.gzip`, so
# the compression is passed explicitly; the first CSV column becomes the index.
filename = r"./datasets/UkraineCombinedTweetsDeduped20220227-131611.csv.gzip"
df = pd.read_csv(
    filename,
    compression='gzip',
    index_col=0,
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
)

## data cleaning

In [None]:
# Keep only the tweets tagged as English and reduce the frame to a plain
# Python list of their texts. From here on `df` is a list, not a DataFrame,
# so this cell cannot be re-run without reloading the dataset first.
df = df.loc[df.language == 'en', 'text'].tolist()

In [None]:
# Prepend two identical texts to the front of the list, giving the data a
# known pair of exact duplicates.
df[:0] = ['lorem ipsum dolor sit amet'] * 2

In [None]:
# Number of texts currently held in the list (English tweets plus the two
# inserted duplicates).
len(df)

In [None]:
# Down-sample to SAMPLE_SIZE texts, drawn without replacement.
# When the list holds fewer than SAMPLE_SIZE texts it is left untouched; this
# is the only case the former bare `except: pass` was meant to cover, and an
# explicit bounds check no longer hides unrelated errors (e.g. a wrong type).
if 0 <= SAMPLE_SIZE <= len(df):
    df = random.sample(df, SAMPLE_SIZE)

In [None]:
def preprocess_tweet_text(tweet):
    """Normalise one indexed tweet.

    Args:
        tweet: an ``(index, text)`` pair.

    Returns:
        ``(index, cleaned_text)`` where the text has been lower-cased, had
        URL-like tokens removed, punctuation turned into spaces and runs of
        whitespace collapsed to single spaces.
    """
    index, text = tweet

    # ASCII punctuation plus four typographic marks, each mapped to a space.
    to_space = str.maketrans(string.punctuation+'…’”“', ' '*(len(string.punctuation)+4))

    # Strip links first so their punctuation does not leave word fragments.
    without_links = re.sub(r"http\S+|www\S+|https\S+", '', text.lower(), flags=re.MULTILINE)

    words = without_links.translate(to_space).strip().split()
    return (index, ' '.join(words))

## Spark instance (local mode)

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session on the local machine; "local[*]" is the
# master URL passed to the builder.
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Low-level context used below to build RDDs.
sc = spark.sparkContext

In [None]:
# Distribute the texts as (position in list, text) pairs; the position is the
# document id carried through the rest of the pipeline.
rdd = sc.parallelize(enumerate(df))

In [None]:
# Apply the text normalisation to every (index, text) pair.
dataPipe = rdd.map(preprocess_tweet_text)

In [None]:
# Preview the first ten cleaned (index, text) pairs.
dataPipe.take(10)

In [None]:
# indexing = dataset.flatMap(lambda _, v : [(e,e) for e in v]).reduceByKey(lambda k, v : k)

### foos (hashing libraries and helper functions)

In [None]:
!pip install crc16
import crc16
import binascii

### shingles

In [None]:
# Shingle length: number of consecutive characters per shingle.
SH_LEN = 5

In [None]:
# Distinct (shingle text, document index) pairs for every cleaned tweet.
# The range ends at len - SH_LEN + 1 so the shingle that finishes on the last
# character is included (the previous bound dropped it, and produced nothing
# at all for texts of exactly SH_LEN characters). Texts shorter than SH_LEN
# still yield no shingles.
shigles = dataPipe.flatMap(
    lambda x: [(x[-1][i:i+SH_LEN], x[0]) for i in range(len(x[-1]) - SH_LEN + 1)]
).distinct()

In [None]:
def shingling(x, sh_len=None):
  """Hash every character shingle of one document.

  Args:
    x: a ``(k, v)`` pair, where ``v`` is the text and ``k`` is the key
      attached to every emitted shingle.
    sh_len: shingle length in characters. Defaults to the notebook-level
      ``SH_LEN`` when omitted.

  Returns:
    A list of ``(crc32_of_shingle, k)`` pairs, one per window position in
    left-to-right order. Empty when the text is shorter than ``sh_len``.
  """
  k, v = x
  if sh_len is None:
    sh_len = SH_LEN
  # `+ 1` keeps the window that ends on the last character; without it the
  # final shingle was dropped and texts of exactly `sh_len` characters
  # produced no shingles at all.
  # crc32 gives a 4-byte hash; crc16.crc16xmodem would be the 2-byte alternative.
  return [
    (binascii.crc32(bytes(v[i:i+sh_len], 'utf-8')), k)
    for i in range(len(v) - sh_len + 1)
  ]

In [None]:
charactMx = dataPipe.flatMap(shingling).distinct()

### MinHashing

In [None]:
def RN(stop):
  """Return a random integer drawn by ``random.randrange(stop)``."""
  return random.randrange(stop)

def RB(B = 4):
  """Return a random odd integer that fits in ``B`` bytes."""
  half_range = 2**(8*B-1)
  # Doubling and adding one forces the lowest bit to 1.
  return RN(half_range)*2+1

def HF(x, scale, shift, base):
  """Affine hash: ``(scale * x + shift)`` reduced modulo ``base``."""
  linear = scale * x + shift
  return linear % base

In [None]:
import random
# MinHash / LSH sizing: total number of hash functions and number of bands.
n_hashfoo = 120
b_bands = 40

# Displayed value: (1/b)^(1/r), with r = n_hashfoo // b_bands rows per band.
(1 / b_bands) ** (1 / (n_hashfoo // b_bands))  # threshold

In [None]:
def _make_hash_foo(scale, shift, base=2**(8*4)):
  """Build one hash function h(x) = (scale * x + shift) % base.

  The coefficients are bound when the function is created, so the returned
  callable gives the same output for the same input on every call.
  """
  return lambda x : HF(x, scale, shift, base)

# List of (index, hash function) pairs. Coefficients are drawn ONCE per
# function here; previously RB() sat inside the lambda body and was
# re-evaluated on every call, so a "hash function" returned a different random
# value each time it saw the same shingle and the signatures carried no
# similarity information.
hash_foos = [(i, _make_hash_foo(RB(), RB())) for i in range(n_hashfoo)]

In [None]:
def _minhash_rows(entry):
  """Apply every hash function to one (shingle hash, document) entry."""
  shingle_hash, doc = entry[0], entry[-1]
  return [((i, doc), hf(shingle_hash)) for i, hf in hash_foos]

# Signature matrix: for each (hash function index, document) key keep the
# minimum hash value over all of the document's shingles.
minHashSignMx = charactMx.flatMap(_minhash_rows).reduceByKey(min)
# record layout: ((hash function index, document index), min hash value)

### LSH

In [None]:
def bandmap(x, n_bands=None):
  """Map a hash-function index to the LSH band it belongs to.

  Args:
    x: index of the hash function (signature row).
    n_bands: number of bands; defaults to the notebook-level ``b_bands``.

  Returns:
    The band number ``x % n_bands``.

  The mapping is deterministic. The previous version drew new random
  coefficients on every call, so the same row could land in a different band
  for each document and band signatures were not comparable across documents.
  With ``n_hashfoo`` a multiple of ``b_bands`` every band receives the same
  number of rows.
  """
  if n_bands is None:
    n_bands = b_bands
  return x % n_bands

In [None]:
def band_expand(x):
  """Re-key one signature entry by (band, document).

  Takes ``((h, s), v)`` and returns ``((bandmap(h), s), (h, v))``, so the
  original row index ``h`` travels with the value.
  """
  key, value = x
  row, doc = key
  return ((bandmap(row), doc), (row, value))

def band_reduct(x):
  """Turn a document's band values into a bucket key.

  Takes ``((b, s), a)`` and returns ``((hash(tuple(a)), b), s)``: the bucket
  is the built-in hash of the band's values, paired with the band number.
  """
  band_and_doc, values = x
  band, doc = band_and_doc
  bucket = hash(tuple(values))
  return ((bucket, band), doc)

In [None]:
# LSH banding:
#   1. re-key every signature entry by (band, document);
#   2. per (band, document), order the entries by row index and keep only the
#      hash values;
#   3. hash those values into a bucket and group the documents that share the
#      same (bucket, band).
bandMx = (
  minHashSignMx
  .map(band_expand)
  .groupByKey()
  .mapValues(lambda rows : [v for _, v in sorted(rows, key=lambda hv: hv[0])])
  .map(band_reduct)
  .groupByKey()
  .mapValues(list)
)

In [None]:
 # Keep only the (bucket, band) keys shared by more than one document and
 # bring them back to the driver.
 bandMx.filter(lambda x : len(x[1]) > 1 ).collect()