<a href="https://colab.research.google.com/github/ranieri-unimi/lsh-malchiodi-2022/blob/main/faces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Kaggle CLI credentials, read from the environment by the `kaggle` command below.
# NOTE(review): a credential is embedded in the notebook (the key is merely encoded
# as an integer) — consider removing it and supplying it through the environment.
# setdefault keeps any KAGGLE_USERNAME / KAGGLE_KEY that is already set instead of
# silently overwriting the user's own credentials.
os.environ.setdefault("KAGGLE_USERNAME", "ranieriunimi")
os.environ.setdefault("KAGGLE_KEY", hex(232307088475198570779809482024044346960)[2:])

In [None]:
ref = "bwandowando/ukraine-russian-crisis-twitter-dataset-1-2-m-rows"

!mkdir datasets
!kaggle datasets download $ref --unzip -p ./datasets

In [None]:
!pip install pyspark
!pip install findspark

In [None]:
import pyspark
import findspark

In [None]:
import csv
import math
import random
import re
import string

import numpy as np
import pandas as pd

# plagiarism

Set a cap on the number of tweets to sample (0 disables sampling and keeps every tweet).

In [None]:
# Maximum number of tweets drawn at random from the dataset; 0 means "use them all".
SAMPLE_SIZE = 10_000

Start a local Spark session.

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Build (or reuse) a Spark session running on the local master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# SparkContext handle, used below for the RDD API.
sc = spark.sparkContext

load the dataset

In [None]:
# Gzip-compressed CSV fetched by the Kaggle download cell.
filename = r"./datasets/UkraineCombinedTweetsDeduped20220227-131611.csv.gzip"

# Do not truncate the column list when a frame is displayed.
pd.set_option("display.max_columns", None)

# Compression is stated explicitly rather than inferred from the file name;
# the first CSV column becomes the index.
df = pd.read_csv(
    filename,
    compression="gzip",
    index_col=0,
    encoding="utf-8",
    quoting=csv.QUOTE_ALL,
)

In [None]:
# Keep only the rows whose `language` column is "en".
df = df.loc[df["language"] == "en"]

In [None]:
# Plain Python list of tweet texts; positions in this list become the document ids.
dl = df.text.tolist()

# Optionally down-sample. random.sample raises ValueError when asked for more
# items than the population holds, so the requested size is capped at len(dl).
# SAMPLE_SIZE == 0 keeps the full list.
dl = random.sample(dl, min(SAMPLE_SIZE, len(dl))) if SAMPLE_SIZE else dl

In [None]:
# Distribute the tweets as (doc_id, text) pairs, where doc_id is the tweet's
# position in `dl`.
rdd = sc.parallelize(enumerate(dl))

# Peek at the first few records.
rdd.take(7)

## data cleaning

In [None]:
def preprocess_tweet_text(tweet):
    """Normalise one tweet for comparison.

    The text is lowercased, URLs are removed, punctuation (ASCII plus a few
    typographic quotes and the ellipsis) is turned into spaces, and runs of
    whitespace are collapsed to a single space.

    Args:
        tweet: the raw tweet text (a ``str``).

    Returns:
        The cleaned text.
    """
    # Every character in this string is replaced by one blank.
    to_blank = string.punctuation + "…’”“‘"
    blank_table = str.maketrans(to_blank, " " * len(to_blank))

    # Case-insensitive analysis.
    lowered = tweet.lower()

    # Strip URLs before punctuation is touched, while they are still intact tokens.
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered, flags=re.MULTILINE)

    # Punctuation becomes whitespace so that the adjacent words stay separated.
    spaced = without_urls.translate(blank_table)

    # Collapse leading, trailing and repeated whitespace.
    return " ".join(spaced.strip().split())

In [None]:
def shift_key_value(x):
    """Return ``(last, first)`` of the sequence ``x``.

    For a ``(key, value)`` pair this swaps the two, yielding ``(value, key)``.
    """
    last_item = x[-1]
    first_item = x[0]
    return (last_item, first_item)

In [None]:
# Clean every tweet; records stay (doc_id, cleaned_text).
tweets_clean = rdd.mapValues(preprocess_tweet_text)

# Remove duplicates: swap to (cleaned_text, doc_id), keep the smallest doc_id
# for each distinct text, then swap back to (doc_id, cleaned_text).
dataPipe = (
    tweets_clean
    .map(shift_key_value)
    .reduceByKey(min)
    .map(shift_key_value)
)

dataPipe.take(7)

In [None]:
# Number of distinct cleaned tweets left after de-duplication.
dataPipe.count()

## shingles

In [None]:
# Shingle length, in characters of the cleaned tweet text.
SH_LEN = 5

In [None]:
# Character shingles: emit (doc_id, shingle) for every window of SH_LEN consecutive
# characters of the cleaned text, without repetitions.
# A text of length n has n - SH_LEN + 1 such windows, hence the "+ 1": without it
# the last shingle of every tweet is dropped and a tweet of exactly SH_LEN
# characters yields no shingle at all. Texts shorter than SH_LEN yield nothing.
shigles = dataPipe.flatMap(
    lambda x: [(x[0], x[-1][i : i + SH_LEN]) for i in range(len(x[-1]) - SH_LEN + 1)]
).distinct()

shigles.take(7)

In [None]:
!pip install crc16

import crc16
import binascii

In [None]:
def hash_value(v):
    """Hash the string ``v`` to an unsigned 32-bit integer.

    The string is encoded as UTF-8 and passed through CRC-32, so the result
    fits in 4 bytes.
    """
    return binascii.crc32(bytes(v, "utf-8"))

In [None]:
# Sparse characteristic matrix: one (doc_id, hashed_shingle) record per shingle
# occurring in a document. distinct() removes the repetitions that appear when
# two shingles of the same document share a hash.
charactMx = (
    shigles
    .mapValues(hash_value)
    .distinct()
)

charactMx.take(7)

## MinHashing

In [None]:
def RN(stop, start=1):
    """Random integer in ``[start, stop)``.

    Thin wrapper around ``random.randrange`` with the arguments swapped so
    that the lower bound is optional and defaults to 1.
    """
    lower, upper = start, stop
    return random.randrange(lower, upper)


def RB(B=4):
    """Random odd integer that fits in ``B`` bytes.

    A value is drawn from ``[0, 2**(8*B - 1))`` and mapped to ``2*value + 1``,
    so the result is odd and lies in ``[1, 2**(8*B))``.
    """
    half_range = 2 ** (8 * B - 1)
    return RN(half_range, 0) * 2 + 1


def PF(x, scale, shift, base):
    """Permutation function: map ``x`` to ``(scale * x + shift) mod base``."""
    affine = scale * x + shift
    return affine % base


In [None]:
# Number of hash permutations (signature length) and number of LSH bands.
n_permfoo = 120
b_bands = 12

# Threshold of the banding scheme, (1 / bands) ** (1 / rows_per_band),
# with rows_per_band = n_permfoo // b_bands.
(1 / b_bands) ** (1 / (n_permfoo // b_bands))  # threshold

In [None]:
# One set of (shift, scale, base) parameters for PF per hash permutation.
# shift and scale are random odd 4-byte integers; the modulus is 2**32,
# which matches the 4-byte range of the CRC-32 shingle hashes.
params = [
    dict(shift=RB(), scale=RB(), base=2 ** 32)
    for _ in range(n_permfoo)
]

params[:7]

In [None]:
def gen_perm(x):
    """Replicate one ``(doc, shingle)`` record once per hash permutation.

    Returns a list of ``((doc, h), (shingle, h))`` for every permutation index
    ``h`` in ``range(n_permfoo)``; the index is repeated inside the value so
    that it is still available after grouping by key.
    """
    doc, shingle = x
    expanded = []
    for h in range(n_permfoo):
        expanded.append(((doc, h), (shingle, h)))
    return expanded

In [None]:
def _min_under_permutation(a, b):
    """Of two (shingle, perm_index) pairs, keep the one whose permuted shingle is smaller."""
    # Both pairs belong to the same (doc, perm_index) key, so a's index is used for both.
    perm = params[a[-1]]
    return a if PF(a[0], **perm) < PF(b[0], **perm) else b


# MinHash signature: for each (doc, permutation) keep the shingle that minimises
# the permuted value, then drop the permutation index from the value.
minHashSignMx = (
    charactMx
    .flatMap(gen_perm)
    .reduceByKey(_min_under_permutation)
    .mapValues(lambda v: v[0])
)

minHashSignMx.take(7)
# (doc, hash) , shingle

## LSH

In [None]:
# Random affine assignment of permutation indices to bands.
# The multiplier must be coprime with b_bands: a multiplier sharing a factor with
# it makes (scale * h + shift) % b_bands reach only some of the bands (e.g.
# scale=6 with 12 bands leaves 2 bands of 60 rows each), which silently changes
# the similarity threshold computed from n_permfoo and b_bands.
scale = random.choice([s for s in range(1, b_bands) if math.gcd(s, b_bands) == 1])
shift = RN(b_bands)


def bandmap(x):
    """Return the band index in ``[0, b_bands)`` assigned to permutation index ``x``."""
    return PF(x, scale, shift, b_bands)

In [None]:
def band_expand(x):
    """Re-key a signature entry by band.

    Turns ``((doc, h), v)`` into ``((doc, bandmap(h)), (h, v))``; the
    permutation index ``h`` moves into the value so that the entries of a band
    can be ordered later.
    """
    key, v = x
    doc, h = key
    return ((doc, bandmap(h)), (h, v))

def band_reduct(x):
    """Turn ``((doc, band), values)`` into ``((hash(tuple(values)), band), doc)``.

    Documents whose value lists are identical within a band end up under the
    same key.
    """
    key, a = x
    doc, band = key
    bucket = hash(tuple(a))
    return ((bucket, band), doc)

In [None]:
def _band_values(pairs):
    """Order (hashperm, valshin) pairs by hashperm and return the valshin values only."""
    ordered = sorted(pairs, key=lambda pair: pair[0])
    return [value for _, value in ordered]


# (doc, hashperm), valshin  ->  (doc, band), [valshin, valshin, ...]
# The values of each band are listed in permutation order, so two documents
# with the same minhashes in a band produce the same list.
bandMx = (
    minHashSignMx                # (doc, hashperm) , valshin
    .map(band_expand)            # (doc, band) , (hashperm , valshin)
    .groupByKey()                # (doc, band) , iterable of (hashperm , valshin)
    .mapValues(_band_values)     # (doc, band) , [valshin, valshin, valshin]
)

bandMx.take(7)

In [None]:
# Group documents by (hash of the band's values, band): documents that fall in
# the same bucket agree on every minhash of that band.
bandBuckets = (
    bandMx
    .map(band_reduct)    # (HHH_valshin, band), doc
    .groupByKey()
    .mapValues(list)     # (HHH_valshin, band), [doc, doc, doc]
)

In [None]:
# Only buckets holding at least two documents give candidate pairs; bring them
# to the driver.
lst = (
    bandBuckets
    .filter(lambda bucket: len(bucket[1]) > 1)
    .collect()
)

lst[:7]

## results

In [None]:
# Candidate pairs: every unordered pair of documents sharing a bucket, stored as
# a sorted tuple so that (a, b) and (b, a) collapse into one edge.
adj_lst = {
    tuple(sorted((docs[a], docs[b])))
    for _, docs in lst
    for a in range(len(docs) - 1)
    for b in range(a + 1, len(docs))
}

In [None]:
import networkx as nx

In [None]:
# Undirected graph whose edges are the candidate pairs.
G = nx.Graph()
G.add_edges_from(adj_lst)

# Keep the connected components made of at least four documents.
canditates = [
    component
    for component in nx.connected_components(G)
    if len(component) >= 4
]

In [None]:
# Print the original text of every tweet in each candidate cluster.
for v in canditates:
  for i in v:
    # Document ids are positions in `dl` (they come from enumerate(dl));
    # df[i] would instead look up a DataFrame column named i and raise KeyError.
    print(dl[i])
    print('---------------')
  print('___________________________________________________________________________________________')