<a href="https://colab.research.google.com/github/ranieri-unimi/lsh-malchiodi-2022/blob/main/faces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import os

os.environ["KAGGLE_USERNAME"] = "ranieriunimi"
os.environ["KAGGLE_KEY"] = str(hex(232307088475198570779809482024044346960))[2:]

In [45]:
ref = "bwandowando/ukraine-russian-crisis-twitter-dataset-1-2-m-rows"

!kaggle datasets download $ref --unzip -p .

Downloading ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip to .
100% 12.0G/12.0G [01:56<00:00, 127MB/s]
100% 12.0G/12.0G [01:56<00:00, 111MB/s]


In [46]:
!pip install pyspark
!pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [47]:
import pyspark
import findspark

In [48]:
import numpy as np
import pandas as pd
import csv
import re
import string
import random

# echo

set a number cap of tweets (0 to skip)

In [49]:
SAMPLE_SIZE = 40 * 1000

hadoop instance

In [50]:
from pyspark.sql import SparkSession

In [51]:
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext

load the dataset

In [53]:
csv_list = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(".")
    for filename in filenames
    if filename.startswith("09")  # only September
]

In [56]:
df_list = [
    pd.read_csv(
        csv_file,
        engine='python',
        compression='gzip',
        encoding='utf-8',
        quoting=csv.QUOTE_ALL,
        usecols=lambda x: x in ["text","language","tweetid"]
        )
    for csv_file in csv_list
]

In [57]:
df = (pd
      .concat(df_list, axis=0)
      .reset_index(drop=True)
      .drop_duplicates(["tweetid"], keep='last')
      )

In [58]:
del df_list

In [59]:
df = df[df.language == "en"]

In [60]:
dl = df.text.tolist()
dl = random.sample(dl, SAMPLE_SIZE) if SAMPLE_SIZE else dl

In [61]:
rdd = sc.parallelize(enumerate(dl))

rdd.take(7)

[(0,
  'And Russia‚Äôs at it again. Seems like a trend with them.Anytime the Russians have to retreat they commit mass genocide to innocent civilians.This is a war crime yet nobody did anything during the #BuchaMassacre will this be any different. Probably not. When will justice be served? https://t.co/xKj5lC5QsB'),
 (1,
  "@nexta_tv Peacekeepers....  If it wasn't for #Lavrov and #Putler there would be peace...."),
 (2,
  'In a matter of days, Ukrainian forces seized over 1,000 sq. km. of land back, forcing the withdrawal of #russian troops from the majority of the region. Amid this turn of events, Premise asked our Contributors in #ukraine whether they had heard the news, and what they make of it.'),
 (3,
  '@BenCourts1 @Zhijie78 @GastonFoubert @blackintheempir Lol, no evidence of promise from anyone, who isn‚Äôt from Russia. No Russian protests before recent years. And the guy still believes this corrupted killer Putin, who is in charge for 20+ years)'),
 (4,
  "As long as they are o

## data cleaning

In [62]:
def preprocess_tweet_text(tweet):

    # opt for a case insensite analysis
    tweet = tweet.lower()

    # remove URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet, flags=re.MULTILINE)

    # remove #hastags and @mentions
    tweet = re.sub("@[A-Za-z0-9_]+","", tweet)
    tweet = re.sub("#[A-Za-z0-9_]+","", tweet)

    # translate puntctuation to spaces
    tweet = tweet.translate(
        str.maketrans(string.punctuation + "‚Ä¶‚Äô‚Äù‚Äú‚Äò", " " * (len(string.punctuation) + 5))
    )

    # remove extra spaces
    tweet = " ".join(tweet.strip().split())

    return tweet

In [63]:
def shift_key_value(x):
    return (x[-1], x[0])

In [64]:
dataPipe = (
    rdd
    #    clean tweets
    .mapValues(preprocess_tweet_text)
    #    remove duplicates
    .map(shift_key_value)
    .reduceByKey(min)
    .map(shift_key_value)
    # # nvm
    # .filter(lambda x : 'ukrain' in x[-1])
)

dataPipe.take(7)

[(2,
  'in a matter of days ukrainian forces seized over 1 000 sq km of land back forcing the withdrawal of troops from the majority of the region amid this turn of events premise asked our contributors in whether they had heard the news and what they make of it'),
 (19,
  'ukraine must prevail ukraine will prevail putler s fascist empire must fail puler s fascist empire will fail'),
 (41,
  'the us weapon beating the russians in ukraine russian ukraine nato latest news today'),
 (45,
  'this is how ukrainians greet the ukrainians liberators after 6 months of russian hell √Øni ukraine armed forces üá∫üá¶ seems like they cuddle with god'),
 (63,
  'all i want for christmas is 1 trump in prison 2 ukraine winning the war and the fall of putin s regime 3 the collapse of iran s evil regime 4 maga republicans losing the midterms is that too much to ask'),
 (81,
  'over 30 31 august 2022 ukrainian armed forces continued offensive operations in southern ukraine read full british intelligence

In [65]:
dataPipe.count()

5941

## shingles

In [66]:
SH_LEN = 5

In [67]:
shigles = dataPipe.flatMap(
    # use a `SH_LEN` sliding window and list all shingles from each tweet `x`
    lambda x: [(x[0], x[-1][i : i + SH_LEN]) for i in range(len(x[-1]) - SH_LEN)]
).distinct()

shigles.take(7)

[(2, 'in a '),
 (2, ' a ma'),
 (2, 'a mat'),
 (2, 'tter '),
 (2, 'r of '),
 (2, ' of d'),
 (2, 'f day')]

In [68]:
import binascii

In [69]:
def hash_value(v):
    b = bytes(v, "utf-8") # from string to bytes
    h = binascii.crc32(b) # hashing to 4 bytes
    return h

In [70]:
charactMx = shigles.mapValues(hash_value).distinct()

charactMx.take(7)

[(2, 2848174446),
 (2, 1180087470),
 (2, 2923465592),
 (2, 1786980940),
 (2, 2535141870),
 (2, 4212066642),
 (2, 80402308)]

## MinHashing

In [71]:
def RN(stop, start=1):
    """
        simple alias for random.randrange
        but `start` is set to 1 by default
    """
    return random.randrange(start, stop)


def RB(B=4):
    """
        Generate a odd number smaller than `B` bytes
    """
    return RN(2 ** (8 * B - 1), 0) * 2 + 1


def PF(x, scale, shift, base):
    """
        Permutation Function:
        Permute `x` by given parameters
    """
    return (scale * x + shift) % base


In [72]:
n_permfoo, b_bands = 120, 12

(1 / b_bands) ** (1 / (n_permfoo // b_bands)) 
# approximation of similarity threshold

0.7799771419043033

In [73]:
params = [ 
    {"shift": RB(), "scale": RB(), "base": 2 ** (8 * 4)}
    for _ in range(n_permfoo)
    # list of 120 linear congruential generators
]

params[:7]

[{'shift': 2379111221, 'scale': 3900511489, 'base': 4294967296},
 {'shift': 445239449, 'scale': 518418609, 'base': 4294967296},
 {'shift': 3028619029, 'scale': 3958610497, 'base': 4294967296},
 {'shift': 2689406765, 'scale': 347507381, 'base': 4294967296},
 {'shift': 2267584307, 'scale': 2994529517, 'base': 4294967296},
 {'shift': 4076036023, 'scale': 2949604343, 'base': 4294967296},
 {'shift': 3637167983, 'scale': 3034253755, 'base': 4294967296}]

In [74]:
def multi_enum(x):
    doc, shingle = x
    return [((doc, h), (shingle, h)) for h in range(n_permfoo)]
    # enumerate 120 times each key-value pair, for future computations

In [75]:
minHashSignMx = (
    charactMx.flatMap(multi_enum)
    .reduceByKey(
        lambda a, b: a if PF(a[0], **params[a[-1]]) < PF(b[0], **params[a[-1]]) else b
        # permute `PF` both `a` and `b`, and keep the one which return the smallest output
    )
    .mapValues(lambda v: v[0])
    # drop enumeration
)

minHashSignMx.take(7)
# (doc, h) , shingle

[((2, 0), 2540208936),
 ((2, 2), 789266766),
 ((2, 4), 3191334155),
 ((2, 6), 4163760227),
 ((2, 8), 499326825),
 ((2, 10), 2518392153),
 ((2, 12), 3516760539)]

## LSH

In [76]:
scale, shift = RN(b_bands), RN(b_bands)
bandmap = lambda x: PF(x, scale, shift, b_bands)

In [77]:
def band_expand(x):
  (doc, h), shingle = x
  # map each hash to band
  return ((doc, bandmap(h)), (h, shingle))

In [78]:
bandMx = (minHashSignMx #                              # (doc, h_hash) , shingle
          .map(band_expand) # map each hash to band    # (doc, band) , (h_hash , shingle) 
          .groupByKey().mapValues(list) # group        # (doc, band) , [(h_hash , shingle), (h_hash , shingle), ... ]
          .mapValues(lambda a : sorted(a, key=lambda x: x[0])) # as above, but sorted
          .mapValues(lambda a : [v for _, v in a] ) # drop `h_hash` # (doc, band) , [shingle, shingle, ...]
)

bandMx.take(7)

[((2, 10),
  [1544024082,
   4026399056,
   1026882825,
   218462603,
   2535141870,
   236475740,
   3267864145,
   1962208152,
   4212066642,
   2914159121,
   4205663052,
   1836651710,
   2839980760,
   2560504928,
   1822166547,
   2021151087,
   4026399056,
   1004840252,
   1413222931,
   122402601,
   1976169923,
   4131151583,
   452695273,
   1366381447,
   3437516573,
   23674158,
   3009834554,
   122402601,
   637951864,
   499326825]),
 ((2, 4),
  [22495342,
   4131151583,
   1054255490,
   2761320979,
   2890015951,
   2790009549,
   2608946951,
   1905304219,
   3516760539,
   2234235648,
   675254994,
   448958865,
   284929611,
   2790009549,
   1601459314,
   118370699,
   2665402115,
   972749395,
   1013674347,
   2481466753,
   1180087470,
   576791929,
   2112752380,
   2721141291,
   2646012664,
   1822166547,
   3603298554,
   4218411702,
   895764605,
   665135834]),
 ((19, 1),
  [2143539719,
   2907865542,
   351080511,
   994163019,
   2014578746,
   2014659

In [None]:
def band_reduce(x):
  (doc, band), a = x
  return ((hash(tuple(a)), band), doc) # hashing / bucketing shingle list

In [79]:
bandBuckets = (bandMx # (doc, band) , [shingle, shingle, ...]
    .map(band_reduce) # (sh_lst_hash, band), doc
    .groupByKey()
    .mapValues(list)  # (sh_lst_hash, band), [doc, doc, ...]
)

In [80]:
lst = bandBuckets.filter(lambda x: len(x[1]) > 1).collect()

lst[:7]

[((-4424218269515868677, 10), [1640, 4426]),
 ((1488581101982547862, 10), [2950, 3204]),
 ((-6684715038001079823, 1), [6549, 8349]),
 ((2292679500768926280, 4), [9478, 14009]),
 ((5094125652322457331, 7), [15865, 19888]),
 ((-7731442979876610255, 4), [16102, 27331]),
 ((3882246829740074603, 10), [27812, 29099])]

## results

In [81]:
adj_lst = {
    tuple(sorted([v[i], v[j]]))
    for k, v in lst
    for i in range(len(v) - 1)
    for j in range(i + 1, len(v))
}

In [82]:
import networkx as nx

In [83]:
G = nx.Graph()
G.add_edges_from(adj_lst)
canditates = [cc for cc in nx.connected_components(G) if len(cc) > 1]

In [84]:
len(canditates)

35

In [86]:
for v in random.sample(canditates, 7):
  for i in v:
    print(dl[i])
    print('---------------')
  print('___________________________________________________________________________________________')

@POTUS Liberation of the territories of #Ukraineü•π People meet Ukrainian soldiers. 
For info: "oladyi" is kind of local "pancakes" ü§ó
#Balakliia #UkraineWillWin https://t.co/H0POVs52qs
---------------
Liberation of the territories of #Ukraine.  People meet Ukrainian soldiers. 
For info: "oladyi" is kind of local "pancakes"
#Balakliia #UkraineWillWin https://t.co/48sTn3lMLQ
---------------
___________________________________________________________________________________________
Glory to Ukraine and its soldiers üá∫üá¶

Death to enemiesüî•

#UkrainianArmy #Ukraine https://t.co/mFfh7VoW5f
---------------
Glory to Ukraine and its soldiers üá∫üá¶

Death to enemiesüî•
#UkraineRussiaWar #Ukraine #SlavaUkra√Øni https://t.co/O4raJcVISo
---------------
___________________________________________________________________________________________
@ResistersUniteX @MTHRGODDESS @Wipapa5 @QuinnLeone4 @ticdanurse @dmillerwats @calcara_tina @GratefulJjd @Dalocoengineer @RoxineKing @MahtogLv @