# **Finding similar items**

### Project for the **Algorithms for massive data course**


MSc, Data Science for Economics*

Shojaat Joodi Bigdilo

June 2024

In [1]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [None]:
!pip install kaggle

import warnings
warnings.filterwarnings("ignore")

In [2]:
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=bb1a538dff076776e0e3ef5085b9771e69ee6b5d60cbddb0b15a8ff94dc6f687
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
import os
os.environ['KAGGLE_USERNAME'] = 'xxxxxxxxx'
os.environ['KAGGLE_KEY'] = 'xxxxxxxxx'

In [None]:
!kaggle datasets download -d asaniczka/1-3m-linkedin-jobs-and-skills-2024

In [None]:
extract_to_path  = "/content/gdrive/My Drive/Massive_Data_Project/Job_Dataset"

import zipfile
with zipfile.ZipFile('1-3m-linkedin-jobs-and-skills-2024.zip', 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)

In [85]:
from pyspark import SparkConf, SparkContext

from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, countDistinct
from pyspark.sql.functions import lower, regexp_replace, size
from pyspark.sql.functions import udf
from pyspark.sql.functions import explode
from pyspark.sql.types import StringType, IntegerType, DoubleType
from pyspark.sql.types import ArrayType

from pyspark.ml.feature import Tokenizer, StopWordsRemover, MinHashLSH
from pyspark.ml.linalg import Vectors, VectorUDT

import re
import time
import string
import datetime
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [5]:
conf = SparkConf().setAppName("Similar_Documents")
spark = SparkSession.builder.enableHiveSupport().config(conf = conf).getOrCreate()
sc = spark.sparkContext
type(sc)

In [6]:
file_path = "/content/gdrive/My Drive/Massive_Data_Project/Job_Dataset/job_summary.csv"

df_Dataset = spark.read.csv(file_path, header=True, inferSchema=True, multiLine=True, escape='"',
                           encoding = "ISO-8859-1")

## choosing chunk of data

In [7]:
size = 5000
Job_df = df_Dataset.limit(size)

In [8]:
type(Job_df)

# Pre-processsing

### Exploratory analysis

In [9]:
Job_df.show(n = 10)

+--------------------+--------------------+
|            job_link|         job_summary|
+--------------------+--------------------+
|https://www.linke...|Rock N Roll Sushi...|
|https://www.linke...|Schedule\n: PRN i...|
|https://www.linke...|Description\nIntr...|
|https://uk.linked...|Commercial accoun...|
|https://www.linke...|Address:\nUSA-CT-...|
|https://www.linke...|Description\nOur\...|
|https://www.linke...|Company Descripti...|
|https://uk.linked...|An exciting oppor...|
|https://www.linke...|Job Details:\nJob...|
|https://www.linke...|Our\nRestaurant T...|
+--------------------+--------------------+
only showing top 10 rows



In [10]:
Job_df = Job_df.select("job_summary")
Job_df.show(n = 10)

+--------------------+
|         job_summary|
+--------------------+
|Rock N Roll Sushi...|
|Schedule\n: PRN i...|
|Description\nIntr...|
|Commercial accoun...|
|Address:\nUSA-CT-...|
|Description\nOur\...|
|Company Descripti...|
|An exciting oppor...|
|Job Details:\nJob...|
|Our\nRestaurant T...|
+--------------------+
only showing top 10 rows



#### Giving Id for each row

In [11]:
indexed_rdd = Job_df.rdd.zipWithIndex()
Job_df = indexed_rdd.map(lambda x: (x[1], x[0][0])).toDF(["Id", "job_summary"])

In [12]:
Job_df.show(5)

+---+--------------------+
| Id|         job_summary|
+---+--------------------+
|  0|Rock N Roll Sushi...|
|  1|Schedule\n: PRN i...|
|  2|Description\nIntr...|
|  3|Commercial accoun...|
|  4|Address:\nUSA-CT-...|
+---+--------------------+
only showing top 5 rows



In [13]:
# checking missing values in the columns
Job_df.select([count(when(isnan(c), c)).alias(c) for c in Job_df.columns]).show()

+---+-----------+
| Id|job_summary|
+---+-----------+
|  0|          0|
+---+-----------+



In [14]:
#count distinct values in each column
Job_df.select([countDistinct(c).alias(c) for c in Job_df.columns]).show()

+----+-----------+
|  Id|job_summary|
+----+-----------+
|5000|       4069|
+----+-----------+



### Duplicates check

In [15]:
# show duplicates in Body column
Job_df.groupBy("job_summary").count().filter("count > 1").show()

+--------------------+-----+
|         job_summary|count|
+--------------------+-----+
|Job Title:\nCerti...|   10|
|Salary:\n$48,604....|    2|
|Overview\nThis jo...|    3|
|Dollar General Co...|   19|
|Now Hiring Immedi...|    2|
|FULL-TIME LICENSE...|    6|
|Summary\nThis pos...|    2|
|Commercial Underw...|    2|
|USD 497\nDetails\...|    2|
|This job posting ...|    2|
|Patrol Officer - ...|    2|
|We are looking fo...|    3|
|Full Time Corpora...|    2|
|Hospital Site Sec...|    4|
|Privacy Notice: V...|    2|
|Responsibilities\...|    2|
|Our values start ...|   12|
|Come Join The Lea...|    2|
|Title: - Identity...|    2|
|With significant ...|    3|
+--------------------+-----+
only showing top 20 rows



In [16]:
# Filter the rows where 'job_summary' starts with 'Job Title:\nCerti'
filtered_rows = Job_df.filter(col("job_summary").startswith("Job Title:\nCertified Nursing Assistant (CNA)\nCompany"))
filtered_rows.show()

+----+--------------------+
|  Id|         job_summary|
+----+--------------------+
|1319|Job Title:\nCerti...|
|1586|Job Title:\nCerti...|
|1613|Job Title:\nCerti...|
|1713|Job Title:\nCerti...|
|1729|Job Title:\nCerti...|
|1763|Job Title:\nCerti...|
|1765|Job Title:\nCerti...|
|1792|Job Title:\nCerti...|
|1847|Job Title:\nCerti...|
|2285|Job Title:\nCerti...|
+----+--------------------+



In [17]:
# ID number 1319
row_with_id_1319 = Job_df.filter(Job_df['ID'] == 1319).collect()

txt = row_with_id_1319[0][1]
txt

"Job Title:\nCertified Nursing Assistant (CNA)\nCompany:\nTilloHealth, a division of Tillotek Staffing Solutions\nPay Range:\nCompetitive salary based on experience\nShifts/Hours:\n1st, 2nd, and 3rd Shifts may be available depending on time of application\nEmployment Type:\nFull-Time/Part-Time/Per Diem availability\nOpportunity Type:\nPerm, Contract, and Travel may be available\nJob Summary\nAs a Certified Nursing Assistant (CNA) at Tillotek Staffing Solutions, you will play a crucial role in providing essential care and support to patients in various healthcare settings. CNAs are responsible for assisting patients with activities of daily living, such as bathing, dressing, and feeding, and ensuring their comfort and well-being. Join our team of dedicated healthcare professionals and make a meaningful difference in the lives of patients.\nResponsibilities\nAssist patients with daily tasks, including bathing, grooming, and dressing.\nProvide basic medical care, such as checking vital si

In [18]:
# ID number 1586
row_with_id_1586 = Job_df.filter(Job_df['ID'] == 1586).collect()

txt2 = row_with_id_1586[0][1]
txt2

"Job Title:\nCertified Nursing Assistant (CNA)\nCompany:\nTilloHealth, a division of Tillotek Staffing Solutions\nPay Range:\nCompetitive salary based on experience\nShifts/Hours:\n1st, 2nd, and 3rd Shifts may be available depending on time of application\nEmployment Type:\nFull-Time/Part-Time/Per Diem availability\nOpportunity Type:\nPerm, Contract, and Travel may be available\nJob Summary\nAs a Certified Nursing Assistant (CNA) at Tillotek Staffing Solutions, you will play a crucial role in providing essential care and support to patients in various healthcare settings. CNAs are responsible for assisting patients with activities of daily living, such as bathing, dressing, and feeding, and ensuring their comfort and well-being. Join our team of dedicated healthcare professionals and make a meaningful difference in the lives of patients.\nResponsibilities\nAssist patients with daily tasks, including bathing, grooming, and dressing.\nProvide basic medical care, such as checking vital si

In [19]:
# Checking Equality of texts
if txt == txt2:
    print('Equal')
else:
    print('Not Equal')

Equal


### Delete Duplicates Document

In [20]:
Job_df = Job_df.dropDuplicates(['job_summary'])

In [21]:
Job_df.select([countDistinct(c).alias(c) for c in Job_df.columns]).show()

+----+-----------+
|  Id|job_summary|
+----+-----------+
|4069|       4069|
+----+-----------+



In [22]:
# checking again duplicates
Job_df.groupBy("job_summary").count().filter("count > 1").show()

+-----------+-----+
|job_summary|count|
+-----------+-----+
+-----------+-----+



In [23]:
row_with_id_1586 = Job_df.filter(Job_df['ID'] == 1586).collect()
row_with_id_1586

[]

# Text cleaning and pre-processing

In [24]:
Job_df = Job_df.select('Id',"job_summary")

### LoweCasing Text

In [25]:
Job_df = Job_df.withColumn('job_summary', lower(Job_df['job_summary']))

### Remove HTML Tags

In [26]:
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text) if text else text

remove_html_tags_udf = udf(remove_html_tags, StringType())
Job_df = Job_df.withColumn('job_summary', remove_html_tags_udf(Job_df['job_summary']))

###  Remove URLs

In [27]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

remove_url_udf = udf(remove_url, StringType())
Job_df = Job_df.withColumn('job_summary', remove_url_udf(Job_df['job_summary']))

### Remove Punctuations

In [28]:
import string
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

remove_punctuation_udf = udf(remove_punctuation, StringType())
Job_df = Job_df.withColumn('job_summary', remove_punctuation_udf(Job_df['job_summary']))

### Remove numbers

The following document has aroud 42 different number inside it, so we need to delet them.
3x12 , 180000060000, 12003, 0, 4, 02142024, 05152024, 13, 556166975, 56166975 , 12 , 7 , 7, 100 , 133, 3467, 68100, 10 , 25, 50 , 100,
100 , 20 , 3, 2, , 1, 0, 100, 15, 15, 15, 91, 401,36, 50, 2023, 2022, 2021 ,2020, 2019.

In [29]:
row_with_id_160 = Job_df.filter(Job_df['ID'] == 160).collect()
row_with_id_160

[Row(Id=160, job_summary='profession\nregistered nurse rn\nlocation\nmeridian mississippi\nspecialty\nmed surg tele\nunit\ntelemetry med  surg observation critical care ccu\ngross pay\ntbd\nschedule\n3x12 nights 180000060000 12003\nguaranteed hours\n0\nopenings\n4\nstart date\n02142024\nend date\n05152024\nassignment length\n13\nexperience required\n1 year\njob id\nmedefis556166975\nother info\nid 56166975 shift 12 hour nights7p7aevery other weekend description total weight  100 job summary provides direct and indirect patient care in the assigned setting communicates with physiciansnurse managercoworkers as appropriate about changes in patients clinical condition including results of diagnostic studies and symptomatology is able to respond quickly and accurately to changes in condition or response to treatment additionally is able to perform general nursing duties utilizing the nursing process will float to other areas of the hospital as assigned duties and responsibilities demonstrat

In [30]:
def remove_numbers(text):
    pattern = re.compile(r'\d+')
    return pattern.sub(r'', text)

remove_numbers_udf = udf(remove_numbers, StringType())
Job_df = Job_df.withColumn('job_summary', remove_numbers_udf(Job_df['job_summary']))

In [31]:
row_with_id_160 = Job_df.filter(Job_df['ID'] == 160).collect()
row_with_id_160

[Row(Id=160, job_summary='profession\nregistered nurse rn\nlocation\nmeridian mississippi\nspecialty\nmed surg tele\nunit\ntelemetry med  surg observation critical care ccu\ngross pay\ntbd\nschedule\nx nights  \nguaranteed hours\n\nopenings\n\nstart date\n\nend date\n\nassignment length\n\nexperience required\n year\njob id\nmedefis\nother info\nid  shift  hour nightspaevery other weekend description total weight   job summary provides direct and indirect patient care in the assigned setting communicates with physiciansnurse managercoworkers as appropriate about changes in patients clinical condition including results of diagnostic studies and symptomatology is able to respond quickly and accurately to changes in condition or response to treatment additionally is able to perform general nursing duties utilizing the nursing process will float to other areas of the hospital as assigned duties and responsibilities demonstrates competency in the following areas professional requirements kn

### Remove Non-ASCII characters:
Some texts have some non-ASCII characters like (ã°â\x9fâ\x9fâ¡), so we need to delete them from texts

In [32]:
row_with_id_915 = Job_df.filter(Job_df['ID'] == 915).collect()
row_with_id_915

[Row(Id=915, job_summary='ð\x9f\x9f¡ head of operations\nð\x9f\x9f¡  office based role in mayfair london am to pm\nð\x9f\x9f¡ luxury real estate sector\nð\x9f\x9f¡ salary up to â£ pa dependent on experience\nð\x9f\x9f¡ progression route to coo\nwe are delighted to be partnering with a prestige property development boutique in their search for a new head of operations\nthis is a varied senior level role where daytoday you will be involved with multiple project streams coordinating between commercial design and sales teams on simultaneous high end development projects you will closely support the ceo across operational workflows and will take overarching responsibility for communications and compliance you will be an important and valued member of the senior leadership team and will be heavily involved in commercial strategy and business growth decisions\nin order to be considered you must have significant experience in a similar senior role ideally gained within high end real estate inv

In [33]:
def remove_non_ascii(text):
    if text is None:
        return None
    return re.sub(r'[^\x00-\x7F]+', '', str(text))

remove_non_ascii_udf = udf(remove_non_ascii, StringType())
Job_df = Job_df.withColumn('job_summary', remove_non_ascii_udf(Job_df['job_summary']))

In [34]:
row_with_id_915 = Job_df.filter(Job_df['ID'] == 915).collect()
row_with_id_915

[Row(Id=915, job_summary=' head of operations\n  office based role in mayfair london am to pm\n luxury real estate sector\n salary up to  pa dependent on experience\n progression route to coo\nwe are delighted to be partnering with a prestige property development boutique in their search for a new head of operations\nthis is a varied senior level role where daytoday you will be involved with multiple project streams coordinating between commercial design and sales teams on simultaneous high end development projects you will closely support the ceo across operational workflows and will take overarching responsibility for communications and compliance you will be an important and valued member of the senior leadership team and will be heavily involved in commercial strategy and business growth decisions\nin order to be considered you must have significant experience in a similar senior role ideally gained within high end real estate investment management or luxury consumer goods sectors\

### Remove extra space

In [35]:
from pyspark.sql.functions import regexp_replace, col, trim

def remove_extra_spaces(df, column_name):
    df = df.withColumn(column_name, regexp_replace(col(column_name), "\\s+", " "))
    return df.withColumn(column_name, trim(col(column_name)))

In [36]:
Job_df = remove_extra_spaces(Job_df, "job_summary")

### Tokenization

In [37]:
tokenizer = Tokenizer().setInputCol("job_summary").setOutputCol("Tokens")
Job_df = tokenizer.transform(Job_df)
Job_df.show(5)

+---+--------------------+--------------------+
| Id|         job_summary|              Tokens|
+---+--------------------+--------------------+
| 40|job title food se...|[job, title, food...|
| 45|your responsibili...|[your, responsibi...|
|160|profession regist...|[profession, regi...|
|589|to apply for this...|[to, apply, for, ...|
|656|the judge group i...|[the, judge, grou...|
+---+--------------------+--------------------+
only showing top 5 rows



### Removing Stopwords

In [38]:
remove_stopwords = StopWordsRemover()
stopwords = remove_stopwords.getStopWords()
print(stopwords[:10])
print(len(stopwords))

remove_stopwords.setInputCol("Tokens").setOutputCol("Tokens stopwords removed")
Job_df = remove_stopwords.transform(Job_df)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']
181


In [42]:
# counting the number of tokens after stopwords removed
Job_df = Job_df.withColumn("Number of tokens", size(col("Tokens")))
Job_df = Job_df.withColumn("Number of tokens After stopwords removed", size(col("Tokens stopwords removed")))
Job_df = Job_df.withColumn("Number of stopwords removed", size(col("Tokens stopwords removed")) - size(col("Tokens")) )

In [43]:
Job_df.show(10)

+----+--------------------+--------------------+------------------------+----------------+----------------------------------------+---------------------------+
|  Id|         job_summary|              Tokens|Tokens stopwords removed|Number of tokens|Number of tokens After stopwords removed|Number of stopwords removed|
+----+--------------------+--------------------+------------------------+----------------+----------------------------------------+---------------------------+
|  40|job title food se...|[job, title, food...|    [job, title, food...|             374|                                     244|                       -130|
|  45|your responsibili...|[your, responsibi...|    [responsibilities...|             775|                                     502|                       -273|
| 160|profession regist...|[profession, regi...|    [profession, regi...|            1917|                                    1366|                       -551|
| 589|to apply for this...|[to, apply, f

###  Join the words

To join the words back together after tokenization and stopword removal, you can use the concat_ws function provided by PySpark. Here’s how you can do it:

In [44]:
from pyspark.sql.functions import concat_ws

Job_df = Job_df.withColumn("Cleaned_text", concat_ws(" ", col("Tokens stopwords removed")))

## Final dataset

In [45]:
Job_df_proces = Job_df.select('Id', "Cleaned_text")

In [47]:
Job_df_proces.show(4)

+---+--------------------+
| Id|        Cleaned_text|
+---+--------------------+
| 40|job title food se...|
| 45|responsibilities ...|
|160|profession regist...|
|589|apply job click l...|
+---+--------------------+
only showing top 4 rows



### Creating Shingles

In [48]:
def shingle(text, k):
    shingles = set()
    words = text.split()
    for i in range(len(words) - k + 1):
        shingles.add(' '.join(words[i:i+k]))
    return list(shingles)

In [49]:
k = 2
shingle_udf = udf(lambda text: shingle(text, k), ArrayType(StringType()))
Job_df_proces = Job_df_proces.withColumn("shingles", shingle_udf(col("Cleaned_text")))

In [50]:
Job_df_proces.select("shingles").show(5)

+--------------------+
|            shingles|
+--------------------+
|[looking musthave...|
|[logistics system...|
|[organizational v...|
|[skills must, ski...|
|[large army, expe...|
+--------------------+
only showing top 5 rows



#### convert shingles to sparse vectors

In [51]:
# Flatten the shingles column to get all unique shingles
unique_shingles = Job_df_proces.select(explode("shingles").alias("shingle")).distinct().collect()
shingle_index = {row["shingle"]: idx for idx, row in enumerate(unique_shingles)}

print("Unique shingles and their indices:")
print(shingle_index)

def shingles_to_sparse_vector(shingles):
    indices = sorted([shingle_index[sh] for sh in shingles if sh in shingle_index])
    values = [1.0] * len(indices)
    return Vectors.sparse(len(unique_shingles), indices, values)


sparse_vector_udf = udf(lambda shingles: shingles_to_sparse_vector(shingles), VectorUDT())

Job_df_proces = Job_df_proces.withColumn("features", sparse_vector_udf(col("shingles")))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [53]:
Job_df_proces.select("features").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Implementing MinHashLSH

In [54]:
start = time.time()

# Initialize MinHashLSH
mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345, numHashTables=20)
model = mh.fit(Job_df_proces)
print("The hashed dataset where hashed values are stored in the column 'hashes':")
hash = model.transform(Job_df_proces)

# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join to Calculate Jaccard Distances.
result = model.approxSimilarityJoin(hash, hash, 0.6, distCol="JaccardDistance").select(
    col("datasetA.id").alias("idA"),
    col("datasetB.id").alias("idB"),
    col("JaccardDistance")
)

# Filter out self-pairs and display the results
result_filtered = result.filter(col("idA") < col("idB"))

end = time.time()
computation_time = round(end - start, 3)
print("Computation time: {} seconds".format(computation_time))

print('------------------------------------')

# https://spark.apache.org/docs/2.2.0/ml-features.html
# https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.ml.feature.MinHashLSH.html

The hashed dataset where hashed values are stored in the column 'hashes':
Computation time: 7.484 seconds
------------------------------------


In [55]:
type(result_filtered)

In [56]:
result_filtered.show()

+----+----+--------------------+
| idA| idB|     JaccardDistance|
+----+----+--------------------+
|2196|3217| 0.03754266211604096|
| 743| 949|                 0.0|
| 431| 827|                 0.0|
| 662| 984|                 0.0|
| 654| 995|                 0.0|
| 697|1013|0.018907563025210128|
|2973|2995| 0.02898550724637683|
|3363|3421|0.024324324324324298|
|3795|4717|0.044642857142857095|
| 912|3305|0.043624161073825496|
|1311|1706|                 0.0|
|2014|2942|  0.3739837398373984|
|2007|2356|0.012383900928792602|
| 910| 998|                 0.0|
|2680|2970| 0.10399999999999998|
|3324|3617|   0.038664323374341|
|2808|3139| 0.13458262350936967|
|2208|2209|  0.5513812154696133|
| 158| 948|                 0.0|
|  46| 576|                 0.0|
+----+----+--------------------+
only showing top 20 rows



In [57]:
result_filtered.sort(result_filtered.JaccardDistance.asc()).show(10)

+---+---+---------------+
|idA|idB|JaccardDistance|
+---+---+---------------+
|626|838|            0.0|
|576|948|            0.0|
|431|626|            0.0|
|817|918|            0.0|
| 52|226|            0.0|
|827|850|            0.0|
|626|665|            0.0|
|838|949|            0.0|
|743|883|            0.0|
|850|883|            0.0|
+---+---+---------------+
only showing top 10 rows



In [58]:
result_filtered.sort(result_filtered.JaccardDistance.desc()).show(10)

+----+----+------------------+
| idA| idB|   JaccardDistance|
+----+----+------------------+
|3284|4955|0.5998985801217038|
|4051|4261|0.5997536945812808|
|3187|4189|0.5995115995115995|
|3771|4880| 0.599294947121034|
|3817|3882| 0.599224305106658|
|3771|4808| 0.599121361889072|
|1715|2249|0.5988235294117648|
|2857|3735|0.5986984815618221|
|3735|3810|0.5984807379272925|
|3284|3801|0.5984251968503937|
+----+----+------------------+
only showing top 10 rows



In [59]:
# JaccardDistance between 0.2 and 0.3
filtered_result = result_filtered.filter((result_filtered.JaccardDistance >= 0.2) & (result_filtered.JaccardDistance <= 0.3))
filtered_result.sort(filtered_result.JaccardDistance.asc()).show(10)

+----+----+-------------------+
| idA| idB|    JaccardDistance|
+----+----+-------------------+
|1909|3014| 0.2005494505494505|
|3058|3148| 0.2008196721311475|
|2023|2036|0.20105820105820105|
| 553|1909|0.20110192837465568|
|2205|2223|0.20110192837465568|
|1602|2205|0.20110192837465568|
| 289|2572|  0.201641266119578|
|2572|3421|  0.201641266119578|
|4761|4840|0.20170454545454541|
|3451|4840|0.20170454545454541|
+----+----+-------------------+
only showing top 10 rows



#### Result of minhash function (hash values)

In [60]:
hash.show()

+----+--------------------+--------------------+--------------------+--------------------+
|  Id|        Cleaned_text|            shingles|            features|              hashes|
+----+--------------------+--------------------+--------------------+--------------------+
|  40|job title food se...|[looking musthave...|(558004,[3368,507...|[[2.9382435E7], [...|
|  45|responsibilities ...|[logistics system...|(558004,[0,1,2,3,...|[[8304474.0], [49...|
| 160|profession regist...|[organizational v...|(558004,[4,5,6,7,...|[[5008270.0], [38...|
| 589|apply job click l...|[skills must, ski...|(558004,[11,6723,...|[[1.3007104E7], [...|
| 656|judge group curre...|[large army, expe...|(558004,[13,14,15...|[[1.441622E7], [5...|
|1319|job title certifi...|[cnas responsible...|(558004,[17,1683,...|[[2301225.0], [87...|
|1324|come join leader ...|[training leaders...|(558004,[18,19,16...|[[1.7307271E7], [...|
|1507|visiting angels h...|[shopping errands...|(558004,[5081,839...|[[1930405.0], [85...|

#### Sparce vector for first document , id = 160

In [61]:
hash.first()['features']

SparseVector(558004, {3368: 1.0, 5072: 1.0, 8382: 1.0, 8383: 1.0, 10132: 1.0, 10133: 1.0, 10134: 1.0, 11808: 1.0, 13492: 1.0, 13493: 1.0, 15153: 1.0, 16783: 1.0, 16784: 1.0, 18384: 1.0, 18385: 1.0, 21656: 1.0, 23247: 1.0, 23248: 1.0, 24858: 1.0, 24859: 1.0, 26466: 1.0, 26467: 1.0, 29651: 1.0, 29652: 1.0, 32842: 1.0, 34440: 1.0, 35979: 1.0, 38996: 1.0, 40480: 1.0, 40481: 1.0, 41913: 1.0, 43392: 1.0, 44813: 1.0, 46206: 1.0, 47578: 1.0, 53150: 1.0, 54512: 1.0, 55850: 1.0, 55851: 1.0, 57158: 1.0, 58442: 1.0, 59783: 1.0, 63598: 1.0, 66527: 1.0, 67583: 1.0, 68008: 1.0, 68265: 1.0, 68545: 1.0, 69709: 1.0, 70789: 1.0, 73252: 1.0, 74248: 1.0, 74249: 1.0, 75911: 1.0, 77653: 1.0, 81037: 1.0, 84443: 1.0, 86122: 1.0, 87767: 1.0, 96293: 1.0, 96294: 1.0, 98068: 1.0, 98069: 1.0, 99790: 1.0, 101438: 1.0, 101439: 1.0, 101440: 1.0, 103036: 1.0, 103037: 1.0, 103038: 1.0, 104773: 1.0, 113211: 1.0, 113212: 1.0, 113213: 1.0, 114845: 1.0, 116554: 1.0, 118258: 1.0, 118259: 1.0, 119922: 1.0, 119923: 1.0, 119924

#### Signature vector for first document, id = 160
Values inside DenseVector shows value of each hash function.

In [62]:
hash.first()['hashes']

[DenseVector([29382435.0]),
 DenseVector([67391.0]),
 DenseVector([24946479.0]),
 DenseVector([15109894.0]),
 DenseVector([6009839.0]),
 DenseVector([8928521.0]),
 DenseVector([459012.0]),
 DenseVector([5302797.0]),
 DenseVector([5409404.0]),
 DenseVector([19769471.0]),
 DenseVector([224289.0]),
 DenseVector([118223.0]),
 DenseVector([18658145.0]),
 DenseVector([18318724.0]),
 DenseVector([4336886.0]),
 DenseVector([16571678.0]),
 DenseVector([5240649.0]),
 DenseVector([1428559.0]),
 DenseVector([8351152.0]),
 DenseVector([4989205.0])]

### Creating New dataframe in order to compare pair document with each other

In [70]:
Job_df2 = Job_df.select('Id', "Tokens stopwords removed")

In [72]:
from pyspark.sql import DataFrame

def analyze_text_by_id(df: DataFrame, id_number: int):
    row_with_id = df.filter(df['ID'] == id_number).collect()
    print(row_with_id)

    if not row_with_id:
        print(f"No row found with ID {id_number}")
        return

    txt = row_with_id[0][1:][0]

    print(f"Type of txt: {type(txt)}")
    print(f"Length of txt: {len(txt)}")
    print(f"Fourth character in txt: {txt[3]}")

    return txt

In [64]:
def analyze_lists(text1, text2):
    list1 = [word for word in text1 if word]
    list2 = [word for word in text2 if word]

    num_words_list1 = len(list1)
    num_words_list2 = len(list2)
    num_unique_words_list1 = len(set(list1))
    num_unique_words_list2 = len(set(list2))

    common_words = set(list1).intersection(list2)
    num_common_words = len(common_words)

    percentage_common_list1 = (num_common_words / num_unique_words_list1) * 100 if num_unique_words_list1 > 0 else 0
    percentage_common_list2 = (num_common_words / num_unique_words_list2) * 100 if num_unique_words_list2 > 0 else 0

    return (num_words_list1, num_words_list2, num_unique_words_list1,
            num_unique_words_list2, num_common_words,
            percentage_common_list1, percentage_common_list2)

### Comparing the Documents with 'ID' number of 1909 & 3014, which have Jaccard distance equal to 0.20



In [73]:
txt1 = analyze_text_by_id(Job_df2, 1909)

[Row(Id=1909, Tokens stopwords removed=['interested', 'making', 'positive', 'impact', 'youve', 'come', 'right', 'place', 'fusion', 'medical', 'staffing', 'goal', 'improve', 'lives', 'everyone', 'touch', 'always', 'looking', 'people', 'like', 'join', 'mission', 'making', 'difference', 'isnt', 'perk', 'traveling', 'us', 'start', 'medical', 'travel', 'career', 'fusion', 'medical', 'staffing', 'gain', 'access', 'competitive', 'pay', 'packages', 'comprehensive', 'benefits', 'corporate', 'discounts', 'perks', 'clinical', 'team', 'support', 'along', 'journey', 'recruiter', 'determined', 'help', 'succeed', 'weve', 'got', 'back', 'focus', 'best', 'helping', 'others', 'certified', 'nursing', 'assistant', 'skilled', 'nursing', 'facility', 'position', 'certified', 'nursing', 'assistant', 'specialty', 'skilled', 'nursing', 'facility', 'week', 'skilled', 'nursing', 'facility', 'certified', 'nursing', 'assistant', 'travel', 'assignment', 'client', 'de', 'smet', 'sd', 'looking', 'skilled', 'nursing', 

In [74]:
txt2 = analyze_text_by_id(Job_df2, 3014)

[Row(Id=3014, Tokens stopwords removed=['interested', 'making', 'positive', 'impact', 'youve', 'come', 'right', 'place', 'fusion', 'medical', 'staffing', 'goal', 'improve', 'lives', 'everyone', 'touch', 'always', 'looking', 'people', 'like', 'join', 'mission', 'making', 'difference', 'isnt', 'perk', 'traveling', 'us', 'start', 'medical', 'travel', 'career', 'fusion', 'medical', 'staffing', 'gain', 'access', 'competitive', 'pay', 'packages', 'comprehensive', 'benefits', 'corporate', 'discounts', 'perks', 'clinical', 'team', 'support', 'along', 'journey', 'recruiter', 'determined', 'help', 'succeed', 'weve', 'got', 'back', 'focus', 'best', 'helping', 'others', 'technician', 'patient', 'care', 'position', 'technician', 'specialty', 'patient', 'care', 'week', 'patient', 'care', 'technician', 'travel', 'assignment', 'client', 'hackensack', 'nj', 'looking', 'patient', 'care', 'technician', 'help', 'weeks', 'fusion', 'medical', 'truly', 'believe', 'people', 'taking', 'care', 'people', 'togeth

In [75]:
# Comparing number of common words inside Documents with 'ID' number of 1909 & 3014

(num_words_list1, num_words_list2, num_unique_words_list1,
 num_unique_words_list2, num_common_words,
 percentage_common_list1, percentage_common_list2) = analyze_lists(txt1, txt2)

print(f"Number of words in Text_1: {num_words_list1}")
print(f"Number of words in Text_2: {num_words_list2}")
print(f"Number of Unique words in Text_1: {num_unique_words_list1}")
print(f"Number of Unique words in Text_2: {num_unique_words_list2}")
print(f"Number of common Uniqe words: {num_common_words}")
print(f"Percentage of common words in Text_1: {percentage_common_list1:.2f}%")
print(f"Percentage of common words in Text_2: {percentage_common_list2:.2f}%")

Number of words in Text_1: 344
Number of words in Text_2: 361
Number of Unique words in Text_1: 249
Number of Unique words in Text_2: 260
Number of common Uniqe words: 240
Percentage of common words in Text_1: 96.39%
Percentage of common words in Text_2: 92.31%


### Comparing the Documents with 'ID' number of 3284 & 4955, which have Jaccard distance equal to 0.59

In [76]:
txt5 = analyze_text_by_id(Job_df2, 3284)

[Row(Id=3284, Tokens stopwords removed=['summary', 'position', 'eligible', 'education', 'debt', 'reduction', 'program', 'edrp', 'student', 'loan', 'payment', 'reimbursement', 'program', 'must', 'meet', 'specific', 'individual', 'eligibility', 'requirements', 'accordance', 'vha', 'policy', 'submit', 'edrp', 'application', 'within', 'four', 'months', 'appointment', 'approval', 'award', 'amount', 'eligibility', 'period', 'one', 'five', 'years', 'determined', 'vha', 'education', 'loan', 'repayment', 'services', 'program', 'office', 'complete', 'review', 'edrp', 'application', 'learn', 'agency', 'help', 'duties', 'staff', 'psychologist', 'position', 'fulltime', 'position', 'supporting', 'alcohol', 'drug', 'treatment', 'program', 'adtp', 'administered', 'mental', 'health', 'service', 'line', 'mhsl', 'located', 'grounds', 'va', 'boston', 'healthcare', 'system', 'vabhs', 'brockton', 'division', 'psychologist', 'provides', 'direct', 'care', 'services', 'including', 'consultation', 'diagnostic',

In [77]:
txt6 = analyze_text_by_id(Job_df2, 4955)

[Row(Id=4955, Tokens stopwords removed=['summary', 'position', 'eligible', 'education', 'debt', 'reduction', 'program', 'edrp', 'student', 'loan', 'payment', 'reimbursement', 'program', 'must', 'meet', 'specific', 'individual', 'eligibility', 'requirements', 'accordance', 'vha', 'policy', 'submit', 'edrp', 'application', 'within', 'four', 'months', 'appointment', 'approval', 'award', 'amount', 'eligibility', 'period', 'one', 'five', 'years', 'determined', 'vha', 'education', 'loan', 'repayment', 'services', 'program', 'office', 'complete', 'review', 'edrp', 'application', 'learn', 'agency', 'help', 'duties', 'open', 'continuous', 'announcement', 'remain', 'open', 'april', 'initial', 'cutoff', 'date', 'referral', 'eligible', 'applications', 'february', 'eligible', 'applications', 'received', 'date', 'referred', 'regular', 'intervals', 'additional', 'vacancies', 'occur', 'asneeded', 'basis', 'position', 'filled', 'relocationrecruitment', 'incentives', 'authorized', 'duties', 'include', '

In [78]:
# Comparing number of common words inside Documents with 'ID' number of 3284 & 4955

(num_words_list1, num_words_list2, num_unique_words_list1,
 num_unique_words_list2, num_common_words,
 percentage_common_list1, percentage_common_list2) = analyze_lists(txt5, txt6)

print(f"Number of words in Text_1: {num_words_list1}")
print(f"Number of words in Text_2: {num_words_list2}")
print(f"Number of Unique words in Text_1: {num_unique_words_list1}")
print(f"Number of Unique words in Text_2: {num_unique_words_list2}")
print(f"Number of common Uniqe words: {num_common_words}")
print(f"Percentage of common words in Text_1: {percentage_common_list1:.2f}%")
print(f"Percentage of common words in Text_2: {percentage_common_list2:.2f}%")

Number of words in Text_1: 1876
Number of words in Text_2: 1387
Number of Unique words in Text_1: 718
Number of Unique words in Text_2: 634
Number of common Uniqe words: 481
Percentage of common words in Text_1: 66.99%
Percentage of common words in Text_2: 75.87%


### Comparing the Documents with 'ID' number of 503 & 948, which have Jaccard distance equal to 0.



In [79]:
txt3 = analyze_text_by_id(Job_df2, 503)

[Row(Id=503, Tokens stopwords removed=['description', 'restaurant', 'teamshift', 'leaders', 'dual', 'role', 'youll', 'serve', 'restaurant', 'leader', 'team', 'member', 'leader', 'youll', 'work', 'closely', 'restaurant', 'manager', 'ensuring', 'operating', 'procedures', 'followed', 'youll', 'also', 'assist', 'scheduling', 'training', 'supervising', 'team', 'members', 'ensure', 'customer', 'enjoys', 'hot', 'freshlyprepared', 'product', 'using', 'highest', 'quality', 'ingredients', 'served', 'comfortable', 'clean', 'friendly', 'environment', 'whats', 'competitive', 'weekly', 'pay', 'hour', 'schedule', 'flexibility', 'dayeveningovernight', 'shifts', 'discounted', 'meals', 'opportunities', 'career', 'development', 'growth', 'whataburger', 'family', 'foundation', 'scholarship', 'program', 'medical', 'dental', 'vision', 'plans', 'k', 'savings', 'plans', 'whatagames', 'ask', 'us', 'people', 'make', 'difference', 'whataburger', 'take', 'pride', 'work', 'take', 'care', 'love', 'serving', 'custom

In [80]:
txt4 = analyze_text_by_id(Job_df2, 948)

[Row(Id=948, Tokens stopwords removed=['description', 'restaurant', 'teamshift', 'leaders', 'dual', 'role', 'youll', 'serve', 'restaurant', 'leader', 'team', 'member', 'leader', 'youll', 'work', 'closely', 'restaurant', 'manager', 'ensuring', 'operating', 'procedures', 'followed', 'youll', 'also', 'assist', 'scheduling', 'training', 'supervising', 'team', 'members', 'ensure', 'customer', 'enjoys', 'hot', 'freshlyprepared', 'product', 'using', 'highest', 'quality', 'ingredients', 'served', 'comfortable', 'clean', 'friendly', 'environment', 'whats', 'competitive', 'weekly', 'pay', 'hour', 'schedule', 'flexibility', 'dayeveningovernight', 'shifts', 'discounted', 'meals', 'opportunities', 'career', 'development', 'growth', 'whataburger', 'family', 'foundation', 'scholarship', 'program', 'medical', 'dental', 'vision', 'plans', 'k', 'savings', 'plans', 'whatagames', 'ask', 'us', 'people', 'make', 'difference', 'whataburger', 'take', 'pride', 'work', 'take', 'care', 'love', 'serving', 'custom

In [81]:
# Comparing number of common words inside Documents with 'ID' number of 503 & 948

(num_words_list1, num_words_list2, num_unique_words_list1,
 num_unique_words_list2, num_common_words,
 percentage_common_list1, percentage_common_list2) = analyze_lists(txt3, txt4)

print(f"Number of words in Text_1: {num_words_list1}")
print(f"Number of words in Text_2: {num_words_list2}")
print(f"Number of Unique words in Text_1: {num_unique_words_list1}")
print(f"Number of Unique words in Text_2: {num_unique_words_list2}")
print(f"Number of common Uniqe words: {num_common_words}")
print(f"Percentage of common words in Text_1: {percentage_common_list1:.2f}%")
print(f"Percentage of common words in Text_2: {percentage_common_list2:.2f}%")

Number of words in Text_1: 346
Number of words in Text_2: 346
Number of Unique words in Text_1: 268
Number of Unique words in Text_2: 268
Number of common Uniqe words: 268
Percentage of common words in Text_1: 100.00%
Percentage of common words in Text_2: 100.00%


## Cheking the Equality of documents:
#### Cheking the Equality of documents with ['ID'] number 503 & 948, which they have Jaccard Distance equal to Zero.

In [82]:
# ID number 503
row_with_id_503 = Job_df.filter(Job_df['ID'] == 503).collect()
txt = row_with_id_503[0][1:][0]
txt

'description our restaurant teamshift leaders have a dual role youll serve as both a restaurant leader and a team member as a leader youll work closely with the restaurant manager ensuring all operating procedures are followed youll also assist with scheduling training and supervising team members to ensure each customer enjoys a hot freshlyprepared product using the highest quality ingredients served in a comfortable clean friendly environment whats in it for you competitive weekly pay hour schedule flexibility dayeveningovernight shifts discounted meals opportunities for career development and growth whataburger family foundation and scholarship program medical dental and vision plans k savings plans whatagames ask us about this our people make the difference at whataburger we take pride in our work take care of each other and love serving our customers each and every day youll be learning skills that will serve you well no matter where your career journey leads you required qualific

In [83]:
# ID number 948
row_with_id_948 = Job_df.filter(Job_df['ID'] == 948).collect()
txt2 = row_with_id_948[0][1:][0]
txt2

'description our restaurant teamshift leaders have a dual role youll serve as both a restaurant leader and a team member as a leader youll work closely with the restaurant manager ensuring all operating procedures are followed youll also assist with scheduling training and supervising team members to ensure each customer enjoys a hot freshlyprepared product using the highest quality ingredients served in a comfortable clean friendly environment whats in it for you competitive weekly pay hour schedule flexibility dayeveningovernight shifts discounted meals opportunities for career development and growth whataburger family foundation and scholarship program medical dental and vision plans k savings plans whatagames ask us about this our people make the difference at whataburger we take pride in our work take care of each other and love serving our customers each and every day youll be learning skills that will serve you well no matter where your career journey leads you required qualific

In [84]:
if txt == txt2:
    print('Equal')
else:
    print('Not Equal')

Equal


### Dence vector

In [86]:
def shingles_to_one_hot_vector(shingles):
    vector = np.zeros(len(unique_shingles))
    for sh in shingles:
        if sh in shingle_index:
            vector[shingle_index[sh]] = 1.0
    return Vectors.dense(vector.tolist())

# UDF to convert shingles to one-hot vectors
one_hot_vector_udf = udf(lambda shingles: shingles_to_one_hot_vector(shingles), VectorUDT())

Job_df_proces2 = Job_df_proces.withColumn("features_dence", one_hot_vector_udf(col("shingles")))

In [88]:
Job_df_proces2.select("features_dence").show(5, truncate=False)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [89]:
Job_df_proces2.show(3)

+---+--------------------+--------------------+--------------------+--------------------+
| Id|        Cleaned_text|            shingles|            features|      features_dence|
+---+--------------------+--------------------+--------------------+--------------------+
| 40|job title food se...|[looking musthave...|(558004,[3368,507...|[0.0,0.0,0.0,0.0,...|
| 45|responsibilities ...|[logistics system...|(558004,[0,1,2,3,...|[1.0,1.0,1.0,1.0,...|
|160|profession regist...|[organizational v...|(558004,[4,5,6,7,...|[0.0,0.0,0.0,0.0,...|
+---+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [90]:
start = time.time()

# Initialize MinHashLSH
mh = MinHashLSH(inputCol="features_dence", outputCol="hashes", seed=12345, numHashTables=20)
model = mh.fit(Job_df_proces2)
print("The hashed dataset where hashed values are stored in the column 'hashes':")
hash = model.transform(Job_df_proces2)

# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join to Calculate Jaccard Distances.
result = model.approxSimilarityJoin(hash, hash, 0.6, distCol="JaccardDistance").select(
    col("datasetA.id").alias("idA"),
    col("datasetB.id").alias("idB"),
    col("JaccardDistance")
)

# Filter out self-pairs and display the results
result_filtered = result.filter(col("idA") < col("idB"))

end = time.time()
computation_time = round(end - start, 3)
print("Computation time: {} seconds".format(computation_time))

print('------------------------------------')

The hashed dataset where hashed values are stored in the column 'hashes':
Computation time: 17.329 seconds
------------------------------------
