# **Finding similar items**

### Project for the **Algorithms for massive data course**


MSc, Data Science for Economics*

Shojaat Joodi Bigdilo

June 2024

In [5]:
# connecting my Google Drive and google colab
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [6]:
!pip install kaggle

import warnings
warnings.filterwarnings("ignore")

In [7]:
!pip install findspark



In [8]:
!pip install pyspark



In [9]:
# connecting to Kaggle
import os
os.environ['KAGGLE_USERNAME'] = 'xxxxxxxxx'

os.environ['KAGGLE_KEY'] = 'xxxxxxxxx' 

In [10]:
!kaggle datasets download -d asaniczka/1-3m-linkedin-jobs-and-skills-2024

In [11]:
extract_to_path  = "/content/gdrive/My Drive/Massive_Data_Project/Job_Dataset"

import zipfile
with zipfile.ZipFile('1-3m-linkedin-jobs-and-skills-2024.zip', 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)

In [12]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, countDistinct, udf
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, MinHashLSH, Normalizer
from pyspark.sql.types import DoubleType

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import explode
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, IntegerType

from pyspark.sql.functions import lower, regexp_replace, size
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType

import re
import time
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

import string

import datetime
import warnings
warnings.filterwarnings("ignore")




In [13]:
# SparkSession initialization
conf = SparkConf().setAppName("Similar_Documents")
spark = SparkSession.builder.enableHiveSupport().config(conf = conf).getOrCreate()
sc = spark.sparkContext
type(sc)

In [14]:
# Reading Dataset from Google Drive
file_path = "/content/gdrive/My Drive/Massive_Data_Project/Job_Dataset/job_summary.csv"

df_Dataset = spark.read.csv(file_path, header=True, inferSchema=True, multiLine=True, escape='"',
                           encoding = "ISO-8859-1")

## choosing chunk of data

In [15]:
# Creating Chunk of Dataset
import pandas as pd

size = 5000
df_Dataset_2 = df_Dataset.limit(size)

# Convert Spark DataFrame to Pandas DataFrame
df_Dataset_2 = df_Dataset_2.toPandas()
df_Dataset_2.to_csv('/content/gdrive/My Drive/Massive_Data_Project/Job_Dataset/Chunk5000.csv', index=False)

In [16]:
# Reading Chunk Dataset
file_path = "/content/gdrive/My Drive/Massive_Data_Project/Job_Dataset/Chunk5000.csv"

Job_Dataset = spark.read.csv(file_path, header=True, inferSchema=True, multiLine=True, escape='"',
                           encoding = "ISO-8859-1")


In [17]:
type(Job_Dataset)

# Pre-processsing

### Exploratory analysis

In [18]:
Job_Dataset.show(n = 10)

+--------------------+--------------------+
|            job_link|         job_summary|
+--------------------+--------------------+
|https://www.linke...|Rock N Roll Sushi...|
|https://www.linke...|Schedule\n: PRN i...|
|https://www.linke...|Description\nIntr...|
|https://uk.linked...|Commercial accoun...|
|https://www.linke...|Address:\nUSA-CT-...|
|https://www.linke...|Description\nOur\...|
|https://www.linke...|Company Descripti...|
|https://uk.linked...|An exciting oppor...|
|https://www.linke...|Job Details:\nJob...|
|https://www.linke...|Our\nRestaurant T...|
+--------------------+--------------------+
only showing top 10 rows



In [19]:
Job_Dataset = Job_Dataset.select("job_summary")
Job_Dataset.show(n = 10)

+--------------------+
|         job_summary|
+--------------------+
|Rock N Roll Sushi...|
|Schedule\n: PRN i...|
|Description\nIntr...|
|Commercial accoun...|
|Address:\nUSA-CT-...|
|Description\nOur\...|
|Company Descripti...|
|An exciting oppor...|
|Job Details:\nJob...|
|Our\nRestaurant T...|
+--------------------+
only showing top 10 rows



#### Giving Id for each row

In [20]:
indexed_rdd = Job_Dataset.rdd.zipWithIndex()
Job_Dataset = indexed_rdd.map(lambda x: (x[1], x[0][0])).toDF(["Id", "job_summary"])

In [21]:
Job_Dataset.show(5)

+---+--------------------+
| Id|         job_summary|
+---+--------------------+
|  0|Rock N Roll Sushi...|
|  1|Schedule\n: PRN i...|
|  2|Description\nIntr...|
|  3|Commercial accoun...|
|  4|Address:\nUSA-CT-...|
+---+--------------------+
only showing top 5 rows



In [22]:
Job_df = Job_Dataset

In [23]:
Job_df.show(n = 5)

+---+--------------------+
| Id|         job_summary|
+---+--------------------+
|  0|Rock N Roll Sushi...|
|  1|Schedule\n: PRN i...|
|  2|Description\nIntr...|
|  3|Commercial accoun...|
|  4|Address:\nUSA-CT-...|
+---+--------------------+
only showing top 5 rows



In [24]:
# checking missing values in the columns
Job_df.select([count(when(isnan(c), c)).alias(c) for c in Job_df.columns]).show()

+---+-----------+
| Id|job_summary|
+---+-----------+
|  0|          0|
+---+-----------+



In [25]:
#count distinct values in each column
Job_df.select([countDistinct(c).alias(c) for c in Job_df.columns]).show()

+----+-----------+
|  Id|job_summary|
+----+-----------+
|5000|       4069|
+----+-----------+



### Duplicates check

In [26]:
# show duplicates in Body column
Job_df.groupBy("job_summary").count().filter("count > 1").show()

+--------------------+-----+
|         job_summary|count|
+--------------------+-----+
|At Five Below our...|    5|
|Job Title:\nCerti...|   10|
|Salary:\n$48,604....|    2|
|If you enjoy work...|    3|
|If youÃ¢ÂÂve ev...|    6|
|FULL-TIME LICENSE...|    6|
|Summary\nThis pos...|    2|
|Patrol Officer - ...|    2|
|We are looking fo...|    3|
|Description\nOur ...|    3|
|Job Description\n...|    4|
|Full Time Corpora...|    2|
|Hospital Site Sec...|    4|
|Responsibilities\...|    2|
|In a world where ...|    3|
|Come Join The Lea...|    2|
|Title: - Identity...|    2|
|With significant ...|    3|
|Description\nOur ...|    3|
|Description\nHow ...|    3|
+--------------------+-----+
only showing top 20 rows



In [27]:
# Filter the rows where 'job_summary' starts with 'Job Title:\nCerti'
filtered_rows = Job_df.filter(col("job_summary").startswith("Job Title:\nCertified Nursing Assistant (CNA)\nCompany"))
filtered_rows.show()

+----+--------------------+
|  Id|         job_summary|
+----+--------------------+
|1319|Job Title:\nCerti...|
|1586|Job Title:\nCerti...|
|1613|Job Title:\nCerti...|
|1713|Job Title:\nCerti...|
|1729|Job Title:\nCerti...|
|1763|Job Title:\nCerti...|
|1765|Job Title:\nCerti...|
|1792|Job Title:\nCerti...|
|1847|Job Title:\nCerti...|
|2285|Job Title:\nCerti...|
+----+--------------------+



In [28]:
# ID number 1319
row_with_id_1319 = Job_df.filter(Job_df['ID'] == 1319).collect()

txt = row_with_id_1319[0][1:][0]
txt

"Job Title:\nCertified Nursing Assistant (CNA)\nCompany:\nTilloHealth, a division of Tillotek Staffing Solutions\nPay Range:\nCompetitive salary based on experience\nShifts/Hours:\n1st, 2nd, and 3rd Shifts may be available depending on time of application\nEmployment Type:\nFull-Time/Part-Time/Per Diem availability\nOpportunity Type:\nPerm, Contract, and Travel may be available\nJob Summary\nAs a Certified Nursing Assistant (CNA) at Tillotek Staffing Solutions, you will play a crucial role in providing essential care and support to patients in various healthcare settings. CNAs are responsible for assisting patients with activities of daily living, such as bathing, dressing, and feeding, and ensuring their comfort and well-being. Join our team of dedicated healthcare professionals and make a meaningful difference in the lives of patients.\nResponsibilities\nAssist patients with daily tasks, including bathing, grooming, and dressing.\nProvide basic medical care, such as checking vital si

In [29]:
# ID number 1586
row_with_id_1586 = Job_df.filter(Job_df['ID'] == 1586).collect()

txt2 = row_with_id_1586[0][1:][0]
txt2

"Job Title:\nCertified Nursing Assistant (CNA)\nCompany:\nTilloHealth, a division of Tillotek Staffing Solutions\nPay Range:\nCompetitive salary based on experience\nShifts/Hours:\n1st, 2nd, and 3rd Shifts may be available depending on time of application\nEmployment Type:\nFull-Time/Part-Time/Per Diem availability\nOpportunity Type:\nPerm, Contract, and Travel may be available\nJob Summary\nAs a Certified Nursing Assistant (CNA) at Tillotek Staffing Solutions, you will play a crucial role in providing essential care and support to patients in various healthcare settings. CNAs are responsible for assisting patients with activities of daily living, such as bathing, dressing, and feeding, and ensuring their comfort and well-being. Join our team of dedicated healthcare professionals and make a meaningful difference in the lives of patients.\nResponsibilities\nAssist patients with daily tasks, including bathing, grooming, and dressing.\nProvide basic medical care, such as checking vital si

In [30]:
# Checking Equality of texts
if txt == txt2:
  print('Equal')
else:
  print('Not Equal')

Equal


### Delete Duplicates Document

In [31]:
# Optionally, delete duplicates based on a specific column
Job_df = Job_df.dropDuplicates(['job_summary'])

In [32]:
#count distinct values in each column
Job_df.select([countDistinct(c).alias(c) for c in Job_df.columns]).show()

+----+-----------+
|  Id|job_summary|
+----+-----------+
|4069|       4069|
+----+-----------+



In [33]:
# checking again duplicates
Job_df.groupBy("job_summary").count().filter("count > 1").show()

+-----------+-----+
|job_summary|count|
+-----------+-----+
+-----------+-----+



In [34]:
row_with_id_1586 = Job_df.filter(Job_df['ID'] == 1586).collect()
row_with_id_1586

[]

# Text cleaning and pre-processing

In [35]:
Job_df = Job_df.select('Id',"job_summary")
# questions_body.show(n = 10)

### LoweCasing Text

In [36]:
Job_df = Job_df.withColumn('job_summary', lower(Job_df['job_summary']))

### Remove HTML Tags

In [37]:
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text) if text else text

# Register the function as a UDF
remove_html_tags_udf = udf(remove_html_tags, StringType())

# Apply the UDF to the job_summary column
Job_df = Job_df.withColumn('job_summary', remove_html_tags_udf(Job_df['job_summary']))

In [38]:
# Job_df.show(n = 5 , truncate=False)

###  Remove URLs

In [39]:
# Here We also Use Regular Expressions to Remove URLs from Text or Whole Corpus.
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

remove_url_udf = udf(remove_url, StringType())
Job_df = Job_df.withColumn('job_summary', remove_url_udf(Job_df['job_summary']))

In [40]:
# Job_df.show(n = 5 , truncate=False)

### Remove Punctuations

In [41]:
import string
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))


remove_punctuation_udf = udf(remove_punctuation, StringType())
Job_df = Job_df.withColumn('job_summary', remove_punctuation_udf(Job_df['job_summary']))

In [42]:
# Job_df.show(n = 5 , truncate=False)

### Remove numbers

The following document has aroud 42 different number inside it, so we need to delet them.
3x12 , 180000060000, 12003, 0, 4, 02142024, 05152024, 13, 556166975, 56166975 , 12 , 7 , 7, 100 , 133, 3467, 68100, 10 , 25, 50 , 100,
100 , 20 , 3, 2, , 1, 0, 100, 15, 15, 15, 91, 401,36, 50, 2023, 2022, 2021 ,2020, 2019.

In [43]:
# row_with_id_160 = Job_df.filter(Job_df['ID'] == 160).collect()
# row_with_id_160

In [44]:
def remove_numbers(text):
    pattern = re.compile(r'\d+')
    return pattern.sub(r'', text)

remove_numbers_udf = udf(remove_numbers, StringType())
Job_df = Job_df.withColumn('job_summary', remove_numbers_udf(Job_df['job_summary']))

In [45]:
# row_with_id_160 = Job_df.filter(Job_df['ID'] == 160).collect()
# row_with_id_160

### Remove Non-ASCII characters:
Some texts have some non-ASCII characters like (ã°â\x9fâ\x9fâ¡), so we need to delete them from texts

In [46]:
row_with_id_915 = Job_df.filter(Job_df['ID'] == 915).collect()
row_with_id_915

In [47]:
def remove_non_ascii(text):
    if text is None:
        return None
    return re.sub(r'[^\x00-\x7F]+', '', str(text))

remove_non_ascii_udf = udf(remove_non_ascii, StringType())
Job_df = Job_df.withColumn('job_summary', remove_non_ascii_udf(Job_df['job_summary']))

In [48]:
# checking again non-ASCII characters 
row_with_id_915 = Job_df.filter(Job_df['ID'] == 915).collect()
row_with_id_915

### Remove extra space

In [49]:
from pyspark.sql.functions import regexp_replace, col, trim

def remove_extra_spaces(df, column_name):
    df = df.withColumn(column_name, regexp_replace(col(column_name), "\\s+", " "))  
    return df.withColumn(column_name, trim(col(column_name))) 

In [50]:
Job_df = remove_extra_spaces(Job_df, "job_summary")

In [51]:
Job_df.show(5)

+----+--------------------+
|  Id|         job_summary|
+----+--------------------+
| 160|profession regist...|
| 349|at five below our...|
| 589|to apply for this...|
|1319|job title certifi...|
|1324|come join the lea...|
+----+--------------------+
only showing top 5 rows



### Tokenization

In [52]:
tokenizer = Tokenizer().setInputCol("job_summary").setOutputCol("Tokens")
Job_df = tokenizer.transform(Job_df)
Job_df.show(5)

+----+--------------------+--------------------+
|  Id|         job_summary|              Tokens|
+----+--------------------+--------------------+
| 160|profession regist...|[profession, regi...|
| 349|at five below our...|[at, five, below,...|
| 589|to apply for this...|[to, apply, for, ...|
|1319|job title certifi...|[job, title, cert...|
|1324|come join the lea...|[come, join, the,...|
+----+--------------------+--------------------+
only showing top 5 rows



### Removing Stopwords

In [53]:
# removing stopwords using default list
remove_stopwords = StopWordsRemover()
stopwords = remove_stopwords.getStopWords()
print(stopwords[:10])
print(len(stopwords))

remove_stopwords.setInputCol("Tokens").setOutputCol("Tokens stopwords removed")
Job_df = remove_stopwords.transform(Job_df)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']
181


In [54]:
# # counting the number of tokens with stopwords removed
Job_df = Job_df.withColumn("Number of tokens", size(Job_df['Tokens']))
Job_df = Job_df.withColumn("Number of tokens After stopwords removed", size(Job_df['Tokens stopwords removed']))
# counting the number of tokens with stopwords removed
Job_df = Job_df.withColumn("Number of stopwords removed", size(Job_df['Tokens stopwords removed']) - size(Job_df['Tokens']) )

In [55]:
Job_df.show(10)

+----+--------------------+--------------------+------------------------+----------------+----------------------------------------+---------------------------+
|  Id|         job_summary|              Tokens|Tokens stopwords removed|Number of tokens|Number of tokens After stopwords removed|Number of stopwords removed|
+----+--------------------+--------------------+------------------------+----------------+----------------------------------------+---------------------------+
| 160|profession regist...|[profession, regi...|    [profession, regi...|            1917|                                    1366|                       -551|
| 349|at five below our...|[at, five, below,...|    [five, growth, re...|             800|                                     497|                       -303|
| 589|to apply for this...|[to, apply, for, ...|    [apply, job, clic...|             269|                                     151|                       -118|
|1319|job title certifi...|[job, title, 

###  Join the words

To join the words back together after tokenization and stopword removal, you can use the concat_ws function provided by PySpark. Here’s how you can do it:

In [56]:
from pyspark.sql.functions import concat_ws

Job_df = Job_df.withColumn("Cleaned_text", concat_ws(" ", col("Tokens stopwords removed")))

In [57]:
Job_df.select("Cleaned_text").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Final dataset

In [58]:
Job_df_proces = Job_df.select('Id', "Cleaned_text")

In [60]:
Job_df_proces.show()

+----+--------------------+
|  Id|        Cleaned_text|
+----+--------------------+
| 160|profession regist...|
| 349|five growth resul...|
| 589|apply job click l...|
|1319|job title certifi...|
|1324|come join leader ...|
|1803|resort lifestyle ...|
|1928|manager radiology...|
|1936|description team ...|
|1971|overview industry...|
|2161|position summary ...|
|2408|dollar general co...|
|2493|baptist health co...|
|3063|job title dental ...|
|3525|company overview ...|
|4015|salary pay grade ...|
|4101|want learn role j...|
|4133|business travel c...|
|4427|doccafe immediate...|
|  65|please review ful...|
| 230|enjoy working par...|
+----+--------------------+
only showing top 20 rows



### Creating Shingles

In [61]:
def shingle(text, k):
    shingles = set()
    words = text.split()
    for i in range(len(words) - k + 1):
        shingles.add(' '.join(words[i:i+k]))
    return list(shingles)

In [62]:
k = 2  # Shingle length
shingle_udf = udf(lambda text: shingle(text, k), ArrayType(StringType()))
Job_df_proces = Job_df_proces.withColumn("shingles", shingle_udf(col("Cleaned_text")))

In [63]:
Job_df_proces.select("shingles").show(5)

+--------------------+
|            shingles|
+--------------------+
|[organizational v...|
|[physical invento...|
|[skills must, ski...|
|[cnas responsible...|
|[training leaders...|
+--------------------+
only showing top 5 rows



#### convert shingles to sparse vectors

In [65]:
# Flatten the shingles column to get all unique shingles
unique_shingles = Job_df_proces.select(explode("shingles").alias("shingle")).distinct().collect()
shingle_index = {row["shingle"]: idx for idx, row in enumerate(unique_shingles)}

print("Unique shingles and their indices:")
print(shingle_index)

def shingles_to_sparse_vector(shingles):
    indices = sorted([shingle_index[sh] for sh in shingles if sh in shingle_index])
    values = [1.0] * len(indices)
    return Vectors.sparse(len(unique_shingles), indices, values)

# UDF to convert shingles to sparse vectors
sparse_vector_udf = udf(lambda shingles: shingles_to_sparse_vector(shingles), VectorUDT())

Job_df_proces = Job_df_proces.withColumn("features", sparse_vector_udf(col("shingles")))

Unique shingles and their indices:


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [66]:
# Show the DataFrame with sparse vectors
Job_df_proces.select("features").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [67]:
df = Job_df_proces
# df.show(truncate=False)

## Implementing MinHashLSH

In [68]:
start = time.time()

# Initialize MinHashLSH
mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345, numHashTables=20)
model = mh.fit(df)
print("The hashed dataset where hashed values are stored in the column 'hashes':")
hash = model.transform(df)

# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join to Calculate Jaccard Distances.
result = model.approxSimilarityJoin(hash, hash, 0.6, distCol="JaccardDistance").select(
    col("datasetA.id").alias("idA"),
    col("datasetB.id").alias("idB"),
    col("JaccardDistance")
)

# Filter out self-pairs and display the results
result_filtered = result.filter("idA < idB")

end = time.time()
computation_time = round(end - start, 3)
print("Computation time: {} seconds".format(computation_time))

print('------------------------------------')

The hashed dataset where hashed values are stored in the column 'hashes':
Computation time: 5.51 seconds
------------------------------------


In [None]:
type(result_filtered)

In [69]:
result_filtered.show()

+----+----+--------------------+
| idA| idB|     JaccardDistance|
+----+----+--------------------+
|3823|3828|  0.3668261562998405|
|1774|2361|0.024691358024691357|
|2045|2337|0.012422360248447228|
|3632|3734| 0.03988603988603989|
|3559|4320|0.039408866995073843|
|4462|4564| 0.06829268292682922|
|3936|4732| 0.05791505791505791|
| 158|1136| 0.04664723032069973|
| 158| 546|0.014705882352941124|
|1662|1810| 0.38329238329238324|
|1139|2067|  0.1923076923076923|
|4167|4427| 0.11111111111111116|
|3216|3386|  0.5738498789346247|
| 328|2775| 0.06629834254143652|
|1774|2043|                 0.0|
|1623|2109|0.021671826625387025|
|2105|2339| 0.26388888888888884|
| 583|1069| 0.03159851301115246|
|3015|3528| 0.02298850574712641|
|3015|3126| 0.05428571428571427|
+----+----+--------------------+
only showing top 20 rows



In [71]:
# showing id pairs with distance < 0.6 sorted in ascending order
result_filtered.sort(result_filtered.JaccardDistance.asc()).show(10)

+----+----+---------------+
| idA| idB|JaccardDistance|
+----+----+---------------+
| 497|1063|            0.0|
| 511|1084|            0.0|
|1063|1084|            0.0|
| 615| 963|            0.0|
| 544| 764|            0.0|
| 764|1063|            0.0|
|1940|3596|            0.0|
| 377| 558|            0.0|
| 118| 775|            0.0|
| 118| 511|            0.0|
+----+----+---------------+
only showing top 10 rows



In [70]:
result_filtered.sort(result_filtered.JaccardDistance.desc()).show(10)

+----+----+------------------+
| idA| idB|   JaccardDistance|
+----+----+------------------+
|3284|4955|0.5998985801217038|
|4051|4261|0.5997536945812808|
|3187|4189|0.5995115995115995|
|3771|4880| 0.599294947121034|
|3817|3882| 0.599224305106658|
|3771|4808| 0.599121361889072|
|1715|2249|0.5988235294117648|
|2857|3735|0.5986984815618221|
|3735|3810|0.5984807379272925|
|3284|3801|0.5984251968503937|
+----+----+------------------+
only showing top 10 rows



#### save the result

In [89]:
# save the result to a file
start = time.time()

size = 5000
result_path = f"/content/gdrive/My Drive/Massive_Data_Project/Result/results_{size}.csv"
result_filtered.write.csv(result_path, header=True)
print('------------- Result Saved ---------------')

end = time.time()
computation_time = round(end - start, 3)
print("Computation time: {} seconds".format(computation_time))
print('------------------------------------')

------------- Result Saved ---------------
Computation time: 256.858 seconds
------------------------------------


#### Load the result

In [90]:
# Load the result back from the saved CSV file
size = 5000
loaded_result_path = f"/content/gdrive/My Drive/Massive_Data_Project/Result/results_{size}.csv"
loaded_result = spark.read.csv(loaded_result_path, header=True, inferSchema=True)

In [None]:
loaded_result.sort(loaded_result.JaccardDistance.asc()).show(10)

+---+----+---------------+
|idA| idB|JaccardDistance|
+---+----+---------------+
|654| 850|            0.0|
|503| 764|            0.0|
|654| 949|            0.0|
|654| 998|            0.0|
|511| 615|            0.0|
|654| 662|            0.0|
|615| 764|            0.0|
|870|2999|            0.0|
|615| 726|            0.0|
|870|3629|            0.0|
+---+----+---------------+
only showing top 10 rows



In [91]:
# Filter the results to show only rows with JaccardDistance between 0.2 and 0.3
filtered_result = loaded_result.filter((loaded_result.JaccardDistance >= 0.2) & (loaded_result.JaccardDistance <= 0.3))
filtered_result.sort(filtered_result.JaccardDistance.asc()).show(10)

+----+----+-------------------+
| idA| idB|    JaccardDistance|
+----+----+-------------------+
|1909|3014| 0.2005494505494505|
|3058|3148| 0.2008196721311475|
|2023|2036|0.20105820105820105|
|1602|2205|0.20110192837465568|
|2205|2223|0.20110192837465568|
| 553|1909|0.20110192837465568|
| 289|2572|  0.201641266119578|
|2572|3421|  0.201641266119578|
|3646|4840|0.20170454545454541|
|3451|3888|0.20170454545454541|
+----+----+-------------------+
only showing top 10 rows



#### Result of minhash function (hash values)

In [86]:
hash.show()

+----+--------------------+--------------------+--------------------+--------------------+
|  Id|        Cleaned_text|            shingles|            features|              hashes|
+----+--------------------+--------------------+--------------------+--------------------+
| 160|profession regist...|[organizational v...|(558004,[0,1,2,3,...|[[817945.0], [849...|
| 349|five growth resul...|[physical invento...|(558004,[9,10,11,...|[[1245090.0], [64...|
| 589|apply job click l...|[skills must, ski...|(558004,[7,6873,6...|[[3.568234E7], [1...|
|1319|job title certifi...|[cnas responsible...|(558004,[12,1727,...|[[1.6176972E7], [...|
|1324|come join leader ...|[training leaders...|(558004,[13,14,17...|[[1.1126859E7], [...|
|1803|resort lifestyle ...|[considerate mann...|(558004,[15,1732,...|[[3884814.0], [44...|
|1928|manager radiology...|[crt standards, d...|(558004,[16,17,18...|[[43317.0], [5842...|
|1936|description team ...|[transform health...|(558004,[24,1734,...|[[7842996.0], [33...|

#### Sparce vector for first document , id = 160

In [87]:
hash.first()['features']

SparseVector(558004, {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0, 1719: 1.0, 1720: 1.0, 1721: 1.0, 1722: 1.0, 1723: 1.0, 3422: 1.0, 3423: 1.0, 3424: 1.0, 3425: 1.0, 3426: 1.0, 3427: 1.0, 3428: 1.0, 3429: 1.0, 5172: 1.0, 5173: 1.0, 5174: 1.0, 5175: 1.0, 5176: 1.0, 6856: 1.0, 6857: 1.0, 6858: 1.0, 6859: 1.0, 6860: 1.0, 6861: 1.0, 6862: 1.0, 6863: 1.0, 6864: 1.0, 6865: 1.0, 6866: 1.0, 6867: 1.0, 6868: 1.0, 6869: 1.0, 6870: 1.0, 8578: 1.0, 8579: 1.0, 8580: 1.0, 8581: 1.0, 8582: 1.0, 8583: 1.0, 8584: 1.0, 8585: 1.0, 10339: 1.0, 10340: 1.0, 10341: 1.0, 10342: 1.0, 12047: 1.0, 12048: 1.0, 12049: 1.0, 12050: 1.0, 12051: 1.0, 12052: 1.0, 12053: 1.0, 12054: 1.0, 13768: 1.0, 13769: 1.0, 13770: 1.0, 13771: 1.0, 15414: 1.0, 15415: 1.0, 15416: 1.0, 17037: 1.0, 17038: 1.0, 17039: 1.0, 17040: 1.0, 17041: 1.0, 18667: 1.0, 18668: 1.0, 18669: 1.0, 20296: 1.0, 20297: 1.0, 20298: 1.0, 20299: 1.0, 20300: 1.0, 20301: 1.0, 20302: 1.0, 20303: 1.0, 20304: 1.0, 20305: 1.0, 21994: 1.0,

#### Signature vector for first document, id = 160
Values inside DenseVector shows value of each hash function.

In [88]:
# hash value of first Document
hash.first()['hashes']

[DenseVector([817945.0]),
 DenseVector([84975.0]),
 DenseVector([1314580.0]),
 DenseVector([792147.0]),
 DenseVector([1059460.0]),
 DenseVector([3875662.0]),
 DenseVector([641826.0]),
 DenseVector([482020.0]),
 DenseVector([1434890.0]),
 DenseVector([1621684.0]),
 DenseVector([2809550.0]),
 DenseVector([413624.0]),
 DenseVector([938124.0]),
 DenseVector([1989136.0]),
 DenseVector([726703.0]),
 DenseVector([5337888.0]),
 DenseVector([546638.0]),
 DenseVector([2574512.0]),
 DenseVector([85776.0]),
 DenseVector([2619402.0])]

### Creating New dataframe in order to compare pair document with each other

In [72]:
df_compare = Job_df.select('Id', "Tokens stopwords removed")
df_compare.show(5)

+----+------------------------+
|  Id|Tokens stopwords removed|
+----+------------------------+
| 160|    [profession, regi...|
| 349|    [five, growth, re...|
| 589|    [apply, job, clic...|
|1319|    [job, title, cert...|
|1324|    [come, join, lead...|
+----+------------------------+
only showing top 5 rows



In [73]:
from pyspark.sql import DataFrame

# Filter the dataset to get the row with the given ID number
def analyze_text_by_id(df: DataFrame, id_number: int):
    row_with_id = df.filter(df['ID'] == id_number).collect()
    print(row_with_id)

    if not row_with_id:
        print(f"No row found with ID {id_number}")
        return

    txt = row_with_id[0][1:][0]

    print(f"Type of txt: {type(txt)}")
    print(f"Length of txt: {len(txt)}")
    print(f"Fourth character in txt: {txt[3]}")

    return txt

In [74]:
# Function to calculate the number of words, common words, and percentage of common words
def analyze_lists(tokens1, tokens2):
    tokens1 = [word for word in tokens1 if word]
    tokens2 = [word for word in tokens2 if word]

    num_words_list1 = len(tokens1)
    num_words_list2 = len(tokens2)
    num_unique_words_list1 = len(set(tokens1))
    num_unique_words_list2 = len(set(tokens2))

    common_words = set(tokens1).intersection(tokens2)
    num_common_words = len(common_words)

    percentage_common_list1 = (num_common_words / num_unique_words_list1) * 100 if num_unique_words_list1 > 0 else 0
    percentage_common_list2 = (num_common_words / num_unique_words_list2) * 100 if num_unique_words_list2 > 0 else 0

    return (num_words_list1, num_words_list2, num_unique_words_list1,
            num_unique_words_list2, num_common_words,
            percentage_common_list1, percentage_common_list2)

### Comparing the Documents with 'ID' number of 1909 & 3014, which have Jaccard distance equal to 0.20



In [75]:
# Filter the dataset to get the row with ID number 1909
txt1 = analyze_text_by_id(df_compare, 1909)

[Row(Id=1909, Tokens stopwords removed=['interested', 'making', 'positive', 'impact', 'youve', 'come', 'right', 'place', 'fusion', 'medical', 'staffing', 'goal', 'improve', 'lives', 'everyone', 'touch', 'always', 'looking', 'people', 'like', 'join', 'mission', 'making', 'difference', 'isnt', 'perk', 'traveling', 'us', 'start', 'medical', 'travel', 'career', 'fusion', 'medical', 'staffing', 'gain', 'access', 'competitive', 'pay', 'packages', 'comprehensive', 'benefits', 'corporate', 'discounts', 'perks', 'clinical', 'team', 'support', 'along', 'journey', 'recruiter', 'determined', 'help', 'succeed', 'weve', 'got', 'back', 'focus', 'best', 'helping', 'others', 'certified', 'nursing', 'assistant', 'skilled', 'nursing', 'facility', 'position', 'certified', 'nursing', 'assistant', 'specialty', 'skilled', 'nursing', 'facility', 'week', 'skilled', 'nursing', 'facility', 'certified', 'nursing', 'assistant', 'travel', 'assignment', 'client', 'de', 'smet', 'sd', 'looking', 'skilled', 'nursing', 

In [76]:
# Filter the dataset to get the row with ID number 3014
txt2 = analyze_text_by_id(df_compare, 3014)

[Row(Id=3014, Tokens stopwords removed=['interested', 'making', 'positive', 'impact', 'youve', 'come', 'right', 'place', 'fusion', 'medical', 'staffing', 'goal', 'improve', 'lives', 'everyone', 'touch', 'always', 'looking', 'people', 'like', 'join', 'mission', 'making', 'difference', 'isnt', 'perk', 'traveling', 'us', 'start', 'medical', 'travel', 'career', 'fusion', 'medical', 'staffing', 'gain', 'access', 'competitive', 'pay', 'packages', 'comprehensive', 'benefits', 'corporate', 'discounts', 'perks', 'clinical', 'team', 'support', 'along', 'journey', 'recruiter', 'determined', 'help', 'succeed', 'weve', 'got', 'back', 'focus', 'best', 'helping', 'others', 'technician', 'patient', 'care', 'position', 'technician', 'specialty', 'patient', 'care', 'week', 'patient', 'care', 'technician', 'travel', 'assignment', 'client', 'hackensack', 'nj', 'looking', 'patient', 'care', 'technician', 'help', 'weeks', 'fusion', 'medical', 'truly', 'believe', 'people', 'taking', 'care', 'people', 'togeth

In [77]:
# Comparing number of common words inside Documents with 'ID' number of 1909 & 3014

(num_words_list1, num_words_list2, num_unique_words_list1,
 num_unique_words_list2, num_common_words,
 percentage_common_list1, percentage_common_list2) = analyze_lists(txt1, txt2)

print(f"Number of words in Text_1: {num_words_list1}")
print(f"Number of words in Text_2: {num_words_list2}")
print(f"Number of Unique words in Text_1: {num_unique_words_list1}")
print(f"Number of Unique words in Text_2: {num_unique_words_list2}")
print(f"Number of common Uniqe words: {num_common_words}")
print(f"Percentage of common words in Text_1: {percentage_common_list1:.2f}%")
print(f"Percentage of common words in Text_2: {percentage_common_list2:.2f}%")

Number of words in Text_1: 344
Number of words in Text_2: 361
Number of Unique words in Text_1: 249
Number of Unique words in Text_2: 260
Number of common Uniqe words: 240
Percentage of common words in Text_1: 96.39%
Percentage of common words in Text_2: 92.31%


### Comparing the Documents with 'ID' number of 3284 & 4955, which have Jaccard distance equal to 0.59

In [78]:
# Filter the dataset to get the row with ID number 3284
txt5 = analyze_text_by_id(df_compare, 3284)

[Row(Id=3284, Tokens stopwords removed=['summary', 'position', 'eligible', 'education', 'debt', 'reduction', 'program', 'edrp', 'student', 'loan', 'payment', 'reimbursement', 'program', 'must', 'meet', 'specific', 'individual', 'eligibility', 'requirements', 'accordance', 'vha', 'policy', 'submit', 'edrp', 'application', 'within', 'four', 'months', 'appointment', 'approval', 'award', 'amount', 'eligibility', 'period', 'one', 'five', 'years', 'determined', 'vha', 'education', 'loan', 'repayment', 'services', 'program', 'office', 'complete', 'review', 'edrp', 'application', 'learn', 'agency', 'help', 'duties', 'staff', 'psychologist', 'position', 'fulltime', 'position', 'supporting', 'alcohol', 'drug', 'treatment', 'program', 'adtp', 'administered', 'mental', 'health', 'service', 'line', 'mhsl', 'located', 'grounds', 'va', 'boston', 'healthcare', 'system', 'vabhs', 'brockton', 'division', 'psychologist', 'provides', 'direct', 'care', 'services', 'including', 'consultation', 'diagnostic',

In [79]:
# Filter the dataset to get the row with ID number 4955
txt6 = analyze_text_by_id(df_compare, 4955)

[Row(Id=4955, Tokens stopwords removed=['summary', 'position', 'eligible', 'education', 'debt', 'reduction', 'program', 'edrp', 'student', 'loan', 'payment', 'reimbursement', 'program', 'must', 'meet', 'specific', 'individual', 'eligibility', 'requirements', 'accordance', 'vha', 'policy', 'submit', 'edrp', 'application', 'within', 'four', 'months', 'appointment', 'approval', 'award', 'amount', 'eligibility', 'period', 'one', 'five', 'years', 'determined', 'vha', 'education', 'loan', 'repayment', 'services', 'program', 'office', 'complete', 'review', 'edrp', 'application', 'learn', 'agency', 'help', 'duties', 'open', 'continuous', 'announcement', 'remain', 'open', 'april', 'initial', 'cutoff', 'date', 'referral', 'eligible', 'applications', 'february', 'eligible', 'applications', 'received', 'date', 'referred', 'regular', 'intervals', 'additional', 'vacancies', 'occur', 'asneeded', 'basis', 'position', 'filled', 'relocationrecruitment', 'incentives', 'authorized', 'duties', 'include', '

In [80]:
# Comparing number of common words inside Documents with 'ID' number of 3284 & 4955

(num_words_list1, num_words_list2, num_unique_words_list1,
 num_unique_words_list2, num_common_words,
 percentage_common_list1, percentage_common_list2) = analyze_lists(txt5, txt6)

print(f"Number of words in Text_1: {num_words_list1}")
print(f"Number of words in Text_2: {num_words_list2}")
print(f"Number of Unique words in Text_1: {num_unique_words_list1}")
print(f"Number of Unique words in Text_2: {num_unique_words_list2}")
print(f"Number of common Uniqe words: {num_common_words}")
print(f"Percentage of common words in Text_1: {percentage_common_list1:.2f}%")
print(f"Percentage of common words in Text_2: {percentage_common_list2:.2f}%")

Number of words in Text_1: 1876
Number of words in Text_2: 1387
Number of Unique words in Text_1: 718
Number of Unique words in Text_2: 634
Number of common Uniqe words: 481
Percentage of common words in Text_1: 66.99%
Percentage of common words in Text_2: 75.87%


### Comparing the Documents with 'ID' number of 503 & 948, which have Jaccard distance equal to 0.



In [81]:
# Filter the dataset to get the row with ID number 503
txt3 = analyze_text_by_id(df_compare, 503)

[Row(Id=503, Tokens stopwords removed=['description', 'restaurant', 'teamshift', 'leaders', 'dual', 'role', 'youll', 'serve', 'restaurant', 'leader', 'team', 'member', 'leader', 'youll', 'work', 'closely', 'restaurant', 'manager', 'ensuring', 'operating', 'procedures', 'followed', 'youll', 'also', 'assist', 'scheduling', 'training', 'supervising', 'team', 'members', 'ensure', 'customer', 'enjoys', 'hot', 'freshlyprepared', 'product', 'using', 'highest', 'quality', 'ingredients', 'served', 'comfortable', 'clean', 'friendly', 'environment', 'whats', 'competitive', 'weekly', 'pay', 'hour', 'schedule', 'flexibility', 'dayeveningovernight', 'shifts', 'discounted', 'meals', 'opportunities', 'career', 'development', 'growth', 'whataburger', 'family', 'foundation', 'scholarship', 'program', 'medical', 'dental', 'vision', 'plans', 'k', 'savings', 'plans', 'whatagames', 'ask', 'us', 'people', 'make', 'difference', 'whataburger', 'take', 'pride', 'work', 'take', 'care', 'love', 'serving', 'custom

In [82]:
# Filter the dataset to get the row with ID number 948
txt4 = analyze_text_by_id(df_compare, 948)

[Row(Id=948, Tokens stopwords removed=['description', 'restaurant', 'teamshift', 'leaders', 'dual', 'role', 'youll', 'serve', 'restaurant', 'leader', 'team', 'member', 'leader', 'youll', 'work', 'closely', 'restaurant', 'manager', 'ensuring', 'operating', 'procedures', 'followed', 'youll', 'also', 'assist', 'scheduling', 'training', 'supervising', 'team', 'members', 'ensure', 'customer', 'enjoys', 'hot', 'freshlyprepared', 'product', 'using', 'highest', 'quality', 'ingredients', 'served', 'comfortable', 'clean', 'friendly', 'environment', 'whats', 'competitive', 'weekly', 'pay', 'hour', 'schedule', 'flexibility', 'dayeveningovernight', 'shifts', 'discounted', 'meals', 'opportunities', 'career', 'development', 'growth', 'whataburger', 'family', 'foundation', 'scholarship', 'program', 'medical', 'dental', 'vision', 'plans', 'k', 'savings', 'plans', 'whatagames', 'ask', 'us', 'people', 'make', 'difference', 'whataburger', 'take', 'pride', 'work', 'take', 'care', 'love', 'serving', 'custom

In [83]:
# Comparing number of common words inside Documents with 'ID' number of 503 & 948

(num_words_list1, num_words_list2, num_unique_words_list1,
 num_unique_words_list2, num_common_words,
 percentage_common_list1, percentage_common_list2) = analyze_lists(txt3, txt4)

print(f"Number of words in Text_1: {num_words_list1}")
print(f"Number of words in Text_2: {num_words_list2}")
print(f"Number of Unique words in Text_1: {num_unique_words_list1}")
print(f"Number of Unique words in Text_2: {num_unique_words_list2}")
print(f"Number of common Uniqe words: {num_common_words}")
print(f"Percentage of common words in Text_1: {percentage_common_list1:.2f}%")
print(f"Percentage of common words in Text_2: {percentage_common_list2:.2f}%")

Number of words in Text_1: 346
Number of words in Text_2: 346
Number of Unique words in Text_1: 268
Number of Unique words in Text_2: 268
Number of common Uniqe words: 268
Percentage of common words in Text_1: 100.00%
Percentage of common words in Text_2: 100.00%


## Cheking the Equality of documents:
#### Cheking the Equality of documents with ['ID'] number 503 & 948, which they have Jaccard Distance equal to Zero.

In [84]:
# ID number 503
row_with_id_503 = Job_Dataset.filter(Job_Dataset['ID'] == 503).collect()
txt = row_with_id_503[0][1:][0]
txt

"Description\nOur\nRestaurant Team/Shift Leaders\nhave a dual role - youÃ¢Â\x80Â\x99ll serve as both a restaurant leader and a team member. As a leader, youÃ¢Â\x80Â\x99ll work closely with the Restaurant Manager ensuring all operating procedures are followed. YouÃ¢Â\x80Â\x99ll also assist with scheduling, training and supervising Team Members to ensure each customer enjoys a hot, freshly-prepared product using the highest quality ingredients served in a comfortable, clean, friendly environment.\nWhat's In It For You\nCompetitive Weekly Pay\n$15.25 - $17 / hour\nSchedule Flexibility Ã¢Â\x80Â\x93 Day/Evening/Overnight Shifts\nDiscounted Meals\nOpportunities for Career Development and Growth\nWhataburger Family Foundation and Scholarship Program\nMedical, Dental and Vision Plans\n401K Savings Plans\nWhatagames (Ask us about this!)\nOur people make the difference at Whataburger. We take pride in our work, take care of each other and love serving our customers. Each and every day youÃ¢Â\x80

In [85]:
# ID number 948
row_with_id_948 = Job_Dataset.filter(Job_Dataset['ID'] == 948).collect()
txt2 = row_with_id_948[0][1:][0]
txt2

"Description\nOur\nRestaurant Team/Shift Leaders\nhave a dual role - youÃ¢Â\x80Â\x99ll serve as both a restaurant leader and a team member. As a leader, youÃ¢Â\x80Â\x99ll work closely with the Restaurant Manager ensuring all operating procedures are followed. YouÃ¢Â\x80Â\x99ll also assist with scheduling, training and supervising Team Members to ensure each customer enjoys a hot, freshly-prepared product using the highest quality ingredients served in a comfortable, clean, friendly environment.\nWhat's In It For You\nCompetitive Weekly Pay\n$16 - $17.50 / hour\nSchedule Flexibility Ã¢Â\x80Â\x93 Day/Evening/Overnight Shifts\nDiscounted Meals\nOpportunities for Career Development and Growth\nWhataburger Family Foundation and Scholarship Program\nMedical, Dental and Vision Plans\n401K Savings Plans\nWhatagames (Ask us about this!)\nOur people make the difference at Whataburger. We take pride in our work, take care of each other and love serving our customers. Each and every day youÃ¢Â\x80

In [None]:
if txt == txt2:
  print('Equal')
else:
  print('Not Equal')

Not Equal
