In [2]:
import pyspark 
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import col, lower, regexp_replace
from pyspark.ml.feature import Tokenizer

In [3]:

# Initialize Spark session
spark = SparkSession.builder.appName("EntityResolutionPreprocessing").getOrCreate()

# Sample data
data = [("1", "This is an example sentence."),
        ("2", "Another example with some punctuation!")]

# Define the schema
schema = ["id", "description"]

# Create a DataFrame
df = spark.createDataFrame(data, schema=schema)

# Display the initial data
print("Initial Data:")
df.show()

# Step 1: Tokenization
tokenizer = Tokenizer(inputCol="description", outputCol="tokens")
df = tokenizer.transform(df)

# Display data after tokenization
print("Data after Tokenization:")
df.select("description", "tokens").show(truncate=False)

# Step 2: Normalization (lowercasing and removing special characters)
df = df.withColumn("normalized_description", lower(col("description")))
df = df.withColumn("normalized_description", regexp_replace("normalized_description", "[^a-zA-Z0-9\\s]", ""))

# Display data after normalization
print("Data after Normalization:")
df.select("description", "normalized_description").show(truncate=False)

# Additional preprocessing steps can be added based on your requirements.

# Finally, perform entity resolution using the preprocessed data.

# Stop the Spark session
spark.stop()




Initial Data:
+---+--------------------+
| id|         description|
+---+--------------------+
|  1|This is an exampl...|
|  2|Another example w...|
+---+--------------------+

Data after Tokenization:
+--------------------------------------+--------------------------------------------+
|description                           |tokens                                      |
+--------------------------------------+--------------------------------------------+
|This is an example sentence.          |[this, is, an, example, sentence.]          |
|Another example with some punctuation!|[another, example, with, some, punctuation!]|
+--------------------------------------+--------------------------------------------+

Data after Normalization:
+--------------------------------------+-------------------------------------+
|description                           |normalized_description               |
+--------------------------------------+-------------------------------------+
|This is an exampl

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, expr

# Initialize Spark session
spark = SparkSession.builder.appName("SimilarityScores").getOrCreate()

# Sample data
data = [("1", "apple orange banana"),
        ("2", "orange banana kiwi"),
        ("3", "pear apple banana")]

# Define the schema
schema = ["id", "description"]

# Create a DataFrame
df = spark.createDataFrame(data, schema=schema)

# Display the initial data
print("Initial Data:")
df.show()

# Split the description into a list of words
df = df.withColumn("words", split(col("description"), " "))

# Define a function to compute Jaccard similarity
def jaccard_similarity(list1, list2):
    set1 = set(list1)
    set2 = set(list2)
    intersection_size = len(set1 & set2)
    union_size = len(set1 | set2)
    return intersection_size / union_size if union_size != 0 else 0.0

# Define a UDF for the Jaccard similarity function
spark.udf.register("jaccard_similarity", jaccard_similarity)

# Compute Jaccard similarity scores between records
df_similarity = df.alias("df1").join(df.alias("df2"), col("df1.id") < col("df2.id")) \
    .select(col("df1.id").alias("id1"),
            col("df2.id").alias("id2"),
            col("df1.words").alias("words1"),
            col("df2.words").alias("words2"),
            expr("jaccard_similarity(words1, words2)").alias("jaccard_similarity"))

# Display similarity scores
print("Similarity Scores:")
df_similarity.show()

# Stop the Spark session
spark.stop()


Initial Data:


                                                                                

+---+-------------------+
| id|        description|
+---+-------------------+
|  1|apple orange banana|
|  2| orange banana kiwi|
|  3|  pear apple banana|
+---+-------------------+

Similarity Scores:


24/01/19 14:25:13 WARN SimpleFunctionRegistry: The function jaccard_similarity replaced a previously registered function.
                                                                                

+---+---+--------------------+--------------------+------------------+
|id1|id2|              words1|              words2|jaccard_similarity|
+---+---+--------------------+--------------------+------------------+
|  1|  2|[apple, orange, b...|[orange, banana, ...|               0.5|
|  1|  3|[apple, orange, b...|[pear, apple, ban...|               0.5|
|  2|  3|[orange, banana, ...|[pear, apple, ban...|               0.2|
+---+---+--------------------+--------------------+------------------+



Implement a PySpark program to evaluate the precision, recall, and F1-score of an entity
resolution model.

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Tokenizer, NGram, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

# Initialize Spark session
spark = SparkSession.builder.appName("EntityResolutionEvaluation").getOrCreate()

# Sample data
data = [("1", "John Doe", "123 Main St", "john.doe@email.com"),
        ("2", "Jane Smith", "456 Oak Ave", "jane.smith@email.com"),
        ("3", "John Doe", "789 Pine St", "john.doe@email.com"),
        ("4", "Jake Brown", "101 Elm St", "jake.brown@email.com")]

# Ground truth data (1 if match, 0 if non-match)
ground_truth_data = [("1", "2", 0),
                     ("1", "3", 1),
                     ("1", "4", 0),
                     ("2", "3", 0),
                     ("2", "4", 0),
                     ("3", "4", 0)]

# Define the schema
data_schema = ["id", "name", "address", "email"]
ground_truth_schema = ["id1", "id2", "label"]

# Create DataFrames
df = spark.createDataFrame(data, schema=data_schema)
ground_truth_df = spark.createDataFrame(ground_truth_data, schema=ground_truth_schema)

# Add a label column to the 'df' DataFrame based on matching names
df_labeled = df.alias("df1").join(df.alias("df2"), col("df1.name") == col("df2.name")) \
    .select(col("df1.id").alias("id1"),
            col("df2.id").alias("id2"),
            col("df1.name").alias("name"),
            when(col("df1.id") == col("df2.id"), 1).otherwise(0).alias("label"))

# Tokenize and normalize columns for matching
tokenizer = Tokenizer(inputCol="name", outputCol="name_tokens")
ngram = NGram(n=2, inputCol="name_tokens", outputCol="name_ngrams")
vectorizer = CountVectorizer(inputCol="name_ngrams", outputCol="name_vector")

# Create a simple entity resolution model (using Logistic Regression as an example)
lr = LogisticRegression(featuresCol="name_vector", labelCol="label", predictionCol="prediction")
pipeline = Pipeline(stages=[tokenizer, ngram, vectorizer, lr])

# Fit the model
model = pipeline.fit(df_labeled)  # Use the labeled DataFrame 'df_labeled' for fitting the pipeline

# Make predictions
predictions = model.transform(df_labeled)  # Use the labeled DataFrame 'df_labeled' for making predictions

# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)

# Compute precision, recall, and F1-score
tp = predictions.filter((col("label") == 1) & (col("prediction") == 1)).count()
fp = predictions.filter((col("label") == 0) & (col("prediction") == 1)).count()
fn = predictions.filter((col("label") == 1) & (col("prediction") == 0)).count()

precision = tp / (tp + fp) if (tp + fp) != 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0.0

# Display evaluation metrics
print("Area under ROC:", area_under_roc)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

# Stop the Spark session
spark.stop()


24/01/19 14:29:42 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


Area under ROC: 0.75
Precision: 0.6666666666666666
Recall: 1.0
F1-Score: 0.8
