In [None]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark3.0.0
!wget -q wget https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
# unzip it   spark-3.0.1-bin-hadoop2.7
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
# install findspark 
!pip install -q findspark

In [None]:
import os

# Point JAVA_HOME and SPARK_HOME at the JDK and the Spark distribution set up
# in the install cell. The Spark path assumes the tarball was unpacked under
# /content (Colab-style working directory) -- confirm if running elsewhere.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.0.1-bin-hadoop2.7",
    }
)

In [None]:
import findspark

# findspark.init() is called before the pyspark import on purpose: keep this
# order so the import below resolves against the configured SPARK_HOME.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session using all available cores ("local[*]"); an existing
# session is reused if there is one.
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()


In [None]:
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.types import *
import pandas as pd
import numpy as np
from pyspark.sql.functions import *
import math

Using the Spark DataFrame API


In [None]:
# Sample sentence pairs to compare; each tuple is one row (c1, c2).
sentence_pairs = [
    ('hi how are you', 'how are you'),
    ('who am i', 'who am i'),
    ('this is human', 'this is'),
    ('cow is the source of milk', 'bufallow is the source of milk'),
    ('i am srinivas', 'i am srinivas'),
]
# Column labels for the two sentence columns.
column_labels = ['c1', 'c2']

df = spark.createDataFrame(sentence_pairs, column_labels)
df.show()


+--------------------+--------------------+
|                  c1|                  c2|
+--------------------+--------------------+
|      hi how are you|         how are you|
|            who am i|            who am i|
|       this is human|             this is|
|cow is the source...|bufallow is the s...|
|       i am srinivas|       i am srinivas|
+--------------------+--------------------+



In [None]:
def show_common_words(c1, c2):
    """Return the distinct words that occur in both sentences.

    Sentences are tokenised with ``str.split()`` (any run of whitespace is a
    single separator), so repeated, leading or trailing spaces never produce
    an empty-string "word" that would be reported as common to both inputs.

    Args:
        c1: First sentence, or None.
        c2: Second sentence, or None.

    Returns:
        A set with the words present in both sentences, or None when either
        input is None (e.g. a null column value when used as a Spark UDF).
    """
    # Propagate missing values instead of failing on None.split().
    if c1 is None or c2 is None:
        return None
    return set(c1.split()) & set(c2.split())

def count_common_words(c1, c2):
    """Count the distinct words that occur in both sentences.

    Sentences are tokenised with ``str.split()`` (any run of whitespace is a
    single separator), so repeated, leading or trailing spaces are not counted
    as an extra, empty common word.

    Args:
        c1: First sentence, or None.
        c2: Second sentence, or None.

    Returns:
        The number of distinct shared words as an int, or None when either
        input is None (e.g. a null column value when used as a Spark UDF).
    """
    # Propagate missing values instead of failing on None.split().
    if c1 is None or c2 is None:
        return None
    shared = set(c1.split()) & set(c2.split())
    return len(shared)

def compute_pct_similarity(c1, c2):
    """Return the percentage of c1's distinct words that also occur in c2.

    Both the numerator and the denominator count distinct words, so a sentence
    compared with itself always scores 100.0 even when it repeats a word.
    Sentences are tokenised with ``str.split()`` (any run of whitespace is a
    single separator), so stray spaces do not add empty "words".

    Args:
        c1: Reference sentence (its words form the denominator), or None.
        c2: Sentence to compare against, or None.

    Returns:
        A float in [0.0, 100.0]; 0.0 when c1 contains no words; None when
        either input is None (e.g. a null column value when used as a Spark
        UDF).
    """
    # Propagate missing values instead of failing on None.split().
    if c1 is None or c2 is None:
        return None
    reference_words = set(c1.split())
    # Nothing to compare: avoid dividing by zero.
    if not reference_words:
        return 0.0
    shared = reference_words & set(c2.split())
    return (len(shared) / len(reference_words)) * 100


def _sorted_common_words(c1, c2):
    """Adapter for the c4 UDF: common words as a sorted list (None stays None)."""
    words = show_common_words(c1, c2)
    return None if words is None else sorted(words)


# Wrap the Python helpers as Spark UDFs.
# show_common_words returns a Python set, which does not match a StringType
# return type; c4 is therefore declared as array<string> and fed a sorted list
# so the column has a real array type and a deterministic word order.
udf_func_showCommon = udf(_sorted_common_words, ArrayType(StringType()))
udf_func_countCommon = udf(count_common_words, IntegerType())
# FloatType is 32-bit, which is why 200/3 is displayed as 66.666664.
udf_func_pctCommon = udf(compute_pct_similarity, FloatType())

# Add the comparison columns to the DataFrame with withColumn:
#   c3 - whether the two sentences are exactly equal
#   c4 - words common to both sentences
#   c5 - number of common words
#   c6 - percentage similarity computed by compute_pct_similarity
df = df.withColumn("c3", col("c1") == col("c2"))
df = df.withColumn("c4", udf_func_showCommon(df.c1, df.c2))
df = df.withColumn("c5", udf_func_countCommon(df.c1, df.c2))
df = df.withColumn("c6", udf_func_pctCommon(df.c1, df.c2))

df.show()

+--------------------+--------------------+-----+--------------------+---+---------+
|                  c1|                  c2|   c3|                  c4| c5|       c6|
+--------------------+--------------------+-----+--------------------+---+---------+
|      hi how are you|         how are you|false|     [how, are, you]|  3|     75.0|
|            who am i|            who am i| true|        [i, am, who]|  3|    100.0|
|       this is human|             this is|false|          [this, is]|  2|66.666664|
|cow is the source...|bufallow is the s...|false|[the, of, milk, i...|  5|83.333336|
|       I am srinivas|       I am srinivas| true|   [I, srinivas, am]|  3|    100.0|
+--------------------+--------------------+-----+--------------------+---+---------+



Using Spark SQL

In [None]:
# Expose the DataFrame to Spark SQL under the name "sentencetable".
df.createOrReplaceTempView("sentencetable")

# c4: the words of c1 (in c1's order) that are also contained in c2.
common_words_sql = spark.sql(""" select c1,c2, filter(split(c1,' '), x -> array_contains(split(c2,' '),x) ) c4 from sentencetable """)

# a1_size: number of elements in the c4 array.
common_words_sql.withColumn("a1_size", size('c4')).show()

+--------------------+--------------------+--------------------+-------+
|                  c1|                  c2|                  c4|a1_size|
+--------------------+--------------------+--------------------+-------+
|      hi how are you|         how are you|     [how, are, you]|      3|
|            who am i|            who am i|        [who, am, i]|      3|
|       this is human|             this is|          [this, is]|      2|
|cow is the source...|bufallow is the s...|[is, the, source,...|      5|
|       i am srinivas|       i am srinivas|   [i, am, srinivas]|      3|
+--------------------+--------------------+--------------------+-------+

