In [2]:
import tensorflow as tf       # To work with USE4
import pandas as pd           # To work with tables 
import tensorflow_hub as hub  # contains USE4
# TF Hub handle of the Universal Sentence Encoder (version 4) that hub.load() loads.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)


def embed(input):
  """Run the loaded encoder on `input` and return the model's output unchanged.

  `input` is forwarded as-is to `model`. Later cells pass a list of strings;
  for a two-string list the notebook shows a float32 tensor of shape (2, 512).
  """
  embeddings = model(input)
  return embeddings

In [3]:
# Location of the text-pair CSV on the mounted Google Drive.
csv_path = "/content/drive/MyDrive/Precily_Assesment/Precily_Text_Similarity.csv"
Data = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows to confirm the text1/text2 columns loaded.
Data.head()

Unnamed: 0,text1,text2
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...
3,hearts of oak 3-2 cotonsport hearts of oak set...,redford s vision of sundance despite sporting ...
4,sir paul rocks super bowl crowds sir paul mcca...,mauresmo opens with victory in la amelie maure...


In [5]:
# (row count, column count) of the loaded frame.
Data.shape

(3000, 2)

In [6]:
# Show the full text of the first text1 entry (head() truncates it).
Data['text1'][0]

'broadband challenges tv viewing the number of europeans with broadband has exploded over the past 12 months  with the web eating into tv viewing habits  research suggests.  just over 54 million people are hooked up to the net via broadband  up from 34 million a year ago  according to market analysts nielsen/netratings. the total number of people online in europe has broken the 100 million mark. the popularity of the net has meant that many are turning away from tv  say analysts jupiter research. it found that a quarter of web users said they spent less time watching tv in favour of the net  the report by nielsen/netratings found that the number of people with fast internet access had risen by 60% over the past year.  the biggest jump was in italy  where it rose by 120%. britain was close behind  with broadband users almost doubling in a year. the growth has been fuelled by lower prices and a wider choice of always-on  fast-net subscription plans.  twelve months ago high speed internet

In [7]:
type(Data['text1'][0]) # checks the first text1 entry only; the other rows are not inspected here

str

In [8]:
# Smoke test: embed the first (text1, text2) pair and display the resulting tensor.
message = [Data[column][0] for column in ('text1', 'text2')]
message_embeddings = embed(message)
message_embeddings

<tf.Tensor: shape=(2, 512), dtype=float32, numpy=
array([[-0.02720232,  0.00681642, -0.03939367, ..., -0.03903358,
        -0.05795865, -0.05810072],
       [-0.05569994, -0.0564485 , -0.056383  , ...,  0.04282598,
        -0.05645383, -0.05647698]], dtype=float32)>

In [9]:
# Inspect the container type returned by embed().
type(message_embeddings)

tensorflow.python.framework.ops.EagerTensor

In [10]:
# Indexing a single row of the result: check what type that slice has.
type(message_embeddings[0])

tensorflow.python.framework.ops.EagerTensor

In [11]:
# Eager tensors convert to NumPy directly via .numpy(); no TensorProto round-trip is needed.
type(message_embeddings.numpy())

numpy.ndarray

In [12]:
# NumPy array holding the sample pair's embeddings, taken straight from the eager tensor.
a_np = message_embeddings.numpy()

In [13]:
from numpy import dot                                           # to calculate the dot product of two vectors
from numpy.linalg import norm                                   #for finding the norm of a vector

# Cosine similarity between the encoder embeddings of text1 and text2 for every row.
# Rows are embedded in chunks rather than one model call per row, and tensors are
# converted with .numpy() instead of a round-trip through a TensorProto.
# NOTE(review): assumes the encoder embeds each string independently of the other
# strings in the same call, so chunking only changes results by float rounding - confirm.
EMBED_BATCH_SIZE = 256                                          # rows per encoder call; bounds memory per call
ans = []                                                        # one cosine similarity per row, in row order
for start in range(0, len(Data), EMBED_BATCH_SIZE):
  chunk = Data.iloc[start:start + EMBED_BATCH_SIZE]             # positional slice, independent of index labels
  vecs1 = embed(chunk['text1'].tolist()).numpy()                # embeddings of the text1 entries in this chunk
  vecs2 = embed(chunk['text2'].tolist()).numpy()                # embeddings of the matching text2 entries
  dots = (vecs1 * vecs2).sum(axis=1)                            # row-wise dot products
  cos_sim = dots / (norm(vecs1, axis=1) * norm(vecs2, axis=1))  # cosine of the angle between each pair
  ans.extend(cos_sim)                                           # appends NumPy float scalars, as the per-row loop did

In [14]:
# Sanity check: there should be one score per row of Data.
len(ans) 

3000

In [15]:
# Wrap the score list in a one-column frame so it can be joined onto Data.
score_columns = ['Similarity_Score']
Ans = pd.DataFrame(data=ans, columns=score_columns)

In [16]:
# Preview the raw cosine similarities before any rescaling.
Ans.head()

Unnamed: 0,Similarity_Score
0,0.272668
1,0.277622
2,0.169011
3,0.157467
4,0.246201


In [17]:
# Attach the scores to Data by index. A Similarity_Score column left over from an
# earlier run of this cell is dropped first, so re-running does not raise on the
# overlapping column name.
Data = Data.drop(columns=['Similarity_Score'], errors='ignore').join(Ans)

In [18]:
# Shift cosine similarity from [-1, 1] to [0, 2] by adding 1 to every score.
Data['Similarity_Score'] = Data['Similarity_Score'].add(1)

In [19]:
# Check the first two rows after the +1 shift.
Data.head(2)

Unnamed: 0,text1,text2,Similarity_Score
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...,1.272668
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...,1.277622


In [20]:
# Rescale the shifted scores from [0, 2] to [0, 1].
# Dividing by the fixed range width (2) keeps each row's score independent of the
# other rows; dividing by the observed maximum would rescale every score whenever
# the largest value in the data changed. clip() guards against float rounding
# pushing a value marginally outside [0, 1].
Data['Similarity_Score'] = (Data['Similarity_Score'] / 2).clip(0, 1)

In [21]:
# Preview the rescaled scores.
Data.head()

Unnamed: 0,text1,text2,Similarity_Score
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...,0.636334
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...,0.638811
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...,0.584505
3,hearts of oak 3-2 cotonsport hearts of oak set...,redford s vision of sundance despite sporting ...,0.578734
4,sir paul rocks super bowl crowds sir paul mcca...,mauresmo opens with victory in la amelie maure...,0.6231


In [22]:
# Shape check after adding the Similarity_Score column.
Data.shape

(3000, 3)

In [23]:
# Add a 0-based row identifier as the first column. The range is sized from the
# frame itself so the cell still works if the input file has a different row count.
Data.insert(0, 'Unique_ID', range(len(Data)))

In [25]:
# Inspect the last five rows to confirm Unique_ID runs through the final row.
Data.tail()

Unnamed: 0,Unique_ID,text1,text2,Similarity_Score
2995,2995,uk directors guild nominees named martin scors...,steel firm to cut 45 000 jobs mittal steel ...,0.631628
2996,2996,u2 to play at grammy awards show irish rock ba...,israel looks to us for bank chief israel has a...,0.661699
2997,2997,pountney handed ban and fine northampton coach...,india and iran in gas export deal india has si...,0.60486
2998,2998,belle named best scottish band belle & sebas...,mido makes third apology ahmed mido hossam h...,0.643591
2999,2999,criminal probe on citigroup deals traders at u...,former ni minister scott dies former northern ...,0.619776


In [26]:
# Keep only the identifier and the final score for the submission table.
submission_columns = ['Unique_ID', 'Similarity_Score']
Submission = Data[submission_columns]

In [27]:
# Preview the two-column submission table.
Submission.head()

Unnamed: 0,Unique_ID,Similarity_Score
0,0,0.636334
1,1,0.638811
2,2,0.584505
3,3,0.578734
4,4,0.6231


In [30]:
# Make Unique_ID the index of the submission table.
# Rebinding the result instead of using inplace=True avoids mutating a frame that
# was derived by selecting columns from Data.
Submission = Submission.set_index("Unique_ID")

In [31]:
from google.colab import files
# Write the submission table to a CSV file, then hand that file to Colab's download helper.
submission_file = 'Submission.csv'
Submission.to_csv(submission_file)
files.download(submission_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>