In [1]:
# Mount Google Drive at /content/drive; the CSV and GloVe paths used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
import numpy as np, pandas as pd
import requests

In [0]:
# header=None: the first row of the file is data, not column names.
# NOTE(review): read_csv splits on commas by default, so a sentence containing a comma
# would be spread over several columns — confirm the input file has none.
df = pd.read_csv('/content/drive/My Drive/list_of_sentences', header = None)

In [4]:
# Preview the first four rows of the loaded frame.
df.head(4)

Unnamed: 0,0
0,good morning
1,how are you doing ?
2,the weather is awesome today
3,samsung


In [0]:
# Convert every cell to str and turn the frame into a list of rows (each row is itself a list).
list_of_sent = df.astype(str).values.tolist()

In [6]:
# Display the nested list to check its shape: one inner list per row.
list_of_sent

[['good morning'],
 ['how are you doing ?'],
 ['the weather is awesome today'],
 ['samsung'],
 ['good afternoon'],
 ['baseball is played in the USA'],
 ['there is a thunderstorm '],
 ['are you doing good ?'],
 ['The polar regions are melting"'],
 ['apple'],
 ['nokia'],
 ['cricket is a fun game'],
 ['the climate change is a problem']]

In [0]:
# Path to the GloVe vectors text file. Despite the "_DIR" suffix this is a file path:
# loadGloveModel() passes it straight to open().
GLOVE_DIR = '/content/drive/My Drive/glove.6B.50d.txt'

In [0]:
def loadGloveModel(GLOVE_DIR):
    """Load word embeddings from a GloVe-format text file.

    Each non-blank line is expected to hold a token followed by
    whitespace-separated numeric components.

    Parameters
    ----------
    GLOVE_DIR : str
        Path of the embeddings text file (a file path, despite the name).

    Returns
    -------
    dict
        Maps each token to a 1-D ``numpy`` float array built from the rest
        of its line. If a token occurs more than once, the last line wins.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float.
    """
    print ("Loading Glove Model")
    model = {}
    with open(GLOVE_DIR, encoding="utf8" ) as f:
        # Iterate the file lazily rather than readlines(), so the raw text is
        # never held in memory alongside the parsed dictionary.
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print ("Done.",len(model)," words loaded!")
    return model

In [9]:
import re, nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import wordnet
# Download the NLTK data packages used below: the stop-word lists and WordNet.
nltk.download('stopwords')
nltk.download('wordnet')

# Module-level lemmatizer instance; preprocess() calls lemmatizer.lemmatize() on each token.
lemmatizer = WordNetLemmatizer()

_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess(raw_text):
    """Turn a raw sentence into a list of lemmatized content words.

    Steps: replace every non-letter with a space, lower-case, split on
    whitespace, drop English stop words, drop repeated words, then
    lemmatize each remaining word as a noun.

    Parameters
    ----------
    raw_text : str
        The sentence to clean.

    Returns
    -------
    list of str
        Lemmatized tokens in order of first appearance. The list is empty
        when the text contains only stop words or no letters at all.
    """
    # keep only letters
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split
    words = letters_only_text.lower().split()

    # remove stopwords; the set is cached because this function is called
    # repeatedly for every sentence pair
    stopword_set = _english_stopwords()

    # dict.fromkeys de-duplicates like set() but keeps first-seen order, so
    # the result no longer depends on per-process string hash randomisation.
    cleaned_words = list(dict.fromkeys(w for w in words if w not in stopword_set))

    # NOTE(review): de-duplication happens before lemmatization, so two words
    # that share a lemma can still both appear in the result.
    lemm_words = [lemmatizer.lemmatize(w, wordnet.NOUN) for w in cleaned_words]

    return lemm_words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# Load the embeddings into the module-level name `model`;
# cosine_distance_wordembedding_method() looks words up in this global.
model = loadGloveModel(GLOVE_DIR)

Loading Glove Model
Done. 400000  words loaded!


In [0]:
def cosine_distance_wordembedding_method(s1, s2):
    """Return the similarity of two sentences as a percentage.

    Each sentence is reduced to the mean of the embedding vectors of its
    preprocessed tokens (looked up in the global ``model``). The result is
    ``(1 - cosine distance) * 100`` rounded to two decimals.

    Parameters
    ----------
    s1, s2 : str
        The sentences to compare.

    Returns
    -------
    float
        Similarity percentage, or NaN when either sentence has no token
        with an embedding (for example a sentence made only of stop words).
        NaN compares false against any threshold.
    """
    # Import the function explicitly: a bare `import scipy` does not
    # guarantee that the scipy.spatial.distance submodule is loaded.
    from scipy.spatial.distance import cosine as cosine_distance

    def sentence_vector(sentence):
        # Tokens without an embedding are skipped instead of raising KeyError.
        vectors = [model[word] for word in preprocess(sentence) if word in model]
        # Averaging an empty list would emit a RuntimeWarning and yield a
        # 0-d NaN; report "no vector" explicitly instead.
        return np.mean(vectors, axis=0) if vectors else None

    vector_1 = sentence_vector(s1)
    vector_2 = sentence_vector(s2)
    if vector_1 is None or vector_2 is None:
        return float("nan")

    cosine = cosine_distance(vector_1, vector_2)
    return round((1-cosine)*100,2)

In [0]:
def output(list_of_words, threshold=70.0):
    """Return every pair of entries whose sentences are similar but not identical.

    Parameters
    ----------
    list_of_words : list of list
        One inner list per sentence; the sentence text is element 0 of
        each inner list.
    threshold : float, optional
        Minimum similarity percentage (exclusive) for a pair to be kept.
        Defaults to 70.0.

    Returns
    -------
    list
        ``[entry_i, entry_j]`` pairs with ``i < j``, in input order, whose
        similarity is above ``threshold`` and not exactly 100.0.
    """
    op = []
    for i in range(len(list_of_words)):
        for j in range(i + 1, len(list_of_words)):
            # Compute the similarity once per pair; every call re-runs
            # preprocessing and embedding lookups for both sentences.
            similarity = cosine_distance_wordembedding_method(
                list_of_words[i][0], list_of_words[j][0]
            )
            # A score of exactly 100.0 is treated as a duplicate and skipped.
            # A NaN score fails both comparisons, so it is dropped as well.
            if similarity == 100.0:
                continue
            if similarity > threshold:
                op.append([list_of_words[i], list_of_words[j]])

    return op

In [13]:
# Run the pairwise comparison over the sentences loaded from the CSV.
output(list_of_sent)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


[[['good morning'], ['the weather is awesome today']],
 [['good morning'], ['good afternoon']],
 [['good morning'], ['are you doing good ?']],
 [['good morning'], ['cricket is a fun game']],
 [['the weather is awesome today'], ['good afternoon']],
 [['the weather is awesome today'], ['are you doing good ?']],
 [['the weather is awesome today'], ['the climate change is a problem']],
 [['samsung'], ['nokia']],
 [['good afternoon'], ['are you doing good ?']],
 [['good afternoon'], ['cricket is a fun game']],
 [['baseball is played in the USA'], ['cricket is a fun game']],
 [['are you doing good ?'], ['cricket is a fun game']],
 [['are you doing good ?'], ['the climate change is a problem']]]

In [14]:
# Ad-hoc check on four hand-written sentences, converted to the same
# list-of-single-element-lists layout that output() indexes with [i][0].
user_query = pd.DataFrame(["Football is played in Brazil", "Cricket is played in India", "Travelling is good for health", "People love traveling in winter"])
k = user_query.astype(str).values.tolist()
output(k)

[[['Football is played in Brazil'], ['Cricket is played in India']],
 [['Travelling is good for health'], ['People love traveling in winter']]]

In [15]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; joblib (a scikit-learn dependency) is imported directly instead.
import joblib

# NOTE: `output` is a plain function, so this pickle stores only a reference to
# it (its module and name). The code, the GloVe vectors and the NLTK resources
# are not saved; whoever loads the file must already have `output` defined.
joblib.dump(output, 'semantic_similarity.pkl')



['semantic_similarity.pkl']

In [16]:
# Reload the object written to semantic_similarity.pkl and call it on list_of_sent.
v = joblib.load('semantic_similarity.pkl') 
v(list_of_sent)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


[[['good morning'], ['the weather is awesome today']],
 [['good morning'], ['good afternoon']],
 [['good morning'], ['are you doing good ?']],
 [['good morning'], ['cricket is a fun game']],
 [['the weather is awesome today'], ['good afternoon']],
 [['the weather is awesome today'], ['are you doing good ?']],
 [['the weather is awesome today'], ['the climate change is a problem']],
 [['samsung'], ['nokia']],
 [['good afternoon'], ['are you doing good ?']],
 [['good afternoon'], ['cricket is a fun game']],
 [['baseball is played in the USA'], ['cricket is a fun game']],
 [['are you doing good ?'], ['cricket is a fun game']],
 [['are you doing good ?'], ['the climate change is a problem']]]

In [0]:
#pip install Flask-RESTful

In [0]:
from flask import Flask
from flask_restful import reqparse, abort, Api, Resource
import pickle
import numpy as np

In [19]:
app = Flask(__name__)
api = Api(app)

clf_path = '/content/semantic_similarity.pkl'
# SECURITY: unpickling can execute arbitrary code; only load a file that this
# notebook wrote itself.
with open(clf_path, 'rb') as f:
    # Bound to its own name on purpose: cosine_distance_wordembedding_method()
    # reads the global `model` as the GloVe word -> vector dict, so rebinding
    # `model` here would break every later similarity computation.
    similarity_model = pickle.load(f)

# argument parsing
parser = reqparse.RequestParser()
# action='append' collects every repeated ?query=... value into a list, so a
# request can carry the several sentences that a pairwise comparison needs.
parser.add_argument('query', action='append')

<flask_restful.reqparse.RequestParser at 0x7fb369b42b70>

In [0]:
class textual_similarity(Resource):
    """REST resource that reports which of the submitted sentences are similar."""

    def get(self):
        """Handle GET: compare the ``query`` sentences and return the similar pairs.

        Returns
        -------
        dict
            ``{'prediction': pairs}`` where ``pairs`` is the list returned by
            ``output()``. The list is empty when fewer than two sentences
            were supplied, because there is nothing to pair.

        Aborts with HTTP 400 when no ``query`` argument is present.
        """
        # use parser and find the user's query
        args = parser.parse_args()
        user_query = args['query']

        if user_query is None:
            abort(400, message="Missing required 'query' argument.")

        # reqparse yields one string for a plain argument and a list for an
        # argument declared with action='append'; accept both.
        sentences = [user_query] if isinstance(user_query, str) else list(user_query)

        # output() expects one single-element list per sentence.
        input_file = [[str(sentence)] for sentence in sentences]

        # Run the comparison. The result is nested lists of strings, which is
        # JSON-serialisable (returning the function object itself is not).
        pred_text = output(input_file)

        # create JSON object
        return {'prediction': pred_text}

In [21]:
# Setup the Api resource routing here
# Route the URL to the resource
#api.add_resource(textual_similarity, '/')
'''
if __name__ == '__main__':
    app.run(debug=True)
'''

"\nif __name__ == '__main__':\n    app.run(debug=True)\n"