## Importing Libraries

In [417]:
# !pip install sentence_transformers

In [418]:
# !pip install flask_restful
# !pip install bcrypt

In [419]:
import pandas as pd

# Vectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sequence Matcher
import difflib

# Bert
from sentence_transformers import SentenceTransformer

# Flask
from flask import Flask, jsonify, request
from flask_restful import Api, Resource

# MongoDB
from pymongo import MongoClient
import pymongo
import bcrypt

### Creating App for Flask

In [420]:
app = Flask(__name__)
api = Api(app)

### MongoDB Configuration

In [421]:
# Connection URL
client = pymongo.MongoClient("mongodb://localhost:27017/")
# Database
db = client.SimilarityDB
# Collection
users = db["Accounts"]

## Register User functionality

In [422]:
# To check if User already exists in database or not
def UserExist(username):
    if users.find({"Username":username}).count() == 0:
        return False
    else:
        return True

In [423]:
class Register(Resource):
    def post(self):
        # To get posted data by the user in form of JSON
        postedData = request.get_json()

        # Get the data
        username = postedData["username"]
        password = postedData["password"]

        if UserExist(username):
            retJson = {
                'status':301,
                'msg': 'Username already present, try again!'
            }
            return jsonify(retJson)

        # Making password encrypted
        hashed_pw = bcrypt.hashpw(password.encode('utf8'), bcrypt.gensalt())

         #Store username and pw into the database
        users.insert({
            "Username": username,
            "Password": hashed_pw
        })

        # Response from the REST API
        retJson = {
            "status": 200,
            "msg": "You successfully signed up for the API"
        }
        return jsonify(retJson)

In [424]:
# REST API URI
api.add_resource(Register, '/register')

## Detect Similarity Functionality

In [425]:
# To check if username and password are correct
def verifyPw(username, password):
    if not UserExist(username):
        return False

    hashed_pw = users.find({
        "Username":username
    })[0]["Password"]

    if bcrypt.hashpw(password.encode('utf8'), hashed_pw) == hashed_pw:
        return True
    else:
        return False

In [426]:
class Detect(Resource):
    def post(self):
        # To get the posted data from the user in the Form of JSON
        postedData = request.get_json()

        # Read the data
        username = postedData["username"]
        password = postedData["password"]
        text1 = postedData["text1"]
        text2 = postedData["text2"]

        # Response from REST API if user does not exist in the Database
        if not UserExist(username):
            retJson = {
                'status':301,
                'msg': "Invalid Username"
            }
            return jsonify(retJson)
        
        # Verify the username password match
        correct_pw = verifyPw(username, password)

        if not correct_pw:
            retJson = {
                "status":302,
                "msg": "Incorrect Password"
            }
            return jsonify(retJson)
        
        max_ratio = []
        
        # First model - Sequence Matcher
        seq = difflib.SequenceMatcher(None, text1, text2)
        d = seq.ratio()*100
        print("Similarity percentage using Sequence Matcher - ", d)
        max_ratio.append(d)

        text = [text1, text2]

        # Bag of words using Count Vectorizer
        count_vect = CountVectorizer()
        X_train_counts = count_vect.fit_transform(text)
        pd.DataFrame(X_train_counts.toarray(), columns=count_vect.get_feature_names(), index=['Para 1', 'Para 2'])

        # Bag of words using TF-IDF Vectorizer
        vectorizer = TfidfVectorizer()
        trsfm = vectorizer.fit_transform(text)
        pd.DataFrame(trsfm.toarray(), columns=vectorizer.get_feature_names(), index=['Text 1', 'Text 2'])

        # Cosine Similarity between 2 paragraphs using TFID Vectorizer
        css_tfid = cosine_similarity(trsfm[0:1], trsfm)
        r_tfid = css_tfid[0][1]*100
        print("Similarity percentage using TFID Vectorizer - ", r_tfid)
        max_ratio.append(r_tfid)
        
        # Cosine Similarity between 2 paragraphs using Count Vectorizer
        css_cnt = cosine_similarity(X_train_counts[0:1], X_train_counts)
        r_cnt = css_cnt[0][1]*100
        print("Similarity percentage using Count Vectorizer - ", r_cnt)
        max_ratio.append(r_cnt)

        # Third model Bert using sentence transformer
        model = SentenceTransformer('bert-base-nli-mean-tokens')

        sentence_embeddings = model.encode(text)

        sentence_embeddings.shape

        bert = cosine_similarity(
            [sentence_embeddings[0]],
            sentence_embeddings[1:]
        )
 
        for b in bert:
            bert_ratio = int(b[0]*100)
        
        #4
        print("Similarity percentage using Bert/Sentence Transformer - ", bert_ratio)
        max_ratio.append(bert_ratio)

        ratio = max(max_ratio)

        retJson = {
            "status":200,
            "ratio": ratio,
            "msg":"Best similarity score calculated successfully"
        }
        return jsonify(retJson)

In [427]:
api.add_resource(Detect, '/detect')

### Welcome page and flask main application run

In [428]:
class Welcome(Resource):
    def get(self):
      return "Welcome to Similarity Check!"

In [429]:
api.add_resource(Welcome, '/welcome')

In [None]:
if __name__ == '__main__':
    app.run(debug=False)

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [14/Dec/2021 20:00:32] "GET /welcome HTTP/1.1" 200 -
  if users.find({"Username":username}).count() == 0:
127.0.0.1 - - [14/Dec/2021 20:00:38] "POST /register HTTP/1.1" 200 -


Similarity percentage using Sequence Matcher -  3.434816549570648
Similarity percentage using TFID Vectorizer -  50.19749267300544
Similarity percentage using Count Vectorizer -  64.26557589755045


127.0.0.1 - - [14/Dec/2021 20:01:13] "POST /detect HTTP/1.1" 200 -


Similarity percentage using Bert/Sentence Transformer -  69
