# A simple flask webapp to try out our NER based heuristic function that returns news similarity

In [4]:
from flask import Flask
from flask import request
from flask import redirect

#iinstall spacy in your conda conda install -c conda-forge spacy
# python -m spacy download en_core_web_sm
import spacy
import os
import pandas as pd
import glob
from collections import defaultdict
import re
from datetime import datetime
from datetime import timedelta
import dateutil.parser
import pandas as pd

In [6]:
# https://spacy.io/api/annotation#named-entities

Weights = {'CARDINAL': 1,
 'DATE': 1,
 'EVENT': 1,
 'FAC':  2,
 'GPE': 2,
 'LANGUAGE': 1,
 'LAW': 2,
 'LOC': 2,
 'MONEY': 4,
 'NORP': 1,
 'ORDINAL': 1,
 'ORG': 16,
 'PERCENT': 16,
 'PERSON': 16,
 'PRODUCT': 4,
 'QUANTITY': 2,
 'TIME': 1,
 'WORK_OF_ART': 4
          }

# Load English tokenizer, tagger, parser, NER and word vectors
# nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_md")
nlp = spacy.load("en_core_web_sm")


def intersection(x, y):
    r = set.intersection(x, y).difference(['Join Livemint', 'Telegram', 'Mint'])
    return r


def weighted_score(inter):
    l = list(inter)
    score = 0
    for i in l:
        score += Weights[i.split(':')[0]]
    return score


def vec_similarity(x, y):
    if x is not None and y is not None:
        return x.similarity(y)

    return -1


def tag(x):
    s = set()
    doc = nlp(x)
    for ent in doc.ents:
        s.add(ent.label_+":"+ent.text)
    return s



def heuristic(x, y): 
    intersect = intersection(tag(x), tag(y))
    score = weighted_score(intersect)
    vec_sim = vec_similarity(nlp(x), nlp(y))
    verdict = 0
    if (vec_sim>0.95) and len(intersect)>2 and (score>100):
        verdict = 1
    r = "Score: "+str(score)+ "<br>Intersect: "+str(intersect)+"<br>Content Vector Similarity: "+str(vec_sim)+ "<br>Verdict: "+ str(verdict)
    return verdict, r

In [7]:
app = Flask(__name__)

def get_index():
    return "<form action=\"/eval\" method=\"get\" id=\"eval\"> \
      <textarea name=\"x\" rows=\"20\" cols=\"100\" form=\"eval\">Enter text X here...</textarea> \
      <textarea name=\"y\" rows=\"20\" cols=\"100\" form=\"eval\">Enter text Y here...</textarea> \
      <input type=\"submit\">"
    

@app.route("/")
def init():
    return get_index()


@app.route("/eval")
def eval():
    x =request.args.get('x')
    y =request.args.get('y')
    verdict, r = heuristic(x, y)
    return r


if __name__ == "__main__":
    app.run(host='0.0.0.0')

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
  self._bootstrap_inner()
127.0.0.1 - - [06/Dec/2020 18:09:56] "GET /eval?x=San+Francisco+is+a+beautiful+city&y=San+Francisco+has+a+lot+of+beauty HTTP/1.1" 200 -
127.0.0.1 - - [06/Dec/2020 18:10:03] "GET / HTTP/1.1" 200 -


In [10]:
def confusion(row):
    suffix = 'N'
    if int(row.prediction) == 1:
        suffix = 'P'
    prefix = 'F'
    if row.prediction == row['binary-label']:
        prefix = 'T'
    return prefix+suffix

no_of = lambda x: len(result[result['confusion']==x])

def print_quality(result):
    result['prediction'] = result.apply(lambda row: on_row(row), axis=1)
    result['confusion'] = result.apply(lambda row: confusion(row), axis = 1)
    Accuracy = (no_of('TP') + no_of('TN'))/(no_of('TP') + no_of('TN') + no_of('FP') + no_of('FN'))
    Precision = no_of('TP') / (no_of('TP') + no_of('FP'))
    Recall = no_of('TP') / (no_of('TP') + no_of('FN'))
    print('Accuracy = '+ str(Accuracy))
    print('Precision = '+ str(Precision))
    print('Recall = '+ str(Recall))

def on_row(row):
    x = row['content_x']
    y = row['content_y']
    verdict, r = heuristic(x, y)
    return verdict

test_data = 'D:\\newsapibackup\\benchmark-data_balance.csv'
# test_data = 'D:\\newsapibackup\\benchmark-data_real.csv'
result = pd.read_csv(test_data)


Sorry reader. I need to suppress the warnings. They will be printed in a loop otherwise

In [11]:
import warnings
warnings.filterwarnings('ignore')

In [12]:
print_quality(result)

Accuracy = 0.8
Precision = 0.7380952380952381
Recall = 0.6966292134831461
