# News Similarity Problem - Detecting Paraphrases

If you are developing a news aggregator, you will face the issue of news similarity. A number of sources report paraphrases of the same news story. At times, more than two sources will send you literally the same text, picked up from a major agency such as Reuters.


# Named Entity Recognition (NER)
A production-grade NLP library such as spaCy provides NER. This is useful for our purposes: if two news items refer to the same person, organization, and event, and contain the same numbers, then it is quite likely that they are paraphrases of each other.


## This notebook demonstrates a heuristic technique using NER to group news items with the same meaning.

# Further improvement
With a large training data set, one can fine-tune this NER-based algorithm by doing supervised learning over the NER features and the weighted scores described below.

In [None]:
from flask import Flask
from flask import request
from flask import redirect

# Install spaCy with conda: conda install -c conda-forge spacy
# python -m spacy download en_core_web_sm
import spacy
import os
import pandas as pd
import glob
from collections import defaultdict
import re
from datetime import datetime
from datetime import timedelta
import dateutil.parser
import pandas as pd
import numpy as np

# Install spaCy with conda: conda install -c conda-forge spacy
# python -m spacy download en_core_web_sm
# https://spacy.io/api/annotation#named-entities

# Score contributed by one shared entity, keyed by its spaCy NER label.
# Higher values mean that a match on that kind of entity counts for more.
Weights = {
    # numbers, dates and events
    'CARDINAL': 1,
    'DATE': 1,
    'EVENT': 1,
    # places and facilities
    'FAC': 2,
    'GPE': 2,
    'LANGUAGE': 1,
    'LAW': 2,
    'LOC': 2,
    'MONEY': 4,
    'NORP': 1,
    'ORDINAL': 1,
    # organizations, percentages and people carry the highest weight
    'ORG': 16,
    'PERCENT': 16,
    'PERSON': 16,
    'PRODUCT': 4,
    'QUANTITY': 2,
    'TIME': 1,
    'WORK_OF_ART': 4,
}

# Boilerplate strings that intersection() subtracts from the set of entities
# two articles have in common (presumably site promos / publisher names that
# would otherwise look like shared entities -- confirm against real data).
common_junk_words = ['Join Livemint', 'Telegram', 'Mint']

# Load the English spaCy pipeline (tokenizer, tagger, parser, NER).
# Larger alternatives, left disabled:
# nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_md")
# NOTE(review): the "sm" package is believed to ship without static word
# vectors, which would make the similarity scores used below less reliable
# than with "md"/"lg" -- confirm against the spaCy model documentation.
nlp = spacy.load("en_core_web_sm")


def intersection(x, y):
    """Return the entity tags common to ``x`` and ``y``, minus known junk.

    Args:
        x: Collection of entity tags, each a ``"LABEL:text"`` string as
            produced by ``tag``.
        y: Second collection of entity tags in the same format.

    Returns:
        A set with the tags present in both inputs, excluding every tag that
        matches an entry of ``common_junk_words``.
    """
    junk = set(common_junk_words)
    shared = set(x) & set(y)
    # Tags look like "ORG:Mint" while the junk list holds bare text ("Mint"),
    # so a plain set difference never removed anything. Compare the text
    # after the first ':' as well; the whole-tag check keeps the previous
    # behaviour for entries that match verbatim.
    return {
        item for item in shared
        if item not in junk
        and not (isinstance(item, str) and item.partition(':')[2] in junk)
    }


def weighted_score(inter):
    """Add up the ``Weights`` entry for every entity tag in ``inter``.

    Each tag is a ``"LABEL:text"`` string; the part before the first ``:`` is
    the key looked up in ``Weights``. An empty collection scores 0.
    """
    return sum(Weights[tagged.split(':')[0]] for tagged in inter)


def vec_similarity(x, y):
    """Return ``x.similarity(y)``, or -1 when either argument is ``None``."""
    if x is None or y is None:
        # Sentinel for "no comparison possible".
        return -1
    return x.similarity(y)


def tag(x):
    """Run ``nlp`` over text ``x`` and return its entities as a set.

    Every entity found by the pipeline becomes one ``"LABEL:text"`` string,
    so repeated mentions of the same entity collapse into a single element.
    """
    return {f"{ent.label_}:{ent.text}" for ent in nlp(x).ents}


def heuristic(x, y):
    """Decide whether texts ``x`` and ``y`` look like paraphrases.

    Two texts are flagged when all three conditions hold:
    the document similarity exceeds 0.95, they share more than two entity
    tags (after junk removal), and the weighted score of the shared tags
    exceeds 100.

    Args:
        x: Text of the first news item.
        y: Text of the second news item.

    Returns:
        1 when the pair is judged to be a paraphrase, otherwise 0.
    """
    # Parse each text exactly once and reuse the result for both the entity
    # tags and the similarity; previously every text went through the
    # pipeline twice (once inside tag() and once for the similarity).
    doc_x = nlp(x)
    doc_y = nlp(y)
    tags_x = {ent.label_ + ":" + ent.text for ent in doc_x.ents}
    tags_y = {ent.label_ + ":" + ent.text for ent in doc_y.ents}

    intersect = intersection(tags_x, tags_y)
    score = weighted_score(intersect)
    vec_sim = vec_similarity(doc_x, doc_y)

    is_paraphrase = vec_sim > 0.95 and len(intersect) > 2 and score > 100
    return 1 if is_paraphrase else 0

In [None]:
from newsapi import NewsApiClient

# The API key is taken from the NEWS_API_KEY environment variable.
newsapi = NewsApiClient(api_key=os.getenv("NEWS_API_KEY"))

# Number of articles requested per page.
page_size = 100

# Comma-separated list of the news domains to query.
domains = 'reuters.com, bloomberg.com, indianexpress.com'

def date_string(d):
    """Format date/datetime ``d`` as a ``YYYY-MM-DD`` string."""
    iso_day_format = "%Y-%m-%d"
    return d.strftime(iso_day_format)

def get_all_articles(page):
    """Request one result page of articles from the news API.

    The query covers the configured ``domains``, English articles only,
    sorted by publication time. Both ends of the date window are set to
    yesterday's date (relative to the local clock).

    Args:
        page: 1-based index of the result page to fetch.

    Returns:
        Whatever ``newsapi.get_everything`` returns for that page.
    """
    window_end = datetime.now() - timedelta(days=1)
    # A zero-day offset: the window starts on the same day it ends.
    window_start = window_end - timedelta(days=0)

    query = {
        'q': '',
        'sources': '',
        'domains': domains,
        'from_param': date_string(window_start),
        'to': date_string(window_end),
        'language': 'en',
        'sort_by': 'publishedAt',
        'page_size': page_size,
        'page': page,
    }
    return newsapi.get_everything(**query)

# Fetch the first page once: it reports how many results exist, and it is
# reused below as page 1 instead of being requested a second time.
all_articles = get_all_articles(1)
totalResults = int(all_articles['totalResults'])
# Ceiling division, so that an exact multiple of page_size does not trigger
# a request for an extra, empty page.
no_of_pages = (totalResults + page_size - 1) // page_size

# One list per future DataFrame column; index i of every list describes the
# same article.
titles = []
contents = []
entities = []
urls = []
descriptions = []
ids = []
published_ats = []
sources = []
doc_vecs = []
title_vecs = []

# Running article id (named so that it does not shadow the builtin `id`).
next_id = 0
for page in range(1, no_of_pages + 1):
    if page > 1:
        all_articles = get_all_articles(page)
    for article in all_articles['articles']:
        titles.append(article['title'])
        contents.append(article['content'])
        urls.append(article['url'])
        ids.append(next_id)
        descriptions.append(article['description'])
        sources.append(article['source']['name'])

        # Parse the body (when present) and collect its "LABEL:text" tags.
        s = set()
        doc = None
        if article['content'] is not None:
            doc = nlp(article['content'])
            for ent in doc.ents:
                s.add(ent.label_ + ":" + ent.text)
        doc_vecs.append(doc)
        entities.append(s)

        published_ats.append(dateutil.parser.parse(article['publishedAt']))

        # Parse the title as well; None marks a missing title.
        tn = None
        if article['title'] is not None:
            tn = nlp(article['title'])
        title_vecs.append(tn)
        next_id += 1
        
# Assemble the per-article lists into one table, one row per article.
data = dict(
    title=titles,
    content=contents,
    entities=entities,
    url=urls,
    description=descriptions,
    id=ids,
    published_at=published_ats,
    source=sources,
    doc_vec=doc_vecs,
    title_vec=title_vecs,
)
df = pd.DataFrame(data)

# Treat the literal string 'None' as missing and drop articles without a
# body. The column is reassigned rather than modified through a chained
# `inplace=True` call, which is not guaranteed to write back to `df`.
df['content'] = df['content'].replace('None', np.nan)
df.dropna(subset=['content'], inplace=True)

# Cross join: give every row the same constant key, merge the table with
# itself on that key, then drop the helper column. Overlapping columns get
# the `_x` / `_y` suffixes.
df['key'] = 1
# `columns=` replaces the positional axis argument (`.drop("key", 1)`),
# which pandas 2.0 no longer accepts.
result = pd.merge(df, df, on='key').drop(columns="key")
# Discard pairs whose titles are identical (this includes self-pairs).
result = result[(result.title_x != result.title_y)]

def sorted_tuple(x, y):
    """Return ``x`` and ``y`` as a 2-tuple with the smaller value first."""
    return (y, x) if x > y else (x, y)
    
# (a, b) and (b, a) describe the same pair: key every row by its ordered
# id tuple and keep only the first row per key.
pair_keys = result.apply(lambda row: sorted_tuple(row.id_x, row.id_y), axis=1)
result['temp'] = pair_keys
result = result.drop_duplicates(subset='temp', keep="first")

# Only compare articles that come from different sources.
different_source = result.source_x != result.source_y
result = result[different_source]

In [None]:
# Run the paraphrase heuristic on the two article bodies of every pair.
def _is_paraphrase(row):
    return heuristic(row['content_x'], row['content_y'])


result['similar'] = result.apply(_is_paraphrase, axis=1)

In [None]:
# Keep only the pairs whose `similar` flag is 1. The original statement
# assigned the whole filtered DataFrame to the `similar` column instead of
# filtering `result` itself.
result = result[result['similar'] == 1]

In [None]:
# Show the first rows of the pair table.
result.head()