In [21]:
# utils.py
# Class containing code for all utility functions 
import utils
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import pandas as pd
import glob as g
import time
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import spatial

files = g.glob("../DataBase/data/*.csv")

In [6]:
def toframe(files, attribute=None, query=None):
    """Generate pandas data frame from article csv
    Positional Arguments:
    files -- file path for article csv's
    Valid queries: 'pfizer', 'moderna', 'Johnson and Johnson', 'Johnson', 'Johnson & Johnson', 'covid', 'coronavirus', 'vaccine', 'covid 19'
    """

    col_names = ['title', 'abstract', 'author', 'source', 'time_stamp', 'link', 'query']
    articles = []
    for filename in files:
        articles.append(pd.read_csv(filename, names=col_names))

    article_frame = pd.concat(articles, ignore_index=True)
    article_frame.drop_duplicates(['title'], inplace=True)

    # hash function to generate article id. Uses link, because it is a unique value for each article
    key_vals = []
    for i, row in article_frame.iterrows():
        key = hash(row['link'])
        if key not in key_vals:
            key_vals.append(key)
        else:
            key = hash(row['title'])
            if key not in key_vals:
                key_vals.append(key)

    article_frame.insert(loc=0, column='article_id', value=key_vals)
    af = article_frame.dropna()

    if attribute is not None and query is not None:
        query_frame = af[af['query'].astype(str).str.contains(query)]
        sentenceList = query_frame[attribute].tolist()
        return sentenceList
    elif attribute is not None:
        sentenceList = af[attribute].tolist()
        return sentenceList
    elif query is not None:
        temp_list = list()
        fl = []
        query_frame = af[af['query'].str.contains(query)]
        for col in query_frame:
            temp_list = article_frame[col].tolist()
            fl.append(temp_list)
        return fl
    else:
        return af

In [7]:
def encode(text, model, max_seq_length = 300):
    """Encode a set of text given an encoding model

    Positional Arguments:
    text -- set of articles to be encoded
    model -- model generated from sentence_transformers
    a fixed-sized output representation (vector u) accomplished by pooling
    """
    start_time = time.time()
    model.max_seq_length = max_seq_length 
    sentence_embedding = model.encode(text)
    end_time = time.time()
    print("Time for computting embeddings:" + str(end_time - start_time))
    return sentence_embedding

In [8]:
def doc_sim(v1, v2):
    """Find the similarity between two articles

    Positional Arguments:
    v1 -- vector pertaining to the first article
    v2 -- vector pertaining to the second article
    """

    return cosine_similarity(v1.reshape(1,-1),v2.reshape(1,-1))[0][0]

In [9]:
def ent_sim(a1, a2):
    pass

In [10]:
def eos(doc_sim, ent_sim, alpha):
    return (alpha*doc_sim) + ((1-alpha)*ent_sim)

In [117]:
def get_ents(text, confidence= 0.35):
    import requests
    from IPython.core.display import display, HTML# An API Error Exception
    class APIError(Exception):
        def __init__(self, status):
            self.status = status
            def __str__(self):
                return "APIError: status={}".format(self.status)

    # Base URL for Spotlight API
    base_url = "http://api.dbpedia-spotlight.org/en/annotate"# Parameters 
    # 'text' - text to be annotated 
    # 'confidence' -   confidence score for linking
    params = {"text": text, "confidence": confidence}# Response content type
    headers = {'accept': 'application/json'}# GET Request
    res = requests.get(base_url, params=params, headers=headers)
    if res.status_code != 200:
        # Something went wrong
        raise APIError(res.status_code)# Display the result as HTML in Jupyter Notebook
    
    import json 
    bruh = json.loads(res.content)
    
    try: 
        return set([x['@URI'] for x in bruh['Resources']])
    except:
        return set()

In [12]:
def cluster():
    pass

In [18]:
#tuple = (title, abstract, author, source, date, URL, query)
doc1 = ['Three Men Are Accused in Scheme to Sell Covid-19 Vaccines', 'Prosecutors said the men created a fake duplicate of the Moderna website to fraudulently sell doses of the vaccine, which they never had.', 'John', 'NY Times', 'april 23', 'sample_url', 'moderna']
doc2 = ['title2', 'Four Men Are Talking About Covid-19 Vaccines', 'Jack', 'Washington Post', 'march 15', 'sample_url2', 'pfizer']

In [13]:
def vectorize(dx):  # Modify this function
    list_x = dx.split()
    vx = np.array(list_x)

    return vx

In [14]:
def docSim(abstract_i, abstract_j):
    roberta = SentenceTransformer('stsb-roberta-base')

    vi = utils.encode(abstract_i, roberta)
    vj = utils.encode(abstract_j, roberta)
    cosine_similarity = 1 - spatial.distance.cosine(vi, vj)
    return cosine_similarity

In [15]:
def knowledgeSim(abstract_i, abstract_j):
    vi = vectorize(abstract_i)
    vj = vectorize(abstract_j)
    jaccard_similarity = spatial.distance.jaccard(vi, vj)
    jaccard_distance = 1 - jaccard_similarity
    return jaccard_distance

In [16]:
def Compute_EOS(di, dj, alpha):  # di, dj are 7-tuples, whose 2nd element is abstract

    doc_sim = docSim(di[1], dj[1])
    knowledge_sim = knowledgeSim(di[1], dj[1])
    eos = alpha * doc_sim + (1 - alpha) * knowledge_sim
    return eos

In [19]:
# vec1 = [0, 1, 7]
# vec2 = [15, 6, 9]
# print(docSim(vec1, vec2))
#print(Compute_EOS(doc1, doc2, 0.3))

In [31]:
if __name__ == '__main__':
    data = utils.toframe(utils.files, 'abstract', 'pfizer')
    print(data[0])

Two new pieces of research add strong evidence to the case for giving just a single dose of the Pfizer vaccine to people who have antibodies against the virus.


In [22]:
sentences = [
    'He is playing in the field.',
    'He is running towards the football.',
    'The football game ended.',
    'It started raining while everyone was playing in the field.'
]

vectorizer = TfidfVectorizer(norm = False, smooth_idf = False)
sentence_vectors = vectorizer.fit_transform(sentences)
print(sentence_vectors.toarray())

[[0.         0.         1.69314718 0.         0.         1.69314718
  1.69314718 1.69314718 0.         1.69314718 0.         0.
  0.         1.         0.         0.         0.        ]
 [0.         0.         0.         1.69314718 0.         1.69314718
  0.         1.69314718 0.         0.         0.         2.38629436
  0.         1.         2.38629436 0.         0.        ]
 [2.38629436 0.         0.         1.69314718 2.38629436 0.
  0.         0.         0.         0.         0.         0.
  0.         1.         0.         0.         0.        ]
 [0.         2.38629436 1.69314718 0.         0.         0.
  1.69314718 0.         2.38629436 1.69314718 2.38629436 0.
  2.38629436 1.         0.         2.38629436 2.38629436]]


In [36]:
data = utils.toframe(utils.files)

In [37]:
data['title'][0]

'Three Men Are Accused in Scheme to Sell Covid-19 Vaccines'

In [41]:
data

Unnamed: 0,article_id,title,abstract,author,source,time_stamp,link,query
0,1987704234591290295,Three Men Are Accused in Scheme to Sell Covid-...,Prosecutors said the men created a fake duplic...,By Concepción de León,The New York Times,2021-02-12T22:09:48+0000,https://www.nytimes.com/2021/02/12/us/baltimor...,moderna
1,-8259722555075809818,"Covid Vaccines for Kids Are Coming, but Not fo...",Pfizer and Moderna are testing their vaccines ...,By Apoorva Mandavilli,The New York Times,2021-02-12T10:00:21+0000,https://www.nytimes.com/2021/02/12/health/covi...,moderna
2,-4410056933049431936,"Where Do Vaccine Doses Go, and Who Gets Them? ...",Health agencies and hospitals are using differ...,By Natasha Singer,The New York Times,2021-02-07T19:37:49+0000,https://www.nytimes.com/2021/02/07/technology/...,moderna
3,-6485334848460497503,It’s Time to Trust China’s and Russia’s Vaccines,"They, too, work, and they can help fill shorta...",By Achal Prabhala and Chee Yoke Ling,The New York Times,2021-02-05T10:00:58+0000,https://www.nytimes.com/2021/02/05/opinion/cov...,moderna
4,519476207371917269,Russian Campaign Promotes Homegrown Vaccine an...,Russian news outlets with connections to the K...,"By Sheera Frenkel, Maria Abi-Habib and Julian ...",The New York Times,2021-02-05T16:09:45+0000,https://www.nytimes.com/2021/02/05/technology/...,moderna
...,...,...,...,...,...,...,...,...
4148,-3288512819308595128,Juventus: De Ligt has COVID,Juventus have announced Matthijs de Ligt has t...,football-italia.net,2021-01-08T17:27:50.000000Z,https://www.football-italia.net/164525/juventu...,['sports'],covid
4149,1354707690989037941,Record Hospitalizations Limit U.S. COVID Response,Beds filled as new COVID-19 cases rose 20% acr...,webmd.com,2020-11-11T17:05:41.000000Z,https://www.webmd.com/lung/news/20201111/recor...,['health'],covid
4150,3296918486111389903,Hand Sanitizers | COVID-19,Help meet the increased demand for safe and ef...,fda.gov,2020-12-29T19:05:38.000000Z,https://www.fda.gov/drugs/coronavirus-covid-19...,['health'],covid
4152,-1633760270221848077,"U.S. Reaches 500,000 Covid Deaths","In a single year, Covid-19 became a leading ca...",upstract.com,2021-02-23T00:08:00.000000Z,https://upstract.com/p/mk5g72g9?ref=rss&rd=1,"['entertainment', 'general']",covid


In [56]:
temp = []

for x in range(2):
    temp.append(data['title'][x] + ' ' + data['abstract'][x])
    print(x)
    
vectorizer = TfidfVectorizer(norm = False, smooth_idf = False)
sentence_vectors = vectorizer.fit_transform(temp)
print(temp)
print(sentence_vectors.toarray())

0
1
['Three Men Are Accused in Scheme to Sell Covid-19 Vaccines Prosecutors said the men created a fake duplicate of the Moderna website to fraudulently sell doses of the vaccine, which they never had.', 'Covid Vaccines for Kids Are Coming, but Not for Many Months Pfizer and Moderna are testing their vaccines on children 12 and older and hope to have results by the summer.']
[[0.         1.69314718 1.69314718 0.         1.         0.
  0.         0.         0.         1.         1.69314718 1.69314718
  1.69314718 1.69314718 0.         1.69314718 1.69314718 0.
  0.         1.69314718 0.         0.         3.38629436 1.
  0.         1.69314718 0.         3.38629436 0.         0.
  0.         1.69314718 0.         1.69314718 1.69314718 3.38629436
  0.         0.         3.         0.         1.69314718 1.69314718
  2.         1.69314718 1.         1.69314718 1.69314718]
 [1.69314718 0.         0.         5.07944154 2.         1.69314718
  1.69314718 1.69314718 1.69314718 1.         0.    

In [61]:
url = get_ents('evan went to the US but actually ended up in Egypt! But instead, he is in Egypt!')
keywords = [list(url)[x].rsplit('/', 1)[-1] for x in range(len(url))]
print(keywords)

['United_States', 'Egypt']


In [118]:
tempSent = []

for x in range(4):
    titleEnts = list(get_ents(data['title'][x].lower()))
    abstractEnts = list(get_ents(data['abstract'][x].lower()))
    totalEnts = titleEnts + abstractEnts
    
    keySplit = [totalEnts[x].rsplit('/', 1)[-1] for x in range(len(totalEnts))]
    sentence = ''
    for y in keySplit:
        sentence += y + ' '
    tempSent.append(sentence)
    
vectorizer = TfidfVectorizer(norm = False, smooth_idf = False)
sentence_vectors = vectorizer.fit_transform(tempSent)
print(tempSent)
print(sentence_vectors.toarray())

['Coronavirus_disease_2019 Vaccine Vaccine ', 'Coronavirus_disease Vaccine Pfizer Vaccine Summer ', 'Algorithm Vaccine Health Coronavirus Vaccine ', 'Russia Vaccine ']
[[0.         0.         0.         2.38629436 0.         0.
  0.         0.         2.        ]
 [0.         0.         2.38629436 0.         0.         2.38629436
  0.         2.38629436 2.        ]
 [2.38629436 2.38629436 0.         0.         2.38629436 0.
  0.         0.         2.        ]
 [0.         0.         0.         0.         0.         0.
  2.38629436 0.         1.        ]]


In [122]:
def knowledge_sim(article_i, article_j):
    ai = get_ents(article_i.lower())
    aj = get_ents(article_j.lower())
    
    union = ai.union(aj)
    intersect = ai.intersection(aj)
    
    jaccard_coefficient = len(intersect) / len(union)
    
    jaccard_distance = 1 - jaccard_coefficient
    
    return jaccard_distance

In [123]:
print(knowledge_sim(data['abstract'][0], data['abstract'][1]))

0.6666666666666667
