In [16]:
import newspaper

In [17]:
file = open('urls', 'r')
data = file.readlines()

urls = []

for url in data:
    url = url.rstrip('\n')
    urls.append(url)
    
urls

['https://timesofindia.indiatimes.com/city/mumbai/6-year-old-abducted-girl-found-dead-in-railway-toilet-in-navsari/articleshow/63462708.cms',
 'https://economictimes.indiatimes.com/news/politics-and-nation/sushma-swaraj-to-visit-japan-for-strategic-dialogue/articleshow/63488634.cms',
 'https://thelogicalindian.com/environment/illegal-stone-quarries-in-bannerghatta/',
 'https://timesofindia.indiatimes.com/city/pune/beed-girl-throws-light-on-rampant-mass-copying-before-child-rights-commission/articleshow/63472161.cms',
 'http://odishasuntimes.com/mahaprayan-ambulance-failure-in-odisha-body-carried-on-rickshaw-baby-delivered-in-auto/',
 'https://economictimes.indiatimes.com/news/politics-and-nation/congress-will-disappear-from-karnataka-like-other-states-prakash-javadekar/articleshow/63487965.cms',
 'https://timesofindia.indiatimes.com/india/congress-will-disappear-from-karnataka-like-other-states-prakash-javadekar/articleshow/63487837.cms',
 'https://www.japantimes.co.jp/news/2018/03/27/

In [18]:
from nltk import *
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

In [19]:
def penn_to_wn(tag):
    """ Convert between a Penn Treebank tag to a simplified Wordnet tag """
    if tag.startswith('N'):
        return 'n'
 
    if tag.startswith('V'):
        return 'v'
 
    if tag.startswith('J'):
        return 'a'
 
    if tag.startswith('R'):
        return 'r'
 
    return None
 
def tagged_to_synset(word, tag):
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None
    try:
        return wn.synsets(word, wn_tag)[0]
    except:
        return None

In [20]:
def sentence_similarity(sentence1, sentence2):
    """ compute the sentence similarity using Wordnet """
#     print(sentence1, sentence2)
    # Tokenize and tag
    sentence1 = pos_tag(word_tokenize(sentence1))
    sentence2 = pos_tag(word_tokenize(sentence2))
 
    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]
 
    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]

#     print(synsets1, synsets2)
    score, count = 0.0, 0
 
    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        scores = [synset.path_similarity(ss) for ss in synsets2]
        scores.append(0)
        best_score = max([score for score in scores if score is not None])
 
        # Check that the similarity could have been computed
        if best_score is not None:
            score += best_score
            count += 1
 
    # Average the values
    if count != 0:
        score /= count
    return score

def symmetric_sentence_similarity(sentence1, sentence2):
    """ compute the symmetric sentence similarity using Wordnet """
    return (sentence_similarity(sentence1, sentence2) + sentence_similarity(sentence2, sentence1)) / 2

In [21]:
def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
            if type(i) == Tree:
                current_chunk.append(i.label()[:3] + "-")
                current_chunk.append(" ".join([token for token, pos in i.leaves()]))
            elif current_chunk:
                named_entity = " ".join(current_chunk)
                if named_entity not in continuous_chunk:
                    continuous_chunk.append(named_entity)
                    current_chunk = []
            else:
                continue
    return continuous_chunk

In [22]:
def get_relation_tuples(sentences, image, image_caption):
    relation_tuples = []
    for sentence in sentences:
        similarity_score = symmetric_sentence_similarity(sentence, image_caption)
        rel_tuple_single = []
        rel_tuple_single = get_continuous_chunks(sentence)
        
        if len(rel_tuple_single):
            tokens = word_tokenize(sentence)
            tagged = pos_tag(tokens)

        entities = chunk.ne_chunk(tagged)
        for entity in entities:
            if len(entity)>1 and (entity[1] == 'VB' or entity[1] == 'VBD' or entity[1] == 'VBN' or entity[1] == 'VM'):
                if entity[0] not in stopwords.words('english'):
                    rel_tuple_single.append('REL-'+entity[0])
        rel_tuple_single.append('IMG-'+image)
        rel_tuple_single.append('RM-'+str(similarity_score))

        if len(rel_tuple_single):
            relation_tuples.append(rel_tuple_single)

    with open("input.txt", "a") as myfile:
        myfile.write(str(relation_tuples)+'\n')

In [23]:
from requests import get
from bs4 import BeautifulSoup

def get_image_caption(url):
    response = get(url)
    caption = ""
    html_soup = BeautifulSoup(response.text, 'html.parser')
    print(url)
    if url.find('indianexpress') is not -1:
        image_containers = html_soup.find_all('span', class_ = 'custom-caption')
        if len(image_containers):
            caption = image_containers[0].text
    elif url.find('deccanchronicle') is not -1:
        image_containers = html_soup.find_all('div', class_ = 'storyimg-caption')
        if len(image_containers):
            caption = image_containers[0].text
    return caption

In [25]:
articles = []

for i, url in enumerate(urls):
    print ('Article number %d' % (i+1))
    article = newspaper.Article(url=url, language='en')
    article.download()
    article.parse()
    article.nlp()
    try:
        get_relation_tuples(sent_tokenize(article.summary), article.top_image, get_image_caption(url))
        print('Added to inputs.txt')
    except:
        print ('Invalid Article')
    print ('----------------')

Article number 1
https://timesofindia.indiatimes.com/city/mumbai/6-year-old-abducted-girl-found-dead-in-railway-toilet-in-navsari/articleshow/63462708.cms
Added to inputs.txt
----------------
Article number 2
https://economictimes.indiatimes.com/news/politics-and-nation/sushma-swaraj-to-visit-japan-for-strategic-dialogue/articleshow/63488634.cms
Added to inputs.txt
----------------
Article number 3
https://thelogicalindian.com/environment/illegal-stone-quarries-in-bannerghatta/
Added to inputs.txt
----------------
Article number 4
https://timesofindia.indiatimes.com/city/pune/beed-girl-throws-light-on-rampant-mass-copying-before-child-rights-commission/articleshow/63472161.cms
Added to inputs.txt
----------------
Article number 5
http://odishasuntimes.com/mahaprayan-ambulance-failure-in-odisha-body-carried-on-rickshaw-baby-delivered-in-auto/
Added to inputs.txt
----------------
Article number 6
https://economictimes.indiatimes.com/news/politics-and-nation/congress-will-disappear-from-k