In [1]:
import newspaper

In [2]:
# Load the article URLs (one per line) from the local 'urls' file.
# The context manager closes the file handle, which the earlier bare open() never did.
with open('urls', 'r') as url_file:
    # Strip only the trailing newline and skip blank lines so they are not
    # handed to the downloader as empty URLs.
    urls = [line.rstrip('\n') for line in url_file if line.strip()]

urls

['https://timesofindia.indiatimes.com/city/mumbai/6-year-old-abducted-girl-found-dead-in-railway-toilet-in-navsari/articleshow/63462708.cms',
 'https://economictimes.indiatimes.com/news/politics-and-nation/sushma-swaraj-to-visit-japan-for-strategic-dialogue/articleshow/63488634.cms',
 'https://thelogicalindian.com/environment/illegal-stone-quarries-in-bannerghatta/',
 'https://timesofindia.indiatimes.com/city/pune/beed-girl-throws-light-on-rampant-mass-copying-before-child-rights-commission/articleshow/63472161.cms',
 'http://odishasuntimes.com/mahaprayan-ambulance-failure-in-odisha-body-carried-on-rickshaw-baby-delivered-in-auto/',
 'https://economictimes.indiatimes.com/news/politics-and-nation/congress-will-disappear-from-karnataka-like-other-states-prakash-javadekar/articleshow/63487965.cms',
 'https://timesofindia.indiatimes.com/india/congress-will-disappear-from-karnataka-like-other-states-prakash-javadekar/articleshow/63487837.cms',
 'https://www.japantimes.co.jp/news/2018/03/27/

In [3]:
from nltk import *
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

In [4]:
def penn_to_wn(tag):
    """Map a Penn Treebank POS tag onto a simplified WordNet POS letter.

    Tags starting with 'N', 'V', 'J' or 'R' map to 'n', 'v', 'a' and 'r'
    respectively; any other tag yields None.
    """
    prefix_to_wordnet = (('N', 'n'), ('V', 'v'), ('J', 'a'), ('R', 'r'))
    for prefix, wordnet_tag in prefix_to_wordnet:
        if tag.startswith(prefix):
            return wordnet_tag
    return None
 
def tagged_to_synset(word, tag):
    """Return the first WordNet synset for a POS-tagged word, or None.

    Args:
        word: the token text.
        tag: the Penn Treebank tag for the token (converted via penn_to_wn).

    Returns:
        The first synset returned by ``wn.synsets(word, wn_tag)``, or None when
        the tag has no WordNet equivalent or WordNet has no entry for the word.
    """
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None
    # Check for an empty result explicitly instead of indexing under a bare
    # `except:`, which also swallowed KeyboardInterrupt and masked real
    # problems such as a missing WordNet corpus.
    synsets = wn.synsets(word, wn_tag)
    return synsets[0] if synsets else None

In [11]:
def sentence_similarity(sentence1, sentence2):
    """Compute a directional WordNet similarity from sentence1 to sentence2.

    Each word of sentence1 that resolves to a synset is matched against the
    synsets of sentence2; its contribution is the highest path similarity
    found, or 0 when nothing comparable exists. The result is the mean of
    those contributions.

    Args:
        sentence1: the sentence whose words are scored.
        sentence2: the sentence the words are compared against.

    Returns:
        The average best-match similarity as a float, or 0.0 when sentence1
        yields no synsets.
    """
    # Tokenize and tag
    tagged1 = pos_tag(word_tokenize(sentence1))
    tagged2 = pos_tag(word_tokenize(sentence2))

    # Get the synsets for the tagged words, dropping words without one
    synsets1 = [ss for ss in (tagged_to_synset(*tagged_word) for tagged_word in tagged1) if ss]
    synsets2 = [ss for ss in (tagged_to_synset(*tagged_word) for tagged_word in tagged2) if ss]

    if not synsets1:
        return 0.0

    total = 0.0
    for synset in synsets1:
        similarities = [synset.path_similarity(other) for other in synsets2]
        # path_similarity may yield None (filtered out here). The leading 0 is
        # the floor used when nothing is comparable, so max() never sees an
        # empty list. The loop variable is deliberately not named like the
        # accumulator: the earlier `score for score in ...` shadowed it and
        # would reset the running total on interpreters where comprehension
        # variables leak.
        total += max([0] + [value for value in similarities if value is not None])

    # Every synset of sentence1 contributes exactly one term to the average.
    return total / len(synsets1)

def symmetric_sentence_similarity(sentence1, sentence2):
    """Average the two directional WordNet sentence similarities."""
    forward = sentence_similarity(sentence1, sentence2)
    backward = sentence_similarity(sentence2, sentence1)
    return (forward + backward) / 2

In [12]:
def get_continuous_chunks(text):
    """Extract named-entity strings from text, merging adjacent entities.

    The text is tokenized, POS-tagged and passed through ne_chunk. Each run of
    consecutive entity subtrees is joined with spaces into one string, where
    every entity contributes the first three characters of its label followed
    by '-' and then its tokens (e.g. 'PER- John Smith').

    Args:
        text: the raw sentence or passage to analyse.

    Returns:
        A list of unique entity strings in order of first appearance.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    # The trailing None acts as a sentinel so that an entity ending the text is
    # flushed as well; previously such an entity was silently dropped.
    for node in list(chunked) + [None]:
        if isinstance(node, Tree):
            current_chunk.append(node.label()[:3] + "-")
            current_chunk.append(" ".join([token for token, pos in node.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            # Always start a fresh run. Resetting only for unseen entities let
            # a repeated entity's tokens leak into the next entity.
            current_chunk = []
    return continuous_chunk

In [13]:
def get_relation_tuples(sentences, image, image_caption):
    """Build one relation tuple per sentence and append them to input.txt.

    Each tuple is a list holding the sentence's named-entity strings, a
    'REL-<verb>' entry for every non-stopword verb, an 'IMG-<image>' entry and
    an 'RM-<score>' entry with the symmetric similarity between the sentence
    and the image caption.

    Args:
        sentences: iterable of sentence strings.
        image: image identifier/URL, concatenated into the 'IMG-' entry.
        image_caption: caption text the sentences are compared against.

    Side effects:
        Appends the string form of the list of tuples, plus a newline, to
        'input.txt' in the current working directory.
    """
    # NOTE(review): 'VM' is not a Penn Treebank tag; kept so the set of
    # accepted tags is unchanged - confirm whether another tag was intended.
    verb_tags = ('VB', 'VBD', 'VBN', 'VM')
    # Build the stopword set once instead of reloading the list for every verb.
    english_stopwords = set(stopwords.words('english'))

    relation_tuples = []
    for sentence in sentences:
        similarity_score = symmetric_sentence_similarity(sentence, image_caption)
        rel_tuple_single = get_continuous_chunks(sentence)

        # Always tag the current sentence. Previously `tagged` was assigned
        # only when named entities were found, so an entity-free sentence
        # either raised NameError or reused the previous sentence's tags.
        tagged = pos_tag(word_tokenize(sentence))

        for entity in chunk.ne_chunk(tagged):
            # Plain tokens are (word, tag) pairs; entity subtrees never match
            # because their second element is not a tag string.
            if len(entity) > 1 and entity[1] in verb_tags and entity[0] not in english_stopwords:
                rel_tuple_single.append('REL-' + entity[0])

        rel_tuple_single.append('IMG-' + image)
        rel_tuple_single.append('RM-' + str(similarity_score))
        relation_tuples.append(rel_tuple_single)

    with open("input.txt", "a") as myfile:
        myfile.write(str(relation_tuples) + '\n')

In [14]:
from requests import get
from bs4 import BeautifulSoup

def get_image_caption(url, timeout=30):
    """Fetch the caption of the lead image for a supported news site.

    Only URLs containing 'indianexpress' or 'deccanchronicle' are supported;
    for any other URL an empty caption is returned without downloading the
    page.

    Args:
        url: the article URL.
        timeout: seconds to wait for the HTTP request before giving up, so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        The text of the first matching caption element, or "" when the site
        is unsupported or the page has no such element.

    Raises:
        Whatever the underlying HTTP request raises (including on timeout).
    """
    # (substring identifying the site, caption tag name, caption CSS class)
    caption_selectors = (
        ('indianexpress', 'span', 'custom-caption'),
        ('deccanchronicle', 'div', 'storyimg-caption'),
    )
    print(url)

    # Use substring membership rather than `url.find(...) is not -1`, which
    # compared integers by identity and only worked by interpreter accident.
    selector = next(
        ((tag_name, css_class) for site, tag_name, css_class in caption_selectors if site in url),
        None,
    )
    if selector is None:
        # No caption rule for this site: skip the pointless download.
        return ""

    tag_name, css_class = selector
    response = get(url, timeout=timeout)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    image_containers = html_soup.find_all(tag_name, class_=css_class)
    return image_containers[0].text if image_containers else ""

In [15]:
articles = []

# Download, summarise and convert every article into relation tuples.
for i, url in enumerate(urls):
    print('Article number %d' % (i + 1))
    try:
        # Fetching and parsing live inside the try block so that one
        # unreachable or malformed article does not abort the whole run.
        article = newspaper.Article(url=url, language='en')
        article.download()
        article.parse()
        article.nlp()
        get_relation_tuples(sent_tokenize(article.summary), article.top_image, get_image_caption(url))
        # get_relation_tuples appends to 'input.txt'; the message now names that file.
        print('Added to input.txt')
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that interrupting
        # the kernel still stops the loop, and report why the article failed.
        print('Invalid Article: %s' % exc)
    print('----------------')

Article number 1
https://timesofindia.indiatimes.com/city/mumbai/6-year-old-abducted-girl-found-dead-in-railway-toilet-in-navsari/articleshow/63462708.cms
MUMBAI: A 6-year-old girl who was abducted by an unidentified woman from outside her building on Saturday was found dead in a ladies toilet at Navsari railway station in Gujarat The Tuling police in Nalasopara (east) had prepared a sketch of the kidnapper who was vaguely captured on a closed circuit television (CCTV) outside the Sai Arpan building in Vijay Nagar where the girl Anjali Saroj stayed with her parents.The CCTV footage showed Anjali being forcibly taken away by the woman at around 8 pm. 
[Synset('mumbai.n.01'), Synset('girl.n.01'), Synset('be.v.01'), Synset('kidnap.v.01'), Synset('unidentified.s.01'), Synset('woman.n.01'), Synset('building.n.01'), Synset('saturday.n.01'), Synset('be.v.01'), Synset('establish.v.01'), Synset('dead.a.01'), Synset('lady.n.01'), Synset('toilet.n.01'), Synset('railway.n.01'), Synset('station.n.0

[Synset('india.n.01'), Synset('japan.n.01'), Synset('have.v.01'), Synset('strong.a.01'), Synset('cooperation.n.01'), Synset('area.n.01'), Synset('nuclear.a.01'), Synset('energy.n.01'), Synset('defense_mechanism.n.01'), Synset('science.n.01'), Synset('mumbai.n.01'), Synset('high.n.01'), Synset('speed.n.01'), Synset('railing.n.01'), Synset('undertaking.n.01'), Synset('be.v.01'), Synset('establish.v.08'), Synset('superintendent.n.02'), Synset('speed.n.01'), Synset('train.n.01'), Synset('be.v.01'), Synset('today.n.01'), Synset('large.a.01'), Synset('investor.n.01'), Synset('india.n.01'), Synset('turn.v.07'), Synset('presence.n.01'), Synset('infrastructure.n.01'), Synset('undertaking.n.01'), Synset('fabrication.n.03'), Synset('fiscal.a.01'), Synset('market.n.01'), Synset('official.a.01'), Synset('figure.n.01'), Synset('japanese.a.01'), Synset('india.n.01'), Synset('be.v.01'), Synset('addition.n.03'), Synset('cent.n.01'), Synset('last.s.01'), Synset('trade.n.01'), Synset('reach.v.01'), Synse

[] [Synset('girl.n.01'), Synset('be.v.01'), Synset('age.n.01'), Synset('group.n.01'), Synset('merely.r.01'), Synset('not.r.01'), Synset('state.v.01'), Synset('teacher.n.01'), Synset('be.v.01'), Synset('keep.v.01'), Synset('responsible.a.01'), Synset('student.n.01'), Synset('fail.v.01'), Synset('perform.v.01')]
“On the one side, the government wants to shut down schools, while on the other hand, schools are being ruined by teachers and management though the students have the desire to learn,” said Deshpande. 
[Synset('side.n.01'), Synset('government.n.01'), Synset('desire.v.01'), Synset('close.v.01'), Synset('down.r.01'), Synset('school.n.01'), Synset('other.a.01'), Synset('hand.n.01'), Synset('school.n.01'), Synset('be.v.01'), Synset('be.v.01'), Synset('destroy.v.02'), Synset('teacher.n.01'), Synset('management.n.01'), Synset('student.n.01'), Synset('have.v.01'), Synset('desire.n.01'), Synset('learn.v.01'), Synset('state.v.01')] []
 “On the one side, the government wants to shut down s

https://timesofindia.indiatimes.com/india/congress-will-disappear-from-karnataka-like-other-states-prakash-javadekar/articleshow/63487837.cms
NEW DELHI: Union minister Prakash Javadekar , the BJP 's in-charge for Karnataka polls, on Tuesday claimed the Congress will "disappear" from the state like it has from other parts of the country and accused the Siddaramaiah government of being "anti-farmer and anti-poor".On a day the Election Commission announced May 12 as the polling date in the state, Javadekar claimed that the people there are eager to throw out the Congress government due to its "divide and rule policy". 
[Synset('new.a.01'), Synset('delhi.n.01'), Synset('union.n.01'), Synset('curate.n.01'), Synset('karnataka.n.01'), Synset('polls.n.01'), Synset('tuesday.n.01'), Synset('claim.v.01'), Synset('congress.n.01'), Synset('disappear.v.01'), Synset('state.n.01'), Synset('have.v.01'), Synset('other.a.01'), Synset('parts.n.01'), Synset('state.n.04'), Synset('accuse.v.01'), Synset('gov

KeyboardInterrupt: 