# News Extraction

### Import Packages

In [39]:
from bs4 import BeautifulSoup
from requests import get

### Creating a Function to Extract only Text from `<p>` Tags

In [40]:
def get_only_text(url, timeout=10):
    """Fetch a web page and return its title and paragraph text.

    Args:
        url: Address of the article to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Defaults to 10.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the stripped strings of the
        page's ``<title>`` element joined with single spaces, or ``""`` when
        the page has no ``<title>``. ``text`` is the text of every ``<p>``
        element joined with single spaces.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    page = get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx responses instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    text = " ".join(p.get_text() for p in soup.find_all("p"))
    # soup.title is None when the document has no <title> element.
    title_tag = soup.title
    title = " ".join(title_tag.stripped_strings) if title_tag is not None else ""
    return title, text

### Calling the function with the desired News URL

In [41]:
# get_only_text returns a (title, text) tuple, so `text` holds both the page
# title (index 0) and the joined <p> text (index 1).
text = get_only_text("https://en.wikinews.org/wiki/Football:_Italian_club_AC_Milan_sacks_Vincenzo_Montella_as_manager")

In [43]:
# Display the (title, text) tuple returned by get_only_text.
text

('Football: Italian club AC Milan sacks Vincenzo Montella as manager - Wikinews, the free news source',
 "Thursday, November 30, 2017\xa0 On Monday, Italian football club AC Milan sacked manager Vincenzo Montella. Gennaro Gatusso, manager of AC Milan's youth team, was named as Manetella's successor.\n Monetella's team started this season with winning ten out of twelve fixtures in all competitions. However, in fourteen Italian Serie A league matches played so far, Monetella's men won just six matches, losing six and drawing two matches. Having spent £205 million in the summer transfer window, AC Milan has won just 20 points in Serie A. They are seventh in the league table, and eighteen points behind the league leaders SSC Napoli.\n The Milanese club named Gatusso as their new manager. Gatusso played for Milan from 1999 till 2013. As a player, Gatusso has won two Serie A titles and two UEFA Champions League titles with AC Milan. Since 2013, AC Milan has not finished in the top three spot

### Number of Words - Original Text

In [44]:
# Print page title

In [45]:
# text[0] is the first element of the tuple returned by get_only_text: the page title.
print(text[0])

Football: Italian club AC Milan sacks Vincenzo Montella as manager - Wikinews, the free news source


In [46]:
# print number of words from <p> tags

In [47]:
# Count the whitespace-separated words in the joined <p> text (text[1]).
len(text[1].split())

345

# Summarization

In [48]:
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords

In [49]:
from transformers import pipeline

# Initialize the summarization pipeline with a specific model
model_name = "sshleifer/distilbart-cnn-12-6"
summarizer = pipeline("summarization", model=model_name)

# Generate the summary of the article body (text[1]).
# truncation=True asks the tokenizer to clip inputs that exceed the model's
# maximum input length, so long articles are shortened rather than passed
# through whole. do_sample=False keeps generation deterministic.
summary = summarizer(
    text[1],
    max_length=100,
    min_length=50,
    do_sample=False,
    truncation=True,
)


In [50]:
# Display the raw pipeline result: a list of dicts keyed by 'summary_text'.
summary

[{'summary_text': " AC Milan sacked manager Vincenzo Montella on Monday . Gennaro Gatusso, manager of AC Milan's youth team, named as new manager . Milan have won just six of their 14 Serie A league matches so far this season . The Rossoneri are currently seventh in the league table ."}]

### Printing the Summarized Text

In [58]:
# Print the page title, a "Summary : " label, and the first pipeline result,
# each on its own line.
# NOTE: summary[0] is the whole result dict, so its repr is what gets printed;
# index it with ['summary_text'] to show only the summary string.
print("Title : " + text[0], "Summary : ", summary[0], sep="\n")

Title : Football: Italian club AC Milan sacks Vincenzo Montella as manager - Wikinews, the free news source
Summary : 
{'summary_text': " AC Milan sacked manager Vincenzo Montella on Monday . Gennaro Gatusso, manager of AC Milan's youth team, named as new manager . Milan have won just six of their 14 Serie A league matches so far this season . The Rossoneri are currently seventh in the league table ."}
