# Extract news articles from a website

In [32]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [6]:
# website url
url = 'https://www.hirunews.lk/english/local-news.php?pageID=1'

# download the content of the page
data = requests.get(url).text

# create a BeautifulSoup object
soup = BeautifulSoup(data, "html.parser")

In [83]:
# See the nested structure of the web page
#print(soup.prettify())

In [84]:
all_news = soup.find_all(class_="all-section-tittle")  # finding the tags

# news titles of the page
for title in all_news:
    print(title.text)


Eid Ul Fitr festival on Tuesday - moon not sighted 


Gazette issued making full Covid vaccination mandatory, repealed 


Sri Lanka Rugby Asian membership suspended by Asia Rugby 


No medicines for baby - couple assaulted for their FB post by MP's brother (Pics)


Pakistan's state-run PTV suspends 17 officials


New Guidelines for Fuel Transport


Demonstration Alert: U.S. Embassy Colombo, Sri Lanka 


Mirissa prize catch - fish weighing over 500 kilos (Video)


Poor economic management has led to instability - Ranil Wickremesinghe (Video) 


Liquor outlets closed along the May day procession routes 


Two suspects arrested for illegally storing Gas tanks 


Suranga Lakmal shines with a maiden 5 wicket haul for Derbyshire 


May day: International Workers' Day celebrated around the world


Showers expected in several areas 


23 trapped, 39 missing after building collapses in China



In [33]:
all_news[0].text

'\nSri Lanka Rugby Asian membership suspended by Asia Rugby \n'

In [28]:
# link to the whole content
url2 = all_news[0].find('a').get('href')
url2

'https://www.hirunews.lk/english/303272/sri-lanka-rugby-asian-membership-suspended-by-asia-rugby'

In [29]:
# get the news content by loading the url
data2 = requests.get(url2).text
soup2 = BeautifulSoup(data2, "html.parser")
soup2.find(id="article-phara").text

'\nAsia Rugby has announced the suspension of the membership of Sri Lanka Rugby: At an Extraordinary Sub Meeting of Asia Rugby Executive Committee (EXCO) it was unanimously decided to ratify the decision of the EXCO at its meeting on 9th April 2022 in Thailand to impose a full and immediate membership suspension until further notice.The EXCO discussed the status report from the Asia Rugby delegate who travelled to Sri Lanka for the period 20th to 24th April 2022, the delegate has met with the relevant stakeholders the Sri Lanka Ministry of Youth and Sport, Sri Lanka National Olympic Committee, the suspended Sri Lanka Rugby, various Clubs’ representatives and the Rugby Referees Association; a consensus was established by all stakeholders that a fresh election is a necessity. Furthermore, the status report transpires a major concern that main stakeholders are not fairly represented in the decision-making environment.In maintaining a sport neutrality and appropriate governance, Asia Rugby

In [34]:
# create a dataframe
df = pd.DataFrame(columns=['Title', 'Content'])

for article in all_news:
    if article != []:
        #title
        title = article.text
        title = re.sub("\n", "", title)
        
        #content
        data2 = requests.get(article.find('a').get('href')).text
        soup2 = BeautifulSoup(data2, "html.parser")
        content = soup2.find(id="article-phara").text
        content = re.sub("\n", "", content)
        content = re.sub("\u200d", "", content)
        
        # fill dataframe - append the dictionary
        df = df.append({
            'Title':title,
            'Content':content
        }, ignore_index=True)

In [36]:
print(df.shape)
df

(15, 2)


Unnamed: 0,Title,Content
0,Sri Lanka Rugby Asian membership suspended by ...,Asia Rugby has announced the suspension of the...
1,No medicines for baby - couple assaulted for t...,The Kirindiwela Police say that a couple has b...
2,New Guidelines for Fuel Transport,New Guidelines for Fuel Transport: Train fuel ...
3,"Demonstration Alert: U.S. Embassy Colombo, Sri...",The US Embassy in Sri Lanka has issued a “demo...
4,Mirissa prize catch - fish weighing over 500 k...,A group of fishermen who left the Weligama - M...
5,Poor economic management has led to instabilit...,Former Prime Minister and UNP leader Ranil Wic...
6,Liquor outlets closed along the May day proces...,The Excise Department has announced that liquo...
7,Two suspects arrested for illegally storing Ga...,Officers of the Intelligence Division of the S...
8,Suranga Lakmal shines with a maiden 5 wicket h...,Suranga Lakmal’s maiden five-wicket haul for D...
9,May day: International Workers' Day celebrated...,"May Day, also known as International Workers' ..."


In [37]:
df.Content[0]

'Asia Rugby has announced the suspension of the membership of Sri Lanka Rugby: At an Extraordinary Sub Meeting of Asia Rugby Executive Committee (EXCO) it was unanimously decided to ratify the decision of the EXCO at its meeting on 9th April 2022 in Thailand to impose a full and immediate membership suspension until further notice.The EXCO discussed the status report from the Asia Rugby delegate who travelled to Sri Lanka for the period 20th to 24th April 2022, the delegate has met with the relevant stakeholders the Sri Lanka Ministry of Youth and Sport, Sri Lanka National Olympic Committee, the suspended Sri Lanka Rugby, various Clubs’ representatives and the Rugby Referees Association; a consensus was established by all stakeholders that a fresh election is a necessity. Furthermore, the status report transpires a major concern that main stakeholders are not fairly represented in the decision-making environment.In maintaining a sport neutrality and appropriate governance, Asia Rugby c

## Go through pagination

#### 1. using 'next page'(>>) icon in the pagination

In [54]:
url = "https://www.hirunews.lk/english/local-news.php?pageID=1"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")

In [None]:
# go through all pages until last page - ****This Takes time as website has huge number of pages****
urls = []
currSoup = soup 
urls.append(url)  # 1st page

while currSoup.find(title ="next page"):
    nextPage = currSoup.find(title ="next page").get('href')
    urls.append(nextPage)
    
    rNext = requests.get(nextPage)
    currSoup = BeautifulSoup(rNext.content, "html.parser")

urls.append(currSoup.find(title ="next page").get('href')) # last page

In [59]:
for url in urls:
    print(url)

#### 2. Proving the urls manually

In [76]:
url_common = "https://www.hirunews.lk/english/local-news.php?pageID="
urls = []

In [77]:
# create url list - for simplicity, lets take only 2 urls
for x in range(1, 3):
    urls.append(url_common+str(x))

In [78]:
for url in urls:
    print(url)

https://www.hirunews.lk/english/local-news.php?pageID=1
https://www.hirunews.lk/english/local-news.php?pageID=2


Create the dataset by going through pagination. Here, we provide limited number of urls manually. Because we can not use >> icon to go through pages as this website has so many pages(It takes time).

In [79]:
df2 = pd.DataFrame(columns = ['Title', 'Content', 'ContentID'])

In [80]:
def get_data(url):
    global df2
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    all_news = soup.find_all(class_ = "all-section-tittle")  # all news per page
    
    for one_news in all_news:
        if (one_news != []):
            # title
            title = one_news.text
            pattern1 = "\n|\u200d"                    
            title = re.sub(pattern1, "", title)

            # content
            contentLink = one_news.find('a').get('href')  # link to the conent page
            r_content = requests.get(contentLink) 
            contentSoup = BeautifulSoup(r_content.content, "html.parser")

            content = contentSoup.find(id="article-phara").text
            pattern2 = "\n|\u200d"                                 
            content = re.sub(pattern2, "", content)

            # content id
            content_id = re.findall("www.hirunews.lk\/english\/(\d{1,})", contentLink)[0]
            
            # fill columns (append a dictionary)
            df2 = df2.append({
                "Title" : title,
                "Content" : content,
                "ContentID" : content_id
            }, ignore_index=True)

In [81]:
import concurrent.futures


with concurrent.futures.ThreadPoolExecutor() as executor:      # this reduces the time taken
    executor.map(get_data, urls)

In [82]:
df2

Unnamed: 0,Title,Content,ContentID
0,Eid Ul Fitr festival on Tuesday - moon not sig...,Sri Lankan Muslims will be celebrating the Eid...,303276
1,Special traffic plan for tomorrow,A special traffic plan will be in place for Co...,303206
2,Sri Lanka Rugby Asian membership suspended by ...,Asia Rugby has announced the suspension of the...,303272
3,"""Open up registration for new suppliers to tra...",Kanchana Wijesekera -Minister of Power & Energ...,303202
4,"Treasury bill auction for Rs. 97,500 million t...","A Treasury bill auction worth Rs. 97,500 milli...",303201
5,No medicines for baby - couple assaulted for t...,The Kirindiwela Police say that a couple has b...,303269
6,Pakistan's state-run PTV suspends 17 officials,Pakistan's state-run PTV has suspended 17 offi...,303267
7,Sangha pledge issued calling for the PM and th...,A Sangha pledge was issued today that the Prim...,303198
8,New Guidelines for Fuel Transport,New Guidelines for Fuel Transport: Train fuel ...,303263
9,Jacqueline Fernandez's assets worth INR 72M at...,The Indian Enforcement Directorate on Saturday...,303180


Here we store the article id. This is helpful when we want to add only new articles to the database. We can check if the article Id is present in the dataset. If it is not, we apped that article to the dataset. Otherwise ignore