## HIV news article extraction from The Daily Observer

The following parameters are extracted for each article:
- Headline
- Description
- Author
- Published Date
- Category
- Publication
- News
- URL
- Keywords
- Summary

### Importing the necessary Libraries

In [1]:
from selenium.webdriver.chrome.options import Options # enables options in web browser
from selenium import webdriver # web-based automation tool for Python
from newspaper import Article # Article scraping & curation
from bs4 import BeautifulSoup # Python library for pulling data out of HTML and XML files
from requests import get # standard for making HTTP requests in Python
import pandas as pd # library written for data manipulation and analysis
import sys, time #  System-specific parameters and functions
from dateutil import parser #handling the date format

### Creating Empty lists for HIV News Articles parameters data to be extracted

In [2]:
# One list per extracted field, all starting empty; later cells append to them.
headlines = []
descriptions = []
dates = []
authors = []
news = []
keywords = []
summaries = []
urls = []
category = []
publication = []

### Finding the total number of pages from the total number of articles in the Google search results

In [3]:
keyword = 'HIV site:www.observerbd.com'

url = 'https://www.google.com/search?q=' + '+'.join(keyword.split())

soup = BeautifulSoup(get(url).text, 'lxml')

# The hit count is read from a line such as "About 680 results (0.23 seconds)"
# or "About 1,080 results (0.23 seconds)".
stats = soup.select_one('div#resultStats')
if stats is None:
    # select_one returns None when the element is absent; say so explicitly
    # instead of failing later with an AttributeError on None.
    raise RuntimeError("Could not find 'div#resultStats' in the response for " + url)

# Use the first whitespace-separated token that is a number once its thousands
# separators are removed. Joining *every* digit of the line would also pick up
# the elapsed-time digits ("0.23" -> "023") and inflate the count.
total_results = next(
    (int(token.replace(',', ''))
     for token in stats.text.split()
     if token.replace(',', '').isdigit()),
    None,
)
if total_results is None:
    raise RuntimeError('Could not read a result count from: ' + repr(stats.text))

# Assumes 10 results per page; one extra page is added on top of the rounded quotient.
max_pages = round(total_results / 10) + 1

### Paging through the search results (up to max_pages pages) and collecting the article URLs

In [4]:
options = Options()
# Passing the Chrome flag directly works across Selenium versions
# (the `headless` attribute is not available in every release).
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)

try:
    browser.get(url)

    # Bounded loop: at most max_pages result pages are visited, so an error on
    # any page can never turn into an endless loop.
    for index in range(1, max_pages + 1):
        soup = BeautifulSoup(browser.page_source, 'lxml')

        # Collect the href of the first <a> inside every '.r' element,
        # skipping elements that carry no link.
        for result in soup.select('.r'):
            if result.a is not None and result.a.has_attr('href'):
                urls.append(result.a['href'])

        if index == max_pages:
            break

        # find_elements returns an empty list (rather than raising) when the
        # "next page" link is absent, i.e. the last available page was reached.
        # The generic ('xpath', ...) form is used instead of find_element_by_xpath,
        # which newer Selenium releases no longer provide.
        next_links = browser.find_elements('xpath', '//*[@id="pnnext"]/span[2]')
        if not next_links:
            break

        try:
            next_links[0].click()
        except Exception as exc:
            # Keep what has been collected so far instead of retrying the same page forever.
            print('\nStopped paging at page ' + str(index) + ': ' + str(exc))
            break

        time.sleep(2)  # give the next result page time to load
        sys.stdout.write('\r' + str(index) + ' : ' + str(max_pages) + '\r')
        sys.stdout.flush()
finally:
    # Always release the browser process, even if paging failed.
    browser.quit()

23 : 75

### Removing duplicate URL entries from the list

In [5]:
# Dict keys are unique and keep insertion order, so this drops repeated links
# while preserving the order in which they were first seen.
urls = list({link: None for link in urls})
summary_line = "Total Extracted URL's are" + ' : ' + str(len(urls))
print(summary_line, type(urls))

Total Extracted URL's are : 231 <class 'list'>


### Looping over the URLs and scraping the parameters listed above from each article

In [6]:
%%time
def _safe(extract):
    """Call ``extract`` and return its result, or ``None`` if it raises."""
    try:
        return extract()
    except Exception:
        return None


def _published_date(page):
    """Return the publication date (YYYY-MM-DD) read from the page's '.pub' element."""
    # NOTE(review): split("at") also cuts inside words that contain "at"
    # (e.g. "Saturday"); confirm against the site's actual date text.
    raw = page.select_one('.pub').span.text.split(":")[1].split("at")[0]
    return str(parser.parse(raw)).split()[0]


result_lists = (headlines, descriptions, authors, dates, category,
                publication, news, keywords, summaries)

# Start from empty lists so that re-running this cell does not append a second
# copy of every row (which would break the length check against `urls`).
for field_values in result_lists:
    field_values.clear()

for index, url in enumerate(urls):
    try:
        # Raw HTML is parsed separately (used for the date); newspaper's Article
        # handles download, parsing and the NLP step (keywords / summary).
        soup = BeautifulSoup(get(url).text, 'lxml')
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()
    except Exception:
        # Page could not be fetched or processed: keep the row, every field missing.
        soup = article = None

    if article is None:
        for field_values in result_lists:
            field_values.append(None)
    else:
        # Each field is extracted independently and appended exactly once, so one
        # failing field can never leave the lists with different lengths.
        headlines.append(_safe(lambda: article.title.strip()))
        descriptions.append(_safe(lambda: ' '.join(article.meta_data['og']['description'].split())))
        # article.authors is a list of names, not a string; join it (None when empty).
        authors.append(_safe(lambda: ', '.join(name.strip() for name in article.authors) or None))
        dates.append(_safe(lambda: _published_date(soup)))
        category.append(_safe(lambda: article.meta_data['category']))
        news.append(_safe(lambda: ' '.join(article.text.split()).replace("\'\'"," ").replace("\'", "").replace(" / ", "")))
        publication.append(_safe(lambda: article.meta_data['og']['site_name']))
        keywords.append(_safe(lambda: article.keywords))
        summaries.append(_safe(lambda: ' '.join(article.summary.split())))

    sys.stdout.write('\r' + str(index) + ' : ' + str(url) + '\r')
    sys.stdout.flush()

Wall time: 16min 20sserverbd.com/details.php?id=99263ppg=20pg=3


### Checking Array Length of each list to create DataFrame

In [7]:
# Print the length of every result list on one line; they must all be equal
# before the lists can be combined into a DataFrame.
print(*(len(values) for values in (headlines, descriptions, authors, dates, category,
                                   publication, news, keywords, summaries, urls)))

231 231 231 231 231 231 231 231 231 231


### Creating a CSV file after checking the array lengths and dropping the missing values from the dataset

In [8]:
# Column name -> values, in the order the columns should appear in the CSV.
columns = {'Headlines' : headlines,
           'Descriptions' : descriptions,
           'Authors' : authors,
           'Published_Dates' : dates,
           'Publication' : publication,
           'Articles' : news,
           'category' : category,
           'Keywords' : keywords,
           'Summaries' : summaries,
           'Source_URLs' : urls}

# Every list must hold exactly one value per URL, otherwise rows would be misaligned.
# Failing here gives a clear message instead of a NameError on `tbl` further down.
lengths = {name: len(values) for name, values in columns.items()}
if len(set(lengths.values())) != 1:
    raise ValueError('Array length does not match: ' + str(lengths))

tbl = pd.DataFrame(columns)

# dropna() returns a new frame, so the result has to be assigned to take effect.
# Only rows where *every* scraped field is missing (the article could not be
# fetched at all) are dropped; a plain dropna() would also discard every article
# that merely lacks one field such as the author or the date.
scraped_columns = [name for name in tbl.columns if name != 'Source_URLs']
tbl = tbl.dropna(how='all', subset=scraped_columns)

# NOTE(review): machine-specific absolute path; the directory must already exist.
path = 'D:\\#Backups\\Desktop\\!Code!\\CDRI\\HIV\\Data Extraction\\#Datasets\\'
tbl.to_csv(path+'The_Daily_Observer.csv', index=False)

tbl.head()

Unnamed: 0,Headlines,Descriptions,Authors,Published_Dates,Publication,Articles,category,Keywords,Summaries,Source_URLs
0,London HIV patient becomes world's second AIDS...,"London, Mar 5: An HIV-positive man in Britain ...",,2019-03-06,The Daily Observer,"London, Mar 5: An HIV-positive man in Britain ...",{},"[london, patient, hope, man, hiv, cure, gupta,...","We can't detect anything,"" said Ravindra Gupta...",https://www.observerbd.com/details.php?id=186899
1,90 infected by HIV syringe,"KARACHI, Apr 3: At least 90 people, including ...",,,The Daily Observer,"KARACHI, Apr 3: At least 90 people, including ...",{},"[children, 90, hiv, using, 65, syringe, doctor...","KARACHI, Apr 3: At least 90 people, including ...",https://www.observerbd.com/details.php?id=196017
2,Third HIV patient ‘cured’ of virus after bone ...,"LONDON, Mar 9: A third, previously HIV-positiv...",,2019-03-10,The Daily Observer,Third HIV patient ‘cured’ of virus after bone ...,{},"[london, patient, cells, case, ccr5, hiv, syst...",Third HIV patient ‘cured’ of virus after bone ...,http://www.observerbd.com/details.php?id=187553
3,Daily Observer,Most Popular Online Newportal in Bangladesh,,,Daily Observer,"London, Mar 5: An HIV-positive man in Britain ...",{},"[london, observer, cleared, man, daily, mar, k...","London, Mar 5: An HIV-positive man in Britain ...",http://www.observerbd.com/cat.php?cd=1&key=HIV
4,HIV patient gifts kidney,"WASHINGTON, Mar 29: The kidney of a 35-year-ol...",,,The Daily Observer,"WASHINGTON, Mar 29: The kidney of a 35-year-ol...",{},"[virus, patient, woman, washington, kidney, gi...","WASHINGTON, Mar 29: The kidney of a 35-year-ol...",http://www.observerbd.com/details.php?id=190823


In [9]:
# (rows, columns) of the assembled table
tbl.shape

(231, 10)