## HIV News articles extraction from Daily Sun. Data Extraction of following parameters
- Headline
- Description
- Author
- Published_Date
- News
- URL
- Keywords
- Summary

### Importing the necessary Libraries

In [1]:
from selenium.webdriver.chrome.options import Options # enables options in web browser
from selenium import webdriver # web-based automation tool for Python
from newspaper import Article # Article scraping & curation
from bs4 import BeautifulSoup # Python library for pulling data out of HTML and XML files
from requests import get # standard for making HTTP requests in Python
import pandas as pd # library written for data manipulation and analysis
import sys, time #  System-specific parameters and functions

### Creating Empty lists for HIV News Articles parameters data to be extracted

In [2]:
headlines, descriptions, dates, authors, news, keywords, summaries, urls = [], [], [], [], [], [], [], []

### Finding the total no.of.pages by total no.of articles from google search results¶

In [3]:
keyword = 'HIV site:www.dailysun.co.za'

url = 'https://www.google.com/search?q=' + '+'.join(keyword.split())

soup = BeautifulSoup(get(url).text, 'lxml')
try:
    # Extracts the digits if it the resulted number without comma ','. eg: About 680 results (0.23 seconds)
    max_pages = round([int(s) for s in soup.select_one('div#resultStats').text.split() if s.isdigit()][0]/10)
    max_pages = max_pages + 1
except:
    # Extracts the digits if it the resulted number without comma ','. eg: About 1,080 results (0.23 seconds)
    max_pages = round(int(''.join(i for i in soup.select_one('div#resultStats').text if i.isdigit()))/10)
    max_pages = max_pages + 1

### Iterates max_pages value through while loop. Scraping the Articles urls

In [4]:
options = Options()
options.headless = True
browser = webdriver.Chrome(options=options)
browser.get(url)

index = 0

while True:
    try:
        index +=1
        page = browser.page_source
        soup = BeautifulSoup(page, 'lxml')
        linky = [soup.select('.r')[i].a['href'] for i in range(len(soup.select('.r')))]
        urls.extend(linky)
        if index == max_pages:
            break
        browser.find_element_by_xpath('//*[@id="pnnext"]/span[2]').click()
        time.sleep(2)
        sys.stdout.write('\r' + str(index) + ' : ' + str(max_pages) + '\r')
        sys.stdout.flush()
    except:
        pass
    
browser.quit()

24 : 79

### To remove duplicates urls entries in the list by executing below line

In [5]:
urls = list(dict.fromkeys(urls))
print(len(urls), type(urls))

239 <class 'list'>


### Iterates urls through for loop. Scraping the Articles with above parameters

In [6]:
%%time
for index, url in enumerate(urls):
    try:
        # Parse the url to NewsPlease 
        soup = BeautifulSoup(get(url).text, 'lxml')
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()
        
        # Extracts the Headlines
        try:
            try:
                headlines.append(soup.select_one('meta[property="og:title"]')['content'].strip())
            except:
                headlines.append(article.title.strip())
        except:
            headlines.append(None)
            
        # Extracts the Descriptions    
        try:
            try:
                descriptions.append(soup.select_one('meta[name="description"]')['content'].strip().replace('\n', ' '))
            except:
                descriptions.append(article.meta_description.strip())
        except:
            descriptions.append(None)
            
        # Extracts the Authors
        try:
            try:
                authors.append(soup.select_one('meta[name="cXenseParse:tss-byline"]')['content'].strip())
            except:
                authors.append(article.authors.strip())
        except:
            authors.append(None)
        
        # Extracts the published dates
        try:
            try:
                dates.append(soup.select_one('meta[name="cXenseParse:recs:publishtime"]')['content'].strip())
            except:
                dates.append(str(article.publish_date))
        except:
            dates.append(None)
            
        # Extracts the news articles
        try:
            try:
                news.append(soup.select_one('div.content').text.replace('\n', '').strip())
            except:
                news.append(article.text.strip())
        except:
            news.append(None)
            
        # Extracts Keywords and Summaries    
        try:
            keywords.append(article.keywords)
            summaries.append(article.summary)
        except:
            keywords.append(None)
            summaries.append(None)
            
    except:
        headlines.append(None)
        descriptions.append(None)
        authors.append(None)
        dates.append(None)
        news.append(None)
        keywords.append(None)
        summaries.append(None)

    sys.stdout.write('\r' + str(index) + ' : ' + str(url) + '\r')
    sys.stdout.flush()

Wall time: 14min 18silysun.co.za/News/International/somizi-cancels-mampintsha-gig-20190304es-offence-20160313mobile=trueue8cb-8387-4d97-bc9a-712b49d58220&mobile=true


### Checking Array Length of each list to create DataFrame

In [7]:
print(len(headlines), len(descriptions), len(authors), len(dates), len(news), len(keywords), len(summaries), len(urls))

239 239 239 239 239 239 239 239


### Creating a csv file after checking array length and droping the missing values from the dataset

In [8]:
tbl = pd.DataFrame({'Headlines' : headlines,
                    'Descriptions' : descriptions,
                    'Authors' : authors,
                    'Published_Dates' : dates, 
                    'Articles' : news,
                    'Keywords' : keywords,
                    'Summaries' : summaries,})
#tbl = tbl.dropna()
tbl.to_csv('Daily_Sun.csv', index=False)
tbl.head()

Unnamed: 0,Headlines,Descriptions,Authors,Published_Dates,Articles,Keywords,Summaries
0,MAKER OF POTENTIAL HIV CURE SPEAKS UP!,"News of a ""potential HIV cure"" had many on soc...",,2018-11-08T19:00:03.000Z,"News of a ""potential HIV cure"" had many on soc...","[potential, need, trial, trials, results, medi...","News of a ""potential HIV cure"" had many on soc..."
1,SECOND PERSON ‘CURED’ OF HIV,A LONDON man appears to be free of HIV after a...,Health24,2019-03-05T13:05:29.000Z,A LONDON man appears to be free of HIV after a...,"[stem, timothy, cured, transplant, brown, hiv,...",A LONDON man appears to be free of HIV after a...
2,AIDS CURE A POSSIBILITY!,"FORMER president Jacob Zuma’s boasting, in a t...",,2019-02-03T06:00:22.000Z,"FORMER president Jacob Zuma’s boasting, in a t...","[aids, programme, possibility, president, mill...","First, Zuma wrote that more than3,9 million pe..."
3,LOVE'S STRONGER THAN HIV,Dear MizzB.,,2019-02-17T05:30:02.000Z,Dear MizzB I AM a 24-year-old HIV-positive wom...,"[hivpositive, loves, partner, having, hiv, sex...",Dear MizzBI AM a 24-year-old HIV-positive woma...
4,MY SON GOT HIV FROM RAPIST DAD!,A MUM is still trying to come to terms with th...,Lethabo Khambule,2018-11-13T08:00:06.000Z,A MUM is still trying to come to terms with th...,"[father, closed, son, rapist, told, hiv, worke...",A MUM is still trying to come to terms with th...
