<a href="https://colab.research.google.com/github/shekharkoirala/60daysofUdacity/blob/master/notebooks/day2/scrap_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This Notebook includes scrapping process , finding images , summary , keywords, from news sites. Since, these websites are different in structure, two different class parser are created. 

#### !pip3 install newspaper3k #similarly, we could install other libraries if needed.

In [0]:
import time
from datetime import datetime
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from newspaper import Article

import unicodedata
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords

## for https://www.nsenergybusiness.com/power/ , attribute "depth" calculate how more pages we will scrap.

In [0]:
class ParseyA():
    def __init__(self, url):
        self.url = None
        self.u_url = url
        self.depth = 100
        self.df = pd.DataFrame(columns=['link', 'tags', 'title'])
        self.html_soup = None
        self.g_article = None
        self.error_data = None
        self.use_error_data = 0
        self.time_ = 0
    
    def create_soup(self):
        """
        load webpage in beautiful soup
        """
        response = get(self.url)
        html_soup = BeautifulSoup(response.text, 'html.parser')
        return html_soup
    
    def create_g_article(self):
        """
        send list of beautiful tags , grid articles
        """
        return self.html_soup.findAll("div",attrs={"class":"grid-x","id":"mian-article"})
    
    def print_soup(self):
        print(self.html_soup)
        
    def locate_error_data(self):
        """
        fix naming of classes
        
        fetch lost tag of selected news.
        """
        error_tag = self.html_soup.findAll("p",attrs={"class":"tags project-tag"})
        return {str(idx):[span_tag.text for idx, span_tag in enumerate(error_entries.find_all('span', recursive=False))]
                for idx, error_entries in enumerate(error_tag)}

    def get_text(self,b_tag):
        """
        b_tag : beautiful soup tag format
        Returns
        text =  Output_text , string format
        """
        first_search = b_tag.find("a").text
        if not first_search:
            return b_tag.find("img")['alt']
        return first_search

    def get_tags(self,b_tag):
        """
        b_tag : beautiful soup tag format
        Returns
        text =  tags in webpage , list format
        """
        first_search = b_tag.findAll('p', class_="tags")
        if first_search:
            return [tag.text for tag in first_search[0].findAll('span', recursive=False)]
        else:
            tag = self.error_data.get(str(self.use_error_data), [])
            self.use_error_data += 1
            return tag
        return []

    def get_dataFrame(self):
        """Dataframe for a scarped webpage"""
        data = [{"link":article.select('div[class*="cell "]')[0].find("a")['href'],
                  "title":self.get_text(article.select('div[class*="cell "]')[0]),
                  "tags":self.get_tags(article)}
                  for article in self.g_article]
        return pd.DataFrame(data)
    
    def show_depth(self):
        """ show how many pages of data we can scrap"""
        strings = html_soup.select('div[class*="nav-"]')[0].findAll('a', class_="page-numbers")[-1]['href'].split('/')
        return [x for x in strings if x][-1]
    
    def parse(self):
        for i in range(1, self.depth):
            self.url = self.u_url+ "page/"+ str(i) + "/"
            self.html_soup = self.create_soup()
            self.g_article = self.create_g_article()
            self.error_data = self.locate_error_data()
            self.use_error_data = 0
            temp_df = self.get_dataFrame()
            self.df = pd.concat([self.df,temp_df],ignore_index=True)
            
    def get_data(self):
        "return the dataframe"
        return self.df
    
    def extract_post(self, article_url):
        """Using Newspaper3k library , get insights
        Scrapping is used to get links of news, where as all text of news 
        are scrapped using newspaper3k
        """
        time.sleep(.900)
        article = Article(self.format_url(article_url))
        article.download()
        article.parse()
        article.nlp()
        if not article.publish_date:
            date_ = self.format_date(article)
        else:
            date_ = article.publish_date
        return article.top_image, article.text, article.summary, article.keywords, date_
    
    def format_url(self,url):
        if url[-1] == "/":
            return url[:-1]
        return url
    
    def format_date(self,article):
        _soup = BeautifulSoup(article.html, 'html.parser')
        if _soup.find("span", id="date"):
            return datetime.strptime(_soup.find("span", id="date").text.strip(), '%d %b %Y')
        return ''
    
    def format_df(self):
        " combining scrapping and newspaper3k insights"
        data= [self.extract_post(row['link']) for idx, row in self.df.iterrows()]
        self.df = pd.concat([self.df,pd.DataFrame(data, columns= ['page_image', "text", "summary", "keyword", "date"])], axis=1)
        self.df['date'] = self.df['date'].fillna(method='bfill')
        return self.df
    
    def extract_post_raw(self, article_url):
        """without Using Newspaper3k library , get insights
        Scrapping is used to get links of news, where as all text of news 
        are scrapped using raw beautiful soup and requests module
        """
        response = get(self.format_url(article_url))
        _soup = BeautifulSoup(response.text, 'html.parser')
        if not _soup.findAll('img', class_="img_caption"):
            image = ""
        else:
            image = _soup.findAll('img', class_="img_caption")[0]['src']
        
        table = _soup.findAll('div',attrs={"class":"cell small-12 medium-12 large-10"})
        if not table:
            data = ""
            summary = ""
            if _soup.findAll('div',attrs={"class":"cell large-10"}):
                table = _soup.findAll('div',attrs={"class":"cell large-10"})
                data = unicodedata.normalize("NFKD", "".join([y.text for x in table for y in x.findAll('p')]))
                summary = summarize(data)
        else:
            data = unicodedata.normalize("NFKD", "".join([y.text for x in table for y in x.findAll('p')]))
            summary = summarize(data)
        
        if not _soup.find("span", id="date"):
            date_ = ""
        else:
            date_ = datetime.strptime(_soup.find("span", id="date").text.strip(), '%d %b %Y')

        return image, data, summary, date_

    def format_df_raw(self):
        " combining scrapping without using newspaper3k insights"
        data= [self.extract_post_raw(row['link']) for idx, row in self.df.iterrows()]
        self.df = pd.concat([self.df,pd.DataFrame(data, columns= ['page_image', "text", "summary", "date"])], axis=1)
        self.df['date'] = self.df['date'].fillna(method='bfill')
        return self.df

parserA = ParseyA("https://www.nsenergybusiness.com/power/")

In [0]:
parserA.parse()
# df1 = parserA.format_df() # this work pretty well , but some glitches.


# Due to local internet connectivity, or the website service. Sometimes the data scraped got multiple redirects scrapping in Google Collab looks a feasible smart idea. https://colab.research.google.com/drive/1uGUbH9YIjzLyCT4j2uGpAgYt9yp9QZpg

In [0]:
df1 = parserA.format_df_raw()  # raw method, without using newspaper3k, with faster result

# for . https://www.esi-africa.com/category/news/

In [0]:
class ParseyB():
    def __init__(self, url):
        self.url = None
        self.u_url = url
        self.depth = 100     # change for how much news page we want to scrap
        self.df = pd.DataFrame(columns=['link', 'image', 'title'])
        self.html_soup = None
        self.g_article = None
    
    def create_soup(self):
        """
        load webpage in beautiful soup
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        result = get(self.url, headers=headers)
        html_soup = BeautifulSoup(result.text, 'html.parser')
        return html_soup
    
    def create_g_article(self):
        """
        send list of beautiful tags , grid articles
        """
        return self.html_soup.findAll('div', class_="td-module-thumb")
    
    def print_soup(self):
        print(self.html_soup)

    def get_dataFrame(self):
        data = [{"image" : article.findAll('img')[0]['src'],
                  "link"  : article.find('a')['href'],
                  "title" : article.find('a')['title']}for article in self.g_article]
        return pd.DataFrame(data)
    
    def show_depth(self):
        """ show how many pages of data we can scrap"""
        strings = html_soup.select('div[class*="nav"]')[0].findAll('a', class_="last")[0]['href'].split("/")
        return [x for x in strings if x][-1]
    
    def parse(self):
        for i in range(1, self.depth):
            self.url = self.u_url+ "page/"+ str(i) + "/"
            self.html_soup = self.create_soup()
            self.g_article = self.create_g_article()
            temp_df = self.get_dataFrame()
            self.df = pd.concat([self.df,temp_df],ignore_index=True)
            
    def get_data(self):
        return self.df
    
    def extract_post(self, article_url):
        """Using Newspaper3k library , get insights
        Scrapping is used to get links of news, where as all text of news 
        are scrapped using newspaper3k
        """
        time.sleep(.500)
        article = Article(article_url)
        article.download()
        article.parse()
        article.nlp()
        return article.top_image, article.text, article.summary, article.keywords, article.publish_date
    
    def format_df(self):
        " combining scrapping and newspaper3k insights"
        data= [self.extract_post(row['link']) for idx, row in self.df.iterrows()]
        self.df = pd.concat([self.df,pd.DataFrame(data, columns= ['page_image', "text", "summary", "keyword", "date"])], axis=1)
        return self.df
        
parserB = ParseyB("https://www.esi-africa.com/category/news/")

In [0]:
t = time.process_time()
parserB.parse()
elapsed_time = time.process_time() - t
print("Time to parse : ", elapsed_time,  "sec")
t = time.process_time()
df2 = parserB.format_df()
elapsed_time = time.process_time() - t
print("Time to get dataframe : ", elapsed_time,  "sec")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Time to parse :  7.067197495999949 sec
Time to get dataframe :  266.07974973999995 sec


# Two csv are generated from this Notebook

In [0]:
df2.to_csv("esiafrica_1980.csv", encoding='utf-8', index=False)

In [0]:
df1.to_csv("nsenergy_990.csv", encoding='utf-8', index=False)

In [0]:
df1.head(2)

Unnamed: 0,link,tags,title,page_image,text,summary,date
0,https://www.nsenergybusiness.com/news/sharp-bu...,"[Power, Solar, PV]",Sharp builds solar plant near New Ulaanbaatar ...,https://www.nsenergybusiness.com/wp-content/up...,Sharp Energy Solutions Corporation (SESJ) comp...,Sharp Energy Solutions Corporation (SESJ) comp...,2019-06-14
1,https://www.nsenergybusiness.com/news/jolywood...,"[Power, Solar, PV]",Jolywood signs agreement to supplyBifacial Mod...,https://www.nsenergybusiness.com/wp-content/up...,"Jolywood, a Chinese manufacturer of bifacial s...","Jolywood, a Chinese manufacturer of bifacial s...",2019-06-14
