# Implementing an ETL Workflow Using Pandas for Web Scraping: Top-5 News Headlines and Sentiment Analysis with VADER NLP Model.

In [1]:
# Importing all required libraries and modules

import os
import requests
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


# news_scraping() sends a GET request to fetch the raw HTML of the web page that lists the top-5 news headlines
# and returns that HTML as text.

def news_scraping(url=r'https://www.firstpost.com/category/india', timeout=30):
    """Download the raw HTML of the news listing page.

    Args:
        url: Page to fetch. Defaults to the Firstpost India category page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block this call (and the
    # scheduler loop that drives it) indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error rather than passing an error page
    # on to the HTML parser.
    response.raise_for_status()
    return response.text


#data=news_scraping()  check html page extracted from above function.

# beautifulsoup_extract_element() calls news_scraping(),
# builds the soup and finds the tags of the top-5 news items under the CSS class story-list-ul

def beautifulsoup_extract_element():
    """Parse the scraped page into a DataFrame of news items.

    Calls news_scraping() for the HTML, takes the first
    ``<ul class="story-list-ul">`` element and reads every ``<a>`` inside it:
    the ``<h3>`` text is the headline, the ``<h4>`` text is the short summary
    and ``href`` is the article link.

    Returns:
        pandas.DataFrame with the columns ``title``, ``short_summ``,
        ``News_Link`` and ``News_time`` (the same run timestamp on every
        row). The frame is empty, with the same columns, when the story
        list is not present in the page or contains no usable anchors.
    """
    columns = ["title", "short_summ", "News_Link", "News_time"]

    html_raw = news_scraping()
    soup = bs(html_raw, 'html.parser')
    story_lists = soup.find_all('ul', class_="story-list-ul")
    if not story_lists:
        # Page layout changed or an unexpected page was served: return an
        # empty frame instead of failing with an IndexError.
        return pd.DataFrame(columns=columns)

    # Runtime timestamp (locale date plus hour:minute) shared by all rows of this run.
    runtime = datetime.now().strftime("%x %H:%M")

    rows = []
    for anchor in story_lists[0].find_all('a'):
        headline = anchor.h3
        summary = anchor.h4
        if headline is None or summary is None:
            # Skip links that do not carry both a headline and a summary
            # rather than failing on a missing tag.
            continue
        rows.append({
            "title": headline.text.strip(),      # news headline, whitespace cleaned
            "short_summ": summary.text.strip(),  # short summary of the news
            "News_Link": anchor.get('href'),     # link to the article page
            "News_time": runtime,
        })

    # Passing columns explicitly keeps the schema stable even when rows is empty.
    return pd.DataFrame(rows, columns=columns)

# For sentiment analysis, the VADER NLP model is used to gauge the mood of each news article.
# Sentiment polarity is computed on the short summary of each of the 5 news articles on every run.
# Based on the compound score, a condition tags each news item as Neutral, Positive or Negative.

def sentiment_analyzer():
    """Score each news summary with VADER and tag it with a mood.

    Calls beautifulsoup_extract_element() for the news frame, computes the
    VADER polarity scores of every ``short_summ`` value and merges them back
    onto the news frame by index.

    Returns:
        pandas.DataFrame holding the news columns plus ``ids``, ``pos``,
        ``neu``, ``neg``, ``compound`` (multiplied by 100) and ``Mood``
        ("Negative", "Neutral" or "Positive").
    """
    f = beautifulsoup_extract_element()
    sia = SentimentIntensityAnalyzer()

    # One polarity dict (neg / neu / pos / compound) per news summary.
    scores = [sia.polarity_scores(summary) for summary in f['short_summ']]

    # The news row index is kept as 'ids' so the scores can be mapped back to
    # the rows of the news frame.
    sentiment_frame = pd.DataFrame(
        {
            'ids': list(f.index),
            'pos': [score['pos'] for score in scores],
            'neu': [score['neu'] for score in scores],
            'neg': [score['neg'] for score in scores],
            'compound': [score['compound'] for score in scores],
        },
        columns=['ids', 'pos', 'neu', 'neg', 'compound'],
    )

    final_frame = pd.merge(f, sentiment_frame, left_index=True, right_index=True)  # merged on index
    final_frame['compound'] = final_frame['compound'] * 100.0

    def sentiment_key(x):
        """Map a scaled compound score to a mood label."""
        if x < 0:
            return "Negative"
        # A score of exactly 0 carries no sentiment, so it belongs with the
        # neutral band (it previously fell through to "Positive").
        if x < 50:
            return "Neutral"
        return "Positive"

    final_frame['Mood'] = final_frame['compound'].apply(sentiment_key)
    return final_frame  # table with news attributes + mood
    
def csv_loader(csv_path="News.csv"):
    """Append the latest scored news rows to a CSV file.

    Calls sentiment_analyzer(), drops the intermediate score columns
    (``pos``, ``neu``, ``neg``, ``ids``) and appends the remaining columns to
    ``csv_path``. The header row is written only when the file does not exist
    yet or is empty.

    Args:
        csv_path: Destination CSV file. Defaults to ``News.csv`` in the
            current working directory.

    Returns:
        0 after the rows have been written.
    """
    data = sentiment_analyzer()
    ff = data.drop(columns=["pos", "neu", "neg", "ids"])  # dropping unnecessary columns from final frame
    print("Appending Top-5 News to CSV file...")

    # Decide on the header from the file that is actually written. Checking a
    # different, hard-coded location would either truncate the existing data
    # on every run or start a new file without a header.
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    ff.to_csv(csv_path, mode='a', index=False, header=needs_header)

    return 0



In [2]:
# The HTML page of top-5 news refreshes roughly every 30 minutes, so the schedule library is used to run the above functions as an ETL job every 30 minutes.

import time
import schedule


def job():
    """Run one ETL cycle and print progress messages.

    The first three stage messages are printed (with a two-second pause
    after each of the first two) before csv_loader() is called; csv_loader()
    is what actually runs the scrape, parse, sentiment and CSV steps.
    """
    paused_stages = ("news_scraping", "beautifulsoup_extract_element")
    for stage in paused_stages:
        print(stage + " Completed\n\n")
        time.sleep(2)

    print("sentiment_analyzer Completed\n\n")
    csv_loader()
    print("csv loader Completed")

# Register job() with the schedule library to run every 30 minutes.
schedule.every(30).minutes.do(job)

# Poll the scheduler once per second; run_pending() is asked to run whatever
# is due. The loop has no exit condition of its own (the recorded run below
# was stopped with a KeyboardInterrupt).
while True:
    schedule.run_pending()
    time.sleep(1)



news_scraping Completed


beautifulsoup_extract_element Completed


sentiment_analyzer Completed


Appending Top-5 News to CSV file...
csv loader Completed
news_scraping Completed


beautifulsoup_extract_element Completed


sentiment_analyzer Completed


Appending Top-5 News to CSV file...
csv loader Completed
news_scraping Completed


beautifulsoup_extract_element Completed


sentiment_analyzer Completed


Appending Top-5 News to CSV file...
csv loader Completed
news_scraping Completed


beautifulsoup_extract_element Completed


sentiment_analyzer Completed


Appending Top-5 News to CSV file...
csv loader Completed
news_scraping Completed


beautifulsoup_extract_element Completed


sentiment_analyzer Completed


Appending Top-5 News to CSV file...
csv loader Completed
news_scraping Completed


beautifulsoup_extract_element Completed


sentiment_analyzer Completed


Appending Top-5 News to CSV file...
csv loader Completed
news_scraping Completed


beautifulsoup_extract_element Complete

KeyboardInterrupt: 