# News Parse prototyping

for sentiment analysis of a specified stock we need to 

In [2]:
import re
from typing import List, NoReturn
from bs4 import BeautifulSoup
from requests import get

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stopwords = stopwords.words("english")


def get_page(url_string: str) -> BeautifulSoup:
    """Scrape the page specified by the input url

    Args:
        url_string (str): input url

    Returns:
        BeautifulSoup: return the page obtained
    """

    response = get(url_string)
    return BeautifulSoup(response.text, "html.parser")


def clean_article(article: str) -> List[str]:
    """Clean the article by removing punctuations and stop words

    Args:
        article (str): article content in string format

    Returns:
        List[str]: list of words in the article
    """

    article = article.replace("/(\n)/gm", " ")
    article = re.sub("[.,!?:;%&$^*@#)/(-" '`"—=+]', " ", article)
    article = re.sub("[0-9]", " ", article)
    article = article.replace("`|’|”|“", "'")
    article = article.replace("/(\\x)/g", "")
    new_stop_words = [
        "said",
        "also",
        "per",
        "cent",
        "would",
        "last",
        "first",
        "like",
        "'",
        '"',
        "'",
        '"',
        "’",
        "'s",
        "“",
        "”",
    ]
    stopwords.extend(new_stop_words)
    clean = [word for word in word_tokenize(article) if not word in stopwords]
    return clean


def lower_case(article: List[str]) -> List[str]:
    """Convert all characters of the words in the article to lowercase

    Args:
        article (List[str]): words present in the article

    Returns:
        List[str]: words in the article converted to lowercase
    """
    lower_case_list = [sentence.lower() for sentence in article]
    return lower_case_list


def sort_dictionary(dictionary: dict) -> dict:
    """Sort the input dictionary

    Args:
        dictionary (dict): input dictionary

    Returns:
        dict: sorted dictionary
    """
    sorted_dict = dict(sorted(dictionary.items(), key=lambda value: value[1]))
    return dict(reversed(list(sorted_dict.items())))


def export_as_csv(dictionary: dict) -> NoReturn:
    """Export dictionary as csv file

    Args:
        dictionary (dict): input dictionary

    Returns:
        NoReturn: Export the csv file to data folder in project root directory
    """
    export = {"word": list(dictionary.keys()), "frequency": list(dictionary.values())}
    export_dataframe = pd.DataFrame(export)
    export_dataframe.to_csv("../data/word_frequency.csv")


def update_frequency(element: str, dictionary: dict) -> dict:
    """Update frequency of each word in the dictionary

    Args:
        element (str): word to be updated
        dictionary (dict): the dictionary containing words and their frequency count

    Returns:
        dict: updated dictionary
    """
    if element in dictionary:
        dictionary[element] += 1
    else:
        dictionary.update({element: 1})
    return dictionary


def get_word_frequency(news_list: list) -> NoReturn:
    """Find the frequency of words in the article

    Args:
        news_list (list): list of articles parsed from the webpage

    Returns:
        NoReturn: export word frequncy as a csv file
    """
    word_frequency = {}
    for news in news_list:
        news = news.replace("/\r?\n|\r/g", " ")

    for news in news_list:
        cleaned_article = clean_article(news)
        cleaned_article = lower_case(cleaned_article)

        for word in cleaned_article:
            if len(word) > 2:
                word_frequency = update_frequency(word, word_frequency)

    word_frequency_sorted = sort_dictionary(dict(word_frequency))
    export_as_csv(word_frequency_sorted)


## Testing to parse moneycontrol website

In [3]:
# Setting the url string for a particular stock

URL = "https://www.moneycontrol.com/news/tags/"
stock_name = "Aditya Birla"
resultant_url = URL + stock_name.replace(" ", "-") + ".html"
resultant_url


'https://www.moneycontrol.com/news/tags/Aditya-Birla.html'

Get the value of number of pages of news a particular stock has

In [4]:
number_of_pages = int(get_page(resultant_url).find_all('a', class_="last")[-1]['data-page'])

Get links of all pages of news of specific stock

In [6]:
page_links = []
for i in range(1, number_of_pages + 1):
    page_links.append(resultant_url + "/page-" + str(i) + "/")
page_links

['https://www.moneycontrol.com/news/tags/Aditya-Birla.html/page-1/',
 'https://www.moneycontrol.com/news/tags/Aditya-Birla.html/page-2/',
 'https://www.moneycontrol.com/news/tags/Aditya-Birla.html/page-3/',
 'https://www.moneycontrol.com/news/tags/Aditya-Birla.html/page-4/']

In [24]:
headline_list = []
for link in page_links:
    article_list = get_page(link).find('ul', id="cagetory").find_all('a')
    for article_block in article_list:
        headline_list.append(article_block.text) 
    headline_list = list(filter(('').__ne__, headline_list))
headline_list

['Aditya Birla Fashion and Retail to buy 51% stake in House of Masaba Lifestyle',
 'Aditya Birla, Kotak backed ARCs among 12 interested in Rs 1,597 crore SBI loan to Coastal Energen',
 'Birla favours insolvency for Vodafone Idea if situation worsens: Report',
 '3 Point Analysis | Is Jaypore and TG Apparel a good buy for ABFRL?',
 'Grasim Industries to acquire Soktas India for Rs 165 crore',
 "CCI clears Amazon-Witzig bid to acquire Aditya Birla's retail store chain",
 'Aditya Birla group plans $5-bn capex over next three years',
 'Kumar Mangalam Birla scouts for acquisition targets in US, Europe and India',
 'Podcast | The business of family: Building with the Birlas',
 'AB Group may sell up to 20% stake in Idea Payments Bank for Rs 200 crore',
 'Prefer Trent over Shoppers Stop, says Abhimanyu Sofat',
 'Vodafone, Idea likely to seal merger pact within a month',
 'Sensex ends rangebound session flat ahead of Budget; Idea up 25%',
 'Vodafone says in Indian merger talks with Idea Cellular

Scrape the news from page

In [25]:
article_links = []
for link in page_links:
    page = get_page(link)
    article_tabs = page.find('div', class_='fleft').find_all('li', class_='clearfix')
    article_links = [tab.a.get('href') for tab in article_tabs]

In [26]:
article_links

['https://www.moneycontrol.com/news/business/stocks/-2124863.html',
 'https://www.moneycontrol.com/news/business/stocks/-2123813.html',
 'https://www.moneycontrol.com/news/business/companies/-2144379.html',
 'https://www.moneycontrol.com/news/business/companies/-2146391.html',
 'https://www.moneycontrol.com/news/business/companies/-2031737.html',
 'https://www.moneycontrol.com/news/business/earnings/-2112227.html',
 'https://www.moneycontrol.com/news/world/-1096223.html',
 'https://www.moneycontrol.com/news/business/companies/-1099967.html',
 'https://www.moneycontrol.com/news/business/earnings/-1261867.html']

NameError: name 'lower_case_title' is not defined