In [1]:
import pandas as pd
import data_preprocessing
from database_records import NewsArticles,ProcessedNewsArticle
from rule_based_sentimental_analysis import text_blob_sentiment, vader_sentiment
import pymongo
from mongoengine import connect, disconnect
import yaml
from utils import check_record_exist
from tqdm import tqdm

In [2]:
# Read the spreadsheet of article URLs that still need processing into a DataFrame.
missing_links = pd.read_excel(io="CNBC-Missing.xlsx")

In [3]:
# Sanity check: show the (rows, columns) of the loaded spreadsheet.
missing_links.shape

(8549, 1)

In [4]:
# Load the project settings. The encoding is given explicitly so the file is
# decoded the same way on every platform instead of depending on the locale
# default (which is not UTF-8 on some systems).
with open('config.yaml', encoding='utf-8') as f:
    config_dict = yaml.safe_load(f)

# Connection settings for the CNBC database; consumed by connect() below.
database_connection_params = config_dict['cnbc_database_details']

In [5]:
# Open the mongoengine connection using the settings read from the config file.
connect(
    db=database_connection_params['db_name'],
    username=database_connection_params['user_name'],
    password=database_connection_params['password'],
    host=database_connection_params['connection_string'],
)


MongoClient(host=['cluster1-shard-00-01.ngjps.mongodb.net:27017', 'cluster1-shard-00-00.ngjps.mongodb.net:27017', 'cluster1-shard-00-02.ngjps.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-aoug1h-shard-0', ssl=True, read_preference=Primary())

In [6]:
def process_database_records(document):
    """
    Clean one raw news article and store it as a ProcessedNewsArticle.

    The article text is run through the data_preprocessing pipeline, the
    remaining fields are copied over from the source document, named
    entities and rule-based sentiment scores are attached, and the new
    document is saved.

    Failures are reported with print() and never re-raised, so a single bad
    article does not abort a long batch run.

    :param document: source article record supporting ``document['field']``
        access (e.g. a NewsArticles document), or None when the lookup found
        no matching record; None is reported and skipped
    :return: None
    """
    if document is None:
        # Without this guard a failed lookup only shows up as an opaque
        # "'NoneType' object is not subscriptable" from the handler below.
        print("Skipping: no source document supplied (record not found)")
        return

    # Initialised before the try block so the error message can always
    # name the article, even if reading the URL itself is what fails.
    article_url = None
    try:
        article_url = document['article_url']

        # get the article text
        raw_article_text = document['article_text']

        # data pre-processing block
        article_text = data_preprocessing.remove_html_tags(raw_article_text)
        article_text = data_preprocessing.lower_text(article_text)
        article_text = data_preprocessing.remove_urls(article_text)
        article_text = data_preprocessing.remove_accented_chars(article_text)
        article_text = data_preprocessing.expand_contractions(article_text)
        article_text = data_preprocessing.remove_special_characters(article_text)
        article_text = data_preprocessing.remove_stopwords(article_text)
        article_text = data_preprocessing.stemming_text(article_text)

        # making document for processed news article
        current_article = ProcessedNewsArticle()

        current_article.cleaned_source_name = document['source_name']
        current_article.cleaned_article_title = document['article_title']

        # authors are joined into one comma-separated string before person extraction
        original_article_authors = ",".join(document['article_authors'])
        current_article.cleaned_article_authors = data_preprocessing.find_persons(original_article_authors)

        current_article.cleaned_article_published_date = document['article_published_date']
        current_article.cleaned_images_link = document['images_link']
        current_article.cleaned_video_link = document['video_link']
        current_article.cleaned_article_summary = document['article_summary']
        current_article.cleaned_article_url = article_url
        current_article.cleaned_article_keywords = document['article_keywords']

        current_article.cleaned_article_text = article_text

        # named entity recognition: runs on the raw text, not the cleaned text
        # (presumably because lower-casing/stemming would hurt entity detection)
        ner_results = data_preprocessing.named_entity_recognition(raw_article_text)
        current_article.cleaned_recognized_entity = ner_results

        # rule based sentiment analysis (computed on the cleaned text)
        current_article.text_blob_sentiment = text_blob_sentiment(article_text)
        current_article.vader_sentiment = vader_sentiment(article_text)

        current_article.save()

    except Exception as e:
        # Best-effort batch step: report which article failed and carry on.
        print(f"Failed to process article {article_url!r}: {e!r}")

In [7]:
# Re-process every article whose URL is listed in the spreadsheet.
unmatched_urls = []
for article_url in tqdm(missing_links['article_links'], total=missing_links.shape[0]):
    current_doc = NewsArticles.objects(article_url=article_url).first()
    if current_doc is None:
        # .first() yields None when no stored article has this URL; record it
        # instead of handing None to the processing function.
        unmatched_urls.append(article_url)
        continue
    process_database_records(current_doc)

print(f"{len(unmatched_urls)} of {missing_links.shape[0]} URLs had no matching NewsArticles record")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8549/8549 [52:59<00:00,  2.69it/s]


In [36]:
# Close the mongoengine connection now that the batch run has finished.
disconnect()

In [6]:
# Display the URL table for inspection.
# NOTE(review): the saved output under this cell lists nytimes.com URLs with row
# labels up to 40985, which does not match the (8549, 1) shape reported for
# CNBC-Missing.xlsx earlier — presumably stale output from a different run;
# re-run the notebook top to bottom to confirm.
missing_links

Unnamed: 0,article_links
0,https://www.nytimes.com/2021/08/07/sports/olym...
1,https://www.nytimes.com/2021/08/07/world/asia/...
2,https://www.nytimes.com/2021/08/07/sports/olym...
3,https://www.nytimes.com/article/tokyo-olympics...
4,https://www.nytimes.com/2021/08/07/world/ameri...
...,...
40982,https://www.nytimes.com/2021/05/17/world/europ...
40983,https://www.nytimes.com/2021/05/17/us/politics...
40984,https://www.nytimes.com/2021/05/17/world/asia/...
40985,https://www.nytimes.com/video/world/middleeast...
