# QUICK LOOK AT NEWS ARTICLES

In [1]:
import os
import pandas as pd
from pymongo import MongoClient

import sys
sys.path.append("../")

from src import config

In [4]:
def get_raw_data_from_mongo(output_filename=os.path.join("../", config.DATA_DIR, "raw", "raw.csv"))->str:
    """
    Get raw data from mongoDB
    :param output_filename: str
    :return: path
    """
    
    conn = MongoClient(config.CONNECTION_STRING)
    db = conn.get_database("NewsArticleDB")
    collection = db.get_collection("news_data")
    data = pd.DataFrame(list(collection.find())).drop("_id", axis=1)


    _columns = list(map(lambda x: x.lower(), data.columns))
    data.columns = _columns

    # remove all articles with no titles and no text body
    data = data[(~data['title'].isnull()) & (~data['text'].isnull())]

    # run quick text normalization
    data["category"] = data["category"].apply(lambda x: x.lower())
    data["subcategory"] = data["subcategory"].apply(lambda x: x.lower())

    # create artificial identifier
    data["article_id"] = list(range(data.shape[0]))
    data[["article_id"] + _columns].to_csv(output_filename, index=False)
    
    return output_filename


In [5]:
filename = get_raw_data_from_mongo()
df = pd.read_csv(filename, nrows=1000)

In [6]:
df.loc[0].to_dict()

{'article_id': 0,
 'article_id.1': 0,
 'category': 'asia media',
 'subcategory': 'asia - business & finance',
 'title': 'EY achieves highest growth in nearly two decades, reports record global revenue of US$45.4b',
 'published date': '2022-09-21 07:00:00',
 'text': 'US$3.2b invested in audit quality, innovation, technology and people in FY22 – part of a US$10b three-year commitment announced in FY21\n\nMore than 81m lives positively impacted through corporate responsibility program EY Ripples\n\nCarbon negative status reached in FY21, on track to achieve net-zero ambition in 2025\n\nLooking ahead, EY leaders have reached the decision to move forward with partner votes for the initiative to separate into two distinct, multidisciplinary organizations\n\nEY today announces combined global revenues of US$45.4b for the financial year ending June 2022 (FY22), an increase of 16.4% in local currency (13.7% in US dollars). This marks one of the most successful years in the history of the organi

In [5]:
df.loc[100].to_dict()

{'article_id': 100,
 'article_id.1': 100,
 'category': 'asia media',
 'subcategory': 'asia - computers & it business',
 'title': 'RETINA-AI Health, Inc. Closes $2.6M Bridge Round; Totaling $8.1M Raised to Build Galaxy™ Multi-Camera-Compatible Diabetic Retinopathy AI System. Starts Clinical Trial.',
 'published date': '2022-12-12 08:00:00',
 'text': 'HOUSTON, Dec. 12, 2022 /PRNewswire/ -- RETINA-AI Health, Inc. closes its bridge financing round during which it raised $2.6M, bringing the total raised so far by the company to $8.1M for the development of RETINA-AI Galaxy™ v2.0, a U.S. patent-protected multi-camera-compatible autonomous Artificial Intelligence (AI) diabetic retinopathy detection system. On Nov 3rd 2022, the company began enrollment in its Pivotal (Phase III) multi-center prospective clinical trial of the RETINA-AI Galaxy™ v2.0 (ClinicalTrials.gov ID: NCT05368623).\n\nRETINA-AI Health, Inc. closes its bridge financing round during which it raised $2.6M, bringing the total r