In [6]:
pip install newspaper3k



In [7]:
from newspaper import Article

In [8]:
import os
import json
import time
import pandas as pd

# Directory containing the JSON files (one or more per company; the company
# name is the part of the file name before the first underscore).
json_dir = '/content/drive/MyDrive/vnp-project/raw-news'


def _is_missing(value):
    """Return True when ``value`` carries no usable article text.

    ``None`` (key absent or JSON null) and empty / whitespace-only strings
    count as missing. Any other value is considered present and kept as-is.
    """
    return value is None or (isinstance(value, str) and not value.strip())


def _build_record(company, news_item, content):
    """Build one output row for ``news_item`` using the supplied ``content``.

    All other fields are copied from ``news_item`` with ``.get`` so that
    absent keys become ``None`` instead of raising.
    """
    return {
        'company': company,
        'title': news_item.get('title'),
        'link': news_item.get('link'),
        'source': news_item.get('source'),
        'content': content,
        'publish_date': news_item.get('datetime'),
        'time': news_item.get('time'),
        'article_type': news_item.get('articleType'),
    }


def _download_article_text(link):
    """Download and parse ``link`` with newspaper and return the article text.

    Raises:
        ValueError: the page was fetched and parsed but no text was extracted.
        Exception: whatever ``Article.download()`` / ``Article.parse()`` raise
            (e.g. HTTP 401/403 from sites that block scrapers).
    """
    article = Article(link)
    article.download()
    article.parse()
    if _is_missing(article.text):
        # An empty extraction is a failure, not a success with empty content.
        raise ValueError('article was parsed but no text could be extracted')
    return article.text


# Rows that ended up with content, and rows whose content could not be obtained.
data = []
failed_data = []
counter_positive = 0
counter_negative = 0

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# row order (and therefore the CSV index) stable between runs.
for filename in sorted(os.listdir(json_dir)):
    if not filename.endswith('.json'):
        continue

    # Extract the company name from the filename
    company_name = os.path.splitext(filename)[0].split('_')[0]

    # Read the whole file first so the handle is closed before any (slow)
    # network download starts.
    file_path = os.path.join(json_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    for news_item in json_data:
        link = news_item.get('link')
        content = news_item.get('content')

        # Content already present in the dump: keep it untouched.
        if not _is_missing(content):
            data.append(_build_record(company_name, news_item, content))
            counter_positive += 1
            continue

        # No content in the dump: try to scrape it from the article URL.
        try:
            content = _download_article_text(link)
        except Exception as e:
            # Best-effort scraping: record the failure and keep going.
            failed_data.append(_build_record(company_name, news_item, ""))
            counter_negative += 1
            print(f'Reading for url: {link} failed. Reason: {e}')
        else:
            data.append(_build_record(company_name, news_item, content))
            counter_positive += 1
            print(f'Reading for url: {link} succeeded')

print(f'{counter_positive} articles have content.')
print(f'{counter_negative} articles do not have content.')

# Create a DataFrame from the data
news_with_content = pd.DataFrame(data)
news_without_content = pd.DataFrame(failed_data)

Reading for url: https://www.reuters.com/business/autos-transportation/tesla-berlin-suspend-most-production-two-weeks-over-red-sea-supply-gap-2024-01-11/ failed. Reason: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/business/autos-transportation/tesla-berlin-suspend-most-production-two-weeks-over-red-sea-supply-gap-2024-01-11/ on URL https://www.reuters.com/business/autos-transportation/tesla-berlin-suspend-most-production-two-weeks-over-red-sea-supply-gap-2024-01-11/
Reading for url: https://www.cnn.com/2024/04/02/business/tesla-sales/index.html succeeded
Reading for url: https://apnews.com/article/tesla-recall-icons-too-small-997c381dcfade6cb246e51cd39627cf0 succeeded
Reading for url: https://news.samsung.com/global/samsung-announces-collaboration-with-tesla-at-ces-2024-for-smartthings-energy succeeded
Reading for url: https://www.forbes.com/sites/larrymagid/2024/03/21/review-of-2024-tesla-model-3-highland/ failed. Reason: Article 

In [9]:
# Write both result sets to Drive. The target folder is created up front so
# the first run does not fail on a missing "csv_files" directory.
csv_dir = "/content/drive/MyDrive/vnp-project/raw-news/csv_files"
os.makedirs(csv_dir, exist_ok=True)
news_with_content.to_csv(os.path.join(csv_dir, "news_with_content.csv"), index = True)
news_without_content.to_csv(os.path.join(csv_dir, "news_without_content.csv"), index = True)