In [13]:
pip install newspaper3k



In [14]:
from newspaper import Article

In [15]:
import os
import json
import time
import pandas as pd

# Directory containing the JSON files
json_dir = '/content/drive/MyDrive/vnp-project/raw-news'

# Initialize an empty list to store the data
data = []
failed_data = []
counter_positive = 0
counter_negative = 0

# Loop through all files in the directory
for filename in os.listdir(json_dir):
    if filename.endswith('.json'):
        # Extract the company name from the filename
        company_name = os.path.splitext(filename)[0].split('_')[0]

        # Full path to the JSON file
        file_path = os.path.join(json_dir, filename)

        # Read the JSON file
        with open(file_path, 'r') as file:
            json_data = json.load(file)

            # Extract the links from the JSON data
            for news_item in json_data:
                  title = news_item.get('title')
                  link = news_item.get('link')
                  source = news_item.get('source')
                  content = news_item.get('content')
                  datetime = news_item.get('datetime')
                  time_of = news_item.get('time')
                  article_type = news_item.get('articleType')
                  if content != "":
                      data.append(
                          {
                          'company': company_name,
                          'title': title,
                          'link': link,
                          'source': source,
                          'content': content,
                          'publish_date': datetime,
                          'time': time_of,
                          'article_type': article_type,
                          })
                      counter_positive += 1
                  else:
                    try:
                      article = Article(link)
                      article.download()
                      article.parse()
                      content = article.text
                      data.append(
                          {
                          'company': company_name.split('_')[0],
                          'title': title,
                          'link': link,
                          'source': source,
                          'content': content,
                          'publish_date': datetime,
                          'time': time_of,
                          'article_type': article_type,
                          })
                      counter_positive += 1
                      print(f'Reading for url: {link} succeeded')
                    except Exception as e:
                      failed_data.append(
                          {
                          'company': company_name.split('_')[0],
                          'title': title,
                          'link': link,
                          'source': source,
                          'content': "",
                          'publish_date': datetime,
                          'time': time_of,
                          'article_type': article_type,
                          })
                      counter_negative += 1
                      print(f'Reading for url: {link} failed. Reason: {e}')

print(f'{counter_positive} articles have content.')
print(f'{counter_negative} articles do not have content.')

# Create a DataFrame from the data
news_with_content = pd.DataFrame(data)
news_without_content = pd.DataFrame(failed_data)

Reading for url: https://www.reuters.com/business/autos-transportation/tesla-berlin-suspend-most-production-two-weeks-over-red-sea-supply-gap-2024-01-11/ failed. Reason: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/business/autos-transportation/tesla-berlin-suspend-most-production-two-weeks-over-red-sea-supply-gap-2024-01-11/ on URL https://www.reuters.com/business/autos-transportation/tesla-berlin-suspend-most-production-two-weeks-over-red-sea-supply-gap-2024-01-11/
Reading for url: https://www.cnn.com/2024/04/02/business/tesla-sales/index.html succeeded
Reading for url: https://apnews.com/article/tesla-recall-icons-too-small-997c381dcfade6cb246e51cd39627cf0 succeeded
Reading for url: https://news.samsung.com/global/samsung-announces-collaboration-with-tesla-at-ces-2024-for-smartthings-energy succeeded
Reading for url: https://www.forbes.com/sites/larrymagid/2024/03/21/review-of-2024-tesla-model-3-highland/ failed. Reason: Article 

In [20]:
pip install transformers



In [21]:
from transformers import pipeline

pipe = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [23]:
results = pipe("Every creative process begins with an idea—and that idea starts with you. Today we’re announcing that the Microsoft Designer app is now generally available with a personal Microsoft account, with new features that help you create and edit like never before. You can express yourself in brand new ways and bring your most creative ideas to life in seconds—all with the help of AI. A blank canvas no longer has to be intimidating—just describe what you want to see, and Designer can create it for you. And if you’re not sure what that is, Designer can help you get started.\nThe true power of Designer is the ability to access it whenever and wherever it’s most helpful to you in your daily life to keep you in your creative flow. Designer now integrates seamlessly with Microsoft products including Word and PowerPoint through Microsoft Copilot1 and Microsoft Photos2 to keep you in your flow when inspiration strikes. Designer is now supported in more than 80 languages on the web, available as a free mobile app, and as an app in Windows.\nUnleash your creativity—create and edit anything you can imagine with AI.\nUse Designer across many Microsoft apps and on the go wherever inspiration strikes\nDesigner is now available through Copilot across some of your favorite Microsoft 365 apps on web and PC apps to help you uplevel your slides and documents.1 With a Copilot Pro subscription, when you’re in Word and PowerPoint you can create images and designs right in the heart of your workflow. From Word or PowerPoint, click on the Copilot icon and describe an image you’d like to create. In Word, coming soon, you can even ask to create a banner for your document and a design will be generated for you based on the content of your document.\nDesigner is now available as a free mobile app (iOS and Android). It’s packed with AI-powered features to unlock your creativity on the go—including creating images and designs with words and editing images to make them pop.\nWe’re also bringing Designer’s generative AI editing and creation capabilities to more Microsoft apps where you edit your photos, starting with Microsoft Photos available to Windows Insiders today.2 Without leaving Photos, you can edit your photos to erase objects, remove backgrounds, auto crop, make adjustments, apply filters, markup, or even add text without leaving your flow. For more details check out the Windows Insider blog. In the future, similar capabilities will be rolling out to Microsoft Edge for convenient use right from the browser.\nNew Designer app features offer innovative ways to create\nWhether you use Designer in the mobile app or on the web, the experience starts with a new homepage—redesigned based on feedback we gathered from you during preview—to help you jump right into whatever you want to create or edit. We’re also introducing new ways to create and help you get even more from Designer, available now:\nEveryone has come up against the dreaded blank page. New prompt templates help jumpstart the creative process. These templates are pre-populated with ideas, styles, and descriptions that you can experiment with and customize, helping you get the hang of how to prompt. We are now rolling out prompt templates across more features to help you create with AI. When you’re ready, you can even share templates with friends or fellow creators and build on each other’s ideas, sparking inspiration across your creative community.\nYou can share ideas, thoughts, or phrases, and Designer will create custom stickers that help you stand out on places like messaging apps and social. You can also create emojis, clip art, wallpapers, monograms, avatars, and more—all starting with a simple description.\nMake the perfect greeting card. From birthday cards to holiday cards and beyond, create custom cards with personalized messages—even when you’re at a loss for words—by describing what you want to convey. Similarly, create personalized invitations for birthdays, graduations, anniversaries, and more, simply by describing what you want to see.\nTransform any photo into a work of art with Restyle image. Upload an image, choose from a set of styles, and write in any extra details you want to see to get a brand-new image created just for you.\nCreate custom image frames to turn your photos into shareable memories. With Frame image, upload an image and write a description or choose from a set of styles to get a personalized frame. For multiple images, bring your memories together with collages by selecting your photos, choosing from a set of styles, and adding a description to customize even more.\nWe are always looking for ways to improve and empower your creativity, and will be adding more features over time. Soon, we will be rolling out Replace background in preview in select markets. We look forward to your feedback.\nEasily replace the background of your still life photos like a pro. Upload your photo, explain the vision for your background, and AI will create it for you. Perfect for showing off your craft and hobby projects in a new light.\nToday, Designer comes with 15 free daily boosts that you can use to create or edit AI-powered images and designs faster. Boosts are automatically used whenever you’re creating or editing images or designs both in the Designer app and where Designer is integrated across Microsoft apps. You can upgrade to a Copilot Pro subscription to receive 100 boosts per day.\nAt Microsoft, we are focused on building tools that harness the incredible potential of generative AI while providing a safe experience for our users. We are committed to ensuring that our systems are used in a responsible and ethical manner. We have implemented a responsible AI process and taken actions to mitigate negative outcomes and further prevent misuse, including guardrails, threat monitoring, and abuse detection, and provenance technology, and we are continuously working to strengthen our safety systems to help create a safer environment. Read more here: Making our generative AI products safer for consumers.\nGet started with Microsoft Designer today to unleash your creativity and start designing and editing anything you can imagine with AI. If you can describe it, you can design it.\n1A Copilot Pro subscription unlocks the use of Copilot in Microsoft 365 apps like Word and PowerPoint.  Those who have a separate Microsoft 365 Personal or Family subscription get the added benefit of using Copilot in the more fully featured PC apps. Creating banners with Copilot in Word will be available soon.\n2Designer’s editing experience in Microsoft Photos is currently limited to Windows Insiders with language set to English, Spanish, French, German, Italian, or Portuguese (Brazil), and available in most countries. To get the latest Photos experience, update your app to version number 2024.11070.12001.0 or higher."[:512])
print(results)

[{'label': 'neutral', 'score': 0.9993695616722107}]


In [24]:
results = pipe("New ways to get creative with Microsoft Designer, powered by AI")
print(results)

[{'label': 'neutral', 'score': 0.9996452331542969}]


In [35]:
def split_string_into_chunks(text, chunk_size=500):
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunks.append(text[i:i + chunk_size])
    return chunks

In [43]:
content_results = []
for index, row in news_with_content.iterrows():
  content = row['content']
  results = pipe(content[:512])
  label = results[0]['label']
  score = results[0]['score']
  content_results.append(
      {
          'url': row['link'],
          'title': row['title'],
          'label': label,
          'score': score
      }
  )

df = pd.DataFrame(content_results)
df.to_csv('distilroberta-ffnsa-results.csv', index=False)

In [44]:
content_results = []
for index, row in news_with_content.iterrows():
  content = row['title']
  results = pipe(content[:512])
  label = results[0]['label']
  score = results[0]['score']
  content_results.append(
      {
          'url': row['link'],
          'title': row['title'],
          'label': label,
          'score': score
      }
  )

df = pd.DataFrame(content_results)
df.to_csv('distilroberta-ffnsa-headlines-results.csv', index=False)

In [45]:
from transformers import pipeline

pipe = pipeline("text-classification", model="chrommium/rubert-base-cased-sentence-finetuned-headlines_X")

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/530 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [47]:
content_results = []
for index, row in news_with_content.iterrows():
  content = row['title']
  results = pipe(content[:512])
  label = results[0]['label']
  score = results[0]['score']
  content_results.append(
      {
          'url': row['link'],
          'title': row['title'],
          'label': label,
          'score': score
      }
  )

df = pd.DataFrame(content_results)
df.to_csv('rubert-bcsfhx-headlines-results.csv', index=False)

In [52]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="shashanksrinath/News_Sentiment_Analysis")

config.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [54]:
content_results = []
for index, row in news_with_content.iterrows():
  content = row['content']
  results = pipe(content[:512])
  label = results[0]['label']
  score = results[0]['score']
  content_results.append(
      {
          'url': row['link'],
          'title': row['title'],
          'label': label,
          'score': score
      }
  )

df = pd.DataFrame(content_results)
df.to_csv('nsa-results.csv', index=False)

In [55]:
content_results = []
for index, row in news_with_content.iterrows():
  content = row['title']
  results = pipe(content[:512])
  label = results[0]['label']
  score = results[0]['score']
  content_results.append(
      {
          'url': row['link'],
          'title': row['title'],
          'label': label,
          'score': score
      }
  )

df = pd.DataFrame(content_results)
df.to_csv('nsa-headlines-results.csv', index=False)

In [64]:
from transformers import pipeline

pipe = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [66]:
content_results = []
for index, row in news_with_content.iterrows():
  content = row['content']
  results = pipe(content[:512])
  label = results[0]['label']
  score = results[0]['score']
  content_results.append(
      {
          'url': row['link'],
          'title': row['title'],
          'label': label,
          'score': score
      }
  )

df = pd.DataFrame(content_results)
df.to_csv('distilbert-bu-results.csv', index=False)

In [68]:
content_results = []
for index, row in news_with_content.iterrows():
  content = row['title']
  results = pipe(content[:512])
  label = results[0]['label']
  score = results[0]['score']
  content_results.append(
      {
          'url': row['link'],
          'title': row['title'],
          'label': label,
          'score': score
      }
  )

df = pd.DataFrame(content_results)
df.to_csv('distilbert-bu-headlines-results.csv', index=False)