In [1]:
%pip install duckduckgo-search

StatementMeta(, 923191d3-1280-47c3-8482-e0734fef6bac, 7, Finished, Available, Finished)

Collecting duckduckgo-search
  Downloading duckduckgo_search-8.1.1-py3-none-any.whl.metadata (16 kB)
Collecting click>=8.1.8 (from duckduckgo-search)
  Downloading click-8.3.1-py3-none-any.whl.metadata (2.6 kB)
Collecting primp>=0.15.0 (from duckduckgo-search)
  Downloading primp-0.15.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting lxml>=5.3.0 (from duckduckgo-search)
  Downloading lxml-6.0.2-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (3.6 kB)
Downloading duckduckgo_search-8.1.1-py3-none-any.whl (18 kB)
Downloading click-8.3.1-py3-none-any.whl (108 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.3/108.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading lxml-6.0.2-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h

In [9]:
import json
import os
from datetime import datetime
from duckduckgo_search import DDGS

# --- CONFIGURATION ---
# 1. Who are we watching?
COMPETITORS = ["Microsoft Fabric", "Databricks", "Snowflake Data Cloud"]
# 2. Where do we save the data? (The Bronze Path)
# /lakehouse/default/Files/ is the magic path in Fabric
SAVE_PATH = "/lakehouse/default/Files/Bronze/Landing"

# --- THE FUNCTION ---
def fetch_news_data():
    print("Starting News Ingestion...")
    
    # Create the folder if it doesn't exist
    os.makedirs(SAVE_PATH, exist_ok=True)
    
    current_time = datetime.now().isoformat()
    batch_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    all_results = []
    
    # Initialize the Search Engine
    with DDGS() as ddgs:
        for comp in COMPETITORS:
            print(f"Searching for: {comp}...")
            # specific search for 'news' results
            results = list(ddgs.news(keywords=comp, max_results=5))
            
            # Add metadata (Crucial for Engineering)
            for r in results:
                r['competitor_tag'] = comp
                r['ingestion_time'] = current_time
                r['batch_id'] = batch_id
                all_results.extend(results)
    
    # --- SAVE TO JSON ---
    filename = f"news_batch_{batch_id}.json"
    full_path = f"{SAVE_PATH}/{filename}"
    
    with open(full_path, "w") as f:
        json.dump(all_results, f)
        
    print(f"SUCCESS: Saved {len(all_results)} articles to:")
    print(full_path)

# --- EXECUTE ---
fetch_news_data()

StatementMeta(, 923191d3-1280-47c3-8482-e0734fef6bac, 16, Finished, Available, Finished)

  with DDGS() as ddgs:


Searching for: Databricks...
Searching for: Snowflake Data Cloud...
SUCCESS: Saved 75 articles to:
/lakehouse/default/Files/Bronze/Landing/news_batch_20251207_084727.json


In [6]:
import pandas as pd

df = pd.read_json('/lakehouse/default/Files/Bronze/Landing/news_batch_20251207_081942.json')
df.head()

StatementMeta(, 923191d3-1280-47c3-8482-e0734fef6bac, 13, Finished, Available, Finished)

Unnamed: 0,date,title,body,url,image,source,competitor_tag,ingestion_time,batch_id
0,2025-12-02 19:15:00+00:00,SAS' leading decision intelligence capabilitie...,What it is: SAS Decision Builder on Microsoft ...,https://www.tmcnet.com/usubmit/2025/12/02/1029...,https://mma.prnewswire.com/media/2836749/SAS_D...,TMCnet,Microsoft Fabric,2025-12-07 08:19:42.479283,20251207081942
1,2025-12-04 07:03:44+00:00,From complexity to clarity - why Microsoft Fab...,Microsoft Fabric provides a strategic response...,https://www.msn.com/en-za/news/other/from-comp...,https://www.itweb.co.za/static/pictures/2025/1...,ITWeb,Microsoft Fabric,2025-12-07 08:19:42.479283,20251207081942
2,2025-05-14 00:01:00+00:00,Informatica Unveils New Innovations with Micro...,"REDWOOD CITY, Calif., May 14, 2025--(BUSINESS ...",https://finance.yahoo.com/news/informatica-unv...,,Yahoo Finance,Microsoft Fabric,2025-12-07 08:19:42.479283,20251207081942
3,2024-04-17 00:01:00+00:00,Microsoft Kills 30-Day Data Retention Policy f...,Microsoft this week announced several usabilit...,https://redmondmag.com/articles/2024/04/17/mic...,,Redmond Magazine,Microsoft Fabric,2025-12-07 08:19:42.479283,20251207081942
4,2025-11-18 16:16:00+00:00,Microsoft updates database portfolio and adds ...,Microsoft Corp. today is introducing updates a...,https://siliconangle.com/2025/11/18/microsoft-...,,SiliconANGLE,Microsoft Fabric,2025-12-07 08:19:42.479283,20251207081942
