In [None]:
import subprocess
import sys
import json
import os
import random
from datetime import datetime, timedelta

# Install duckduckgo_search on demand. The install stays quiet when it succeeds,
# but pip's output is captured rather than discarded so that a failed install
# can be diagnosed from the cell output.
try:
    import duckduckgo_search
except ImportError:
    import importlib

    try:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "duckduckgo_search", "-q"],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as install_error:
        # Show pip's own message, then re-raise the same exception type that
        # check_call would have raised so existing failure handling still applies.
        print(install_error.stderr or install_error.stdout, file=sys.stderr)
        raise

    # The package was installed after the interpreter started; refresh the
    # import system's finder caches so the following import can see it.
    importlib.invalidate_caches()

from duckduckgo_search import DDGS

# --- CONFIGURATION ---
# Landing directory for generated batch files. The main execution step creates
# it if it does not exist and writes one JSON file per run into it.
SAVE_PATH = "/lakehouse/default/Files/Bronze/Landing"

# --- SYNTHETIC DATA GENERATOR ---
def generate_synthetic_news(n_articles=15, seed=None):
    """Build a list of fake competitor-news articles for pipeline testing.

    Args:
        n_articles: Number of articles to generate. Defaults to 15, the batch
            size this function always produced before the parameter existed.
        seed: Optional seed. When given, the random picks come from a private
            ``random.Random(seed)`` so competitor/topic/source choices are
            repeatable. When ``None`` the module-level ``random`` generator is
            used. Timestamps are always relative to the current clock.

    Returns:
        list[dict]: One dict per article with the keys ``title``, ``url``,
        ``snippet``, ``source``, ``date``, ``competitor_tag`` and
        ``ingestion_time``. ``date`` and ``ingestion_time`` are ISO-8601
        strings produced from ``datetime.now()``.
    """
    competitors = ["Microsoft Fabric", "Databricks", "Snowflake", "Google Cloud", "AWS"]
    topics = ["released new AI features", "reported quarterly earnings", "suffered minor outage",
              "announced partnership with OpenAI", "launched new data lake tool"]
    sources = ["TechCrunch", "TheVerge", "Bloomberg", "DataEngineeringWeekly", "CNBC"]

    rng = random.Random(seed) if seed is not None else random

    # Read the clock once so every record in the batch shares the same
    # ingestion_time and all article dates are offsets from the same instant.
    now = datetime.now()
    ingestion_time = now.isoformat()

    data = []
    for i in range(n_articles):
        comp = rng.choice(competitors)
        topic = rng.choice(topics)
        # Multi-word names ("Microsoft Fabric") would otherwise put a raw
        # space into the URL path; hyphenate them to keep the URL well-formed.
        slug = comp.lower().replace(" ", "-")

        article = {
            "title": f"{comp} has {topic}",
            "url": f"https://fake-news-source.com/{slug}-{i}",
            "snippet": f"Breaking news: {comp} has just {topic}. Analysts are watching closely.",
            "source": rng.choice(sources),
            "date": (now - timedelta(hours=rng.randint(1, 48))).isoformat(),
            "competitor_tag": comp,
            "ingestion_time": ingestion_time
        }
        data.append(article)

    return data

# --- MAIN EXECUTION ---
# Generate one batch, write it to the landing folder, and hand a JSON status
# payload back to the calling pipeline.
#
# notebook.exit() is called exactly once, AFTER the try/except. Fabric documents
# that exit() stops the run by throwing an exception; calling it inside the try
# lets `except Exception` intercept a successful exit and replace it with a
# "failed" payload.
try:
    news_data = generate_synthetic_news()
    batch_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"news_batch_{batch_id}.json"

    os.makedirs(SAVE_PATH, exist_ok=True)
    full_path = f"{SAVE_PATH}/{filename}"

    # Explicit encoding so the file does not depend on the runtime's locale.
    with open(full_path, "w", encoding="utf-8") as f:
        json.dump(news_data, f)

    # Prepare success response
    result = {
        "status": "success",
        "records_generated": len(news_data),
        "file_path": full_path,
        "batch_id": batch_id,
        "timestamp": datetime.now().isoformat()
    }
    exit_payload = result

except Exception as e:
    # Report the failure to the pipeline as data instead of crashing the cell;
    # the caller is expected to inspect the "status" field.
    error_result = {
        "status": "failed",
        "error": str(e),
        "timestamp": datetime.now().isoformat()
    }
    exit_payload = error_result

# Exit with JSON output for pipeline
mssparkutils.notebook.exit(json.dumps(exit_payload))

StatementMeta(, 923191d3-1280-47c3-8482-e0734fef6bac, 16, Finished, Available, Finished)

  with DDGS() as ddgs:


Searching for: Databricks...
Searching for: Snowflake Data Cloud...
SUCCESS: Saved 75 articles to:
/lakehouse/default/Files/Bronze/Landing/news_batch_20251207_084727.json


In [6]:
import pandas as pd

# Read one previously landed batch file into a DataFrame and preview the
# first rows as the cell's output.
df = pd.read_json(
    path_or_buf='/lakehouse/default/Files/Bronze/Landing/news_batch_20251207_081942.json',
)
df.head()

StatementMeta(, 923191d3-1280-47c3-8482-e0734fef6bac, 13, Finished, Available, Finished)

Unnamed: 0,date,title,body,url,image,source,competitor_tag,ingestion_time,batch_id
0,2025-12-02 19:15:00+00:00,SAS' leading decision intelligence capabilitie...,What it is: SAS Decision Builder on Microsoft ...,https://www.tmcnet.com/usubmit/2025/12/02/1029...,https://mma.prnewswire.com/media/2836749/SAS_D...,TMCnet,Microsoft Fabric,2025-12-07 08:19:42.479283,20251207081942
1,2025-12-04 07:03:44+00:00,From complexity to clarity - why Microsoft Fab...,Microsoft Fabric provides a strategic response...,https://www.msn.com/en-za/news/other/from-comp...,https://www.itweb.co.za/static/pictures/2025/1...,ITWeb,Microsoft Fabric,2025-12-07 08:19:42.479283,20251207081942
2,2025-05-14 00:01:00+00:00,Informatica Unveils New Innovations with Micro...,"REDWOOD CITY, Calif., May 14, 2025--(BUSINESS ...",https://finance.yahoo.com/news/informatica-unv...,,Yahoo Finance,Microsoft Fabric,2025-12-07 08:19:42.479283,20251207081942
3,2024-04-17 00:01:00+00:00,Microsoft Kills 30-Day Data Retention Policy f...,Microsoft this week announced several usabilit...,https://redmondmag.com/articles/2024/04/17/mic...,,Redmond Magazine,Microsoft Fabric,2025-12-07 08:19:42.479283,20251207081942
4,2025-11-18 16:16:00+00:00,Microsoft updates database portfolio and adds ...,Microsoft Corp. today is introducing updates a...,https://siliconangle.com/2025/11/18/microsoft-...,,SiliconANGLE,Microsoft Fabric,2025-12-07 08:19:42.479283,20251207081942
