## Test Deepseek API

In [15]:
import os
import requests
import json

# Set API key as a string
DEEPSEEK_API_KEY = "sk-a1612c7e6dcd470d8740197fd52068b9"
os.environ["deepseek_api_key"] = DEEPSEEK_API_KEY  # Set environment variable

API_ENDPOINT = "https://api.deepseek.com/v1/chat/completions"

# Test connection with raw request
try:
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
    }
    
    payload = {
        "model": "deepseek-chat",
        "messages": [
            {
                "role": "user",
                "content": "Hello"
            }
        ],
        "temperature": 0.7,
        "max_tokens": 100
    }
    
    response = requests.post(
        API_ENDPOINT,
        headers=headers,
        json=payload
    )
    
    if response.status_code == 200:
        print("Connection successful!")
        print(response.json())
    else:
        print(f"Connection failed with status code: {response.status_code}")
        print(f"Response: {response.text}")
        
except Exception as e:
    print(f"Connection failed: {str(e)}")

Connection successful!
{'id': 'd66e3324-312e-4336-b6db-cea6d1d6abda', 'object': 'chat.completion', 'created': 1736152666, 'model': 'deepseek-chat', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'Hello! How can I assist you today? 😊'}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 4, 'completion_tokens': 11, 'total_tokens': 15, 'prompt_cache_hit_tokens': 0, 'prompt_cache_miss_tokens': 4}, 'system_fingerprint': 'fp_3a5770e1b4'}


## Test DuckDuckGo API

In [6]:
from typing import List, Dict, Optional, Union
from duckduckgo_search import DDGS

class SearchUtils:
    """Wrapper for DuckDuckGo search functionality"""
    
    def __init__(self, timeout: int = 10, proxies: Optional[str] = None):
        self.ddgs = DDGS(timeout=timeout, proxy=proxies)

    def text_search(
        self,
        query: str,
        max_results: int = 10,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None
    ) -> List[Dict[str, str]]:
        try:
            results = []
            for r in self.ddgs.text(
                keywords=query,
                region=region,
                safesearch=safesearch,
                timelimit=timelimit
            ):
                result = {
                    'title': r.get('title', 'No title'),
                    'url': r.get('link', 'No URL'),
                    'description': r.get('body', 'No description')
                }
                results.append(result)
                if len(results) >= max_results:
                    break
            return results
        except Exception as e:
            print(f"Search failed: {str(e)}")
            return []

    def news_search(
        self,
        query: str,
        max_results: int = 10,
        region: str = "wt-wt",
        timelimit: Optional[str] = None
    ) -> List[Dict[str, str]]:
        try:
            results = []
            for r in self.ddgs.news(
                keywords=query,
                region=region,
                timelimit=timelimit
            ):
                results.append(r)
                if len(results) >= max_results:
                    break
            return results
        except Exception as e:
            print(f"News search failed: {str(e)}")
            return []

    def image_search(
        self,
        query: str,
        max_results: int = 10,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        size: Optional[str] = None,
        color: Optional[str] = None,
        type_image: Optional[str] = None,
        layout: Optional[str] = None
    ) -> List[Dict[str, str]]:
        try:
            results = []
            for r in self.ddgs.images(
                keywords=query,
                region=region,
                safesearch=safesearch,
                size=size,
                color=color,
                type_image=type_image,
                layout=layout
            ):
                results.append(r)
                if len(results) >= max_results:
                    break
            return results
        except Exception as e:
            print(f"Image search failed: {str(e)}")
            return []

# Example usage
if __name__ == "__main__":
    search = SearchUtils()
    
    # Text search example
    text_results = search.text_search(
        query="Python programming",
        max_results=5
    )
    print("\nText Search Results:")
    for r in text_results:
        print(f"\nTitle: {r['title']}")
        print(f"URL: {r['url']}")
        print(f"Description: {r['description']}")

    # News search example
    news_results = search.news_search(
        query="artificial intelligence",
        max_results=5,
        timelimit="d"  # Last 24 hours
    )
    print("\nNews Search Results:")
    for r in news_results:
        print(f"\nTitle: {r.get('title', 'No title')}")
        print(f"URL: {r.get('url', 'No URL')}")
        print(f"Date: {r.get('date', 'No date')}")
        print(f"Description: {r.get('description', 'No description')}")



Text Search Results:

Title: Welcome to Python.org
URL: No URL
Description: Python is a programming language that lets you work quickly and integrate systems more effectively. Learn More. Get Started. Whether you're new to programming or an experienced developer, it's easy to learn and use Python. Start with our Beginner's Guide. Download.

Title: Python For Beginners | Python.org
URL: No URL
Description: Learn how to get started with Python, a popular and easy-to-use programming language. Find out how to install, edit, and use Python, and explore its libraries, documentation, and community resources.

Title: Python Tutorial - W3Schools
URL: No URL
Description: W3Schools offers a comprehensive and interactive Python tutorial with examples, exercises, quizzes, and references. You can learn Python basics, web applications, file handling, database handling, and more.

Title: Python (programming language) - Wikipedia
URL: No URL
Description: Python is a high-level, general-purpose program

In [3]:
from typing import Dict, List, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
import duckdb
from pathlib import Path
import os
import json  # Add missing json import

# Environment setup
PROJECT_ROOT = Path(os.getenv('PROJECT_ROOT', '/Users/srvo/notebooks'))
DATA_DIR = PROJECT_ROOT / 'data'
DB_PATH = DATA_DIR / 'raw.db'

# Ensure data directory exists
DATA_DIR.mkdir(parents=True, exist_ok=True)

@dataclass
class CompanyBasicInfo:
    name: str
    industry: Optional[str] = None
    headquarters: Optional[str] = None
    description: Optional[str] = None
    website: Optional[str] = None
    founded: Optional[str] = None

@dataclass
class ResearchFindings:
    timestamp: str
    source: str
    category: str
    summary: str
    url: Optional[str] = None
    relevance_score: Optional[float] = None

class ResearchDB:
    def __init__(self, db_path: Path = DB_PATH):
        self.db_path = db_path
        self.conn = duckdb.connect(str(db_path))
        self.init_schema()
    
    def init_schema(self):
        """Initialize database schema with version tracking"""
        # Schema version tracking
        self.conn.execute("""
            CREATE SEQUENCE IF NOT EXISTS company_id_seq;
            
            CREATE TABLE IF NOT EXISTS companies (
                id INTEGER PRIMARY KEY DEFAULT nextval('company_id_seq'),
                name VARCHAR UNIQUE,
                industry VARCHAR,
                headquarters VARCHAR,
                description TEXT,
                website VARCHAR,
                founded VARCHAR,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        
        self.conn.execute("""
            CREATE SEQUENCE IF NOT EXISTS finding_id_seq;
            
            CREATE TABLE IF NOT EXISTS research_findings (
                id INTEGER PRIMARY KEY DEFAULT nextval('finding_id_seq'),
                company_id INTEGER,
                category VARCHAR,
                summary TEXT,
                source VARCHAR,
                url VARCHAR,
                relevance_score FLOAT,
                raw_data JSON,
                discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (company_id) REFERENCES companies(id)
            )
        """)

    def add_company(self, company_info: CompanyBasicInfo) -> int:
        """Add or update a company in the database"""
        try:
            # Check if company exists
            existing = self.conn.execute(
                "SELECT id FROM companies WHERE name = ?", 
                (company_info.name,)
            ).fetchone()
            
            if existing:
                # Update existing company
                self.conn.execute("""
                    UPDATE companies 
                    SET industry = ?,
                        headquarters = ?,
                        description = ?,
                        website = ?,
                        founded = ?,
                        updated_at = CURRENT_TIMESTAMP
                    WHERE id = ?
                """, (
                    company_info.industry,
                    company_info.headquarters,
                    company_info.description,
                    company_info.website,
                    company_info.founded,
                    existing[0]
                ))
                return existing[0]
            else:
                # Insert new company
                result = self.conn.execute("""
                    INSERT INTO companies (
                        name, industry, headquarters, description, website, founded
                    )
                    VALUES (?, ?, ?, ?, ?, ?)
                    RETURNING id
                """, (
                    company_info.name,
                    company_info.industry,
                    company_info.headquarters,
                    company_info.description,
                    company_info.website,
                    company_info.founded
                )).fetchone()
                return result[0]
        except Exception as e:
            print(f"Error managing company: {str(e)}")
            return None

# Test the fixed code
if __name__ == "__main__":
    # Initialize database
    db = ResearchDB()
    
    # Add a test company
    tesla_info = CompanyBasicInfo(
        name="Tesla",
        industry="Automotive & Energy",
        headquarters="Austin, Texas",
        website="tesla.com",
        founded="2003",
        description="Electric vehicle and clean energy company"
    )
    
    company_id = db.add_company(tesla_info)
    print(f"Added Tesla with ID: {company_id}")
    
    if company_id:
        # Add a test finding
        finding = ResearchFindings(
            timestamp=datetime.now().isoformat(),
            source="EPA Database",
            category="environmental_safety",
            summary="Environmental impact assessment for Gigafactory",
            url="https://example.com/report",
            relevance_score=0.85
        )
        
        raw_data = {
            "document_type": "impact_assessment",
            "facility": "Gigafactory Texas",
            "assessment_date": "2023-12-01",
            "key_findings": ["water usage concerns", "wildlife impact mitigation"]
        }
        
        db.add_finding(company_id, finding, raw_data)
        
        # Verify the data
        findings = db.get_company_findings("Tesla", category="environmental_safety")
        print("\nTesla Environmental Safety Findings:")
        for f in findings:
            print(f"\nCategory: {f['category']}")
            print(f"Summary: {f['summary']}")
            print(f"Source: {f['source']}")
            print(f"Relevance: {f['relevance_score']}")
            if f['raw_data']:
                print("Raw Data:", f['raw_data'])

Error managing company: Constraint Error: NOT NULL constraint failed: companies.id
Added Tesla with ID: None


In [8]:
from typing import Dict, List, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
import json

@dataclass
class CompanyBasicInfo:
    name: str
    industry: Optional[str] = None
    headquarters: Optional[str] = None
    description: Optional[str] = None
    website: Optional[str] = None
    founded: Optional[str] = None
    
@dataclass
class NewsItem:
    title: str
    url: str
    date: str
    source: str
    description: Optional[str] = None
    
@dataclass
class SearchStrategy:
    category: str  # environmental_safety, financial_legal, or ethical_social
    query: str
    sources: List[str]
    date_range: Optional[str] = None
    keywords: List[str] = None
    
@dataclass
class ResearchFindings:
    timestamp: str
    source: str
    category: str
    summary: str
    url: Optional[str] = None
    relevance_score: Optional[float] = None

@dataclass
class CompanyProfile:
    basic_info: CompanyBasicInfo
    news: List[NewsItem]
    search_strategies: List[SearchStrategy]
    findings: List[ResearchFindings]
    last_updated: str
    research_status: str  # 'initial', 'in_progress', 'completed'
    
class CompanyDatabase:
    def __init__(self, save_path: str = "company_research.json"):
        self.save_path = save_path
        self.companies: Dict[str, CompanyProfile] = {}
        self.load_database()
    
    def load_database(self):
        """Load existing research data from JSON file"""
        try:
            with open(self.save_path, 'r') as f:
                data = json.load(f)
                for company_name, profile_data in data.items():
                    self.companies[company_name] = CompanyProfile(**profile_data)
        except FileNotFoundError:
            print(f"No existing database found at {self.save_path}")
    
    def save_database(self):
        """Save current research data to JSON file"""
        with open(self.save_path, 'w') as f:
            json.dump({name: asdict(profile) for name, profile in self.companies.items()}, 
                     f, indent=2)
    
    def add_company(self, name: str, initial_info: Dict = None):
        """Add a new company to the database"""
        if name in self.companies:
            print(f"Company {name} already exists in database")
            return
        
        basic_info = CompanyBasicInfo(
            name=name,
            **(initial_info or {})
        )
        
        self.companies[name] = CompanyProfile(
            basic_info=basic_info,
            news=[],
            search_strategies=[],
            findings=[],
            last_updated=datetime.now().isoformat(),
            research_status='initial'
        )
        self.save_database()
    
    def update_company_info(self, name: str, info_type: str, data: Dict):
        """Update specific information for a company"""
        if name not in self.companies:
            print(f"Company {name} not found in database")
            return
        
        company = self.companies[name]
        
        if info_type == 'basic_info':
            company.basic_info = CompanyBasicInfo(**data)
        elif info_type == 'news':
            company.news.append(NewsItem(**data))
        elif info_type == 'search_strategy':
            company.search_strategies.append(SearchStrategy(**data))
        elif info_type == 'finding':
            company.findings.append(ResearchFindings(**data))
        
        company.last_updated = datetime.now().isoformat()
        self.save_database()
    
    def get_company_summary(self, name: str) -> Dict:
        """Get a summary of all research data for a company"""
        if name not in self.companies:
            return {"error": f"Company {name} not found"}
        
        company = self.companies[name]
        return {
            "basic_info": asdict(company.basic_info),
            "news_count": len(company.news),
            "latest_news": [asdict(n) for n in company.news[-5:]] if company.news else [],
            "search_strategies": len(company.search_strategies),
            "findings_count": len(company.findings),
            "latest_findings": [asdict(f) for f in company.findings[-5:]] if company.findings else [],
            "last_updated": company.last_updated,
            "research_status": company.research_status
        }

# Example usage
if __name__ == "__main__":
    # Initialize database
    db = CompanyDatabase()
    
    # Add some companies
    companies_to_research = [
        "Tesla",
        "Apple",
        "Microsoft",
        "Amazon",
        "Google"
    ]
    
    for company in companies_to_research:
        db.add_company(company)
        
    # Example of adding initial information
    tesla_info = {
        "industry": "Automotive & Energy",
        "headquarters": "Austin, Texas",
        "website": "tesla.com",
        "founded": "2003"
    }
    db.update_company_info("Tesla", "basic_info", tesla_info)
    
    # Example of adding a research finding
    finding = {
        "timestamp": datetime.now().isoformat(),
        "source": "EPA Database",
        "category": "environmental_safety",
        "summary": "Found environmental impact assessment for Gigafactory",
        "url": "https://example.com/report",
        "relevance_score": 0.85
    }
    db.update_company_info("Tesla", "finding", finding)
    
    # Print summary for a company
    print("\nTesla Research Summary:")
    print(json.dumps(db.get_company_summary("Tesla"), indent=2))

No existing database found at company_research.json


TypeError: CompanyBasicInfo.__init__() missing 1 required positional argument: 'name'

In [7]:
x

def generate_search_prompts(company_name):
    prompt_configs = [
        {
            "type": "environmental_safety",
            "system": """You are a search prompt generator focused on uncovering corporate environmental and safety violations. Generate:
1. A focused search query using Boolean operators (AND, OR, NOT)
2. A list of recommended sources to check (e.g., EPA database, OSHA records)
3. Specific date ranges or notable incidents if applicable

For {company}, focus on: environmental damage, toxic spills, worker safety violations, workplace accidents, public health hazards.
Include keywords like: toxic, spill, contamination, "safety violation", accident, death, injury, "OSHA citation"."""
        },
        {
            "type": "financial_legal",
            "system": """You are a search prompt generator focused on uncovering corporate financial and legal misconduct. Generate:
1. A focused search query using Boolean operators (AND, OR, NOT)
2. A list of recommended sources to check (e.g., SEC filings, court records)
3. Specific date ranges or notable cases if applicable

For {company}, focus on: fraud, embezzlement, insider trading, bribery, corruption, regulatory violations, lawsuits.
Include keywords like: fraud, "SEC investigation", lawsuit, settlement, fine, "criminal charges", "regulatory action"."""
        },
        {
            "type": "ethical_social",
            "system": """You are a search prompt generator focused on uncovering corporate ethical and social controversies. Generate:
1. A focused search query using Boolean operators (AND, OR, NOT)
2. A list of recommended sources to check (e.g., news archives, labor board records)
3. Specific date ranges or notable incidents if applicable

For {company}, focus on: discrimination, harassment, labor disputes, human rights violations, political activities, social impact.
Include keywords like: discrimination, harassment, "labor violation", protest, boycott, controversy, scandal."""
        }
    ]

    prompts = []
    for config in prompt_configs:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": config["system"].format(company=company_name)},
                {"role": "user", "content": f"Generate a structured search strategy for {company_name}"}
            ],
            max_tokens=4000,
            temperature=0.2,
            stream=False
        )
        prompts.append({
            "type": config["type"],
            "content": response.choices[0].message.content
        })
    
    return prompts

# Example usage
company_name = "Example Corp"
results = generate_search_prompts(company_name)

# Print results in a structured format
for result in results:
    print(f"\n=== {result['type'].upper()} SEARCH STRATEGY ===")
    print(result['content'])
    print("=" * 80)


# Generate search prompts for different types of corporate controversies
prompts = [
    client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are a search prompt generator focused on uncovering corporate environmental and safety violations. When given a company name, generate a detailed search query to uncover incidents of environmental damage, toxic spills, worker safety violations, workplace accidents, and public health hazards. Focus on keywords like 'toxic', 'spill', 'contamination', 'safety violation', 'accident', 'death', 'injury', 'OSHA citation'."},
            {"role": "user", "content": "Hello"}
        ],
        max_tokens=4000,
        temperature=0,
        stream=False
    ),
    client.chat.completions.create(
        model="deepseek-chat", 
        messages=[
            {"role": "system", "content": "You are a search prompt generator focused on uncovering corporate financial and legal misconduct. When given a company name, generate a detailed search query to uncover fraud, embezzlement, insider trading, bribery, corruption, regulatory violations, and major lawsuits. Focus on keywords like 'fraud', 'SEC investigation', 'lawsuit', 'settlement', 'fine', 'criminal charges', 'regulatory action'."},
            {"role": "user", "content": "Hello"}
        ],
        max_tokens=4000,
        temperature=0,
        stream=False
    ),
    client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are a search prompt generator focused on uncovering corporate ethical and social controversies. When given a company name, generate a detailed search query to uncover discrimination, harassment, labor disputes, human rights violations, controversial political activities, and negative social impact. Focus on keywords like 'discrimination', 'harassment', 'labor violation', 'protest', 'boycott', 'controversy', 'scandal'."},
            {"role": "user", "content": "Hello"}
        ],
        max_tokens=4000,
        temperature=0,
        stream=False
    )
]

# Print all generated prompts
for i, response in enumerate(prompts, 1):
    print(f"\nPrompt {i}:")
    print(response.choices[0].message.content)
    print("-" * 80)

KeyboardInterrupt: 