## Use local files and online marketing news to build a situational-awareness model

In [17]:
# Import libraries
# data_loader.py
import os
from pathlib import Path
from embedding import *

class DataLoader:
    """Find files below a base folder and extract their text content."""

    def __init__(self, base_folder: str):
        """Store ``base_folder`` as the root that subfolders are resolved against."""
        self.base_folder = Path(base_folder)

    def list_files(self, subfolder: str, extensions: list):
        """Return the files under ``base_folder / subfolder`` with a matching suffix.

        The search is recursive. Suffix matching is case-insensitive on both
        sides: the file suffix and the requested extensions are lowercased
        before comparison, so ``[".PDF"]`` and ``[".pdf"]`` are equivalent.

        Args:
            subfolder: Folder name relative to ``base_folder``.
            extensions: Suffixes including the leading dot, e.g. ``[".pdf"]``.
                A single string is treated as a one-element list.

        Returns:
            A sorted list of ``Path`` objects, so the order does not depend on
            the order in which the filesystem yields directory entries.
        """
        if isinstance(extensions, str):
            extensions = [extensions]
        # Lowercase the requested suffixes once; the file suffix is lowercased
        # below, so mixed-case requests would otherwise never match.
        wanted = {ext.lower() for ext in extensions}
        folder_path = self.base_folder / subfolder
        return sorted(
            path
            for path in folder_path.glob("**/*")
            if path.is_file() and path.suffix.lower() in wanted
        )

    def read_all_files(self, subfolder, extensions):
        """Extract text from every matching file below ``subfolder``.

        ``TextExtractor`` is looked up when this method is called, so it must be
        in scope by then (e.g. imported from ``embedding.text_extractor``).

        Args:
            subfolder: Folder name relative to ``base_folder``.
            extensions: Suffixes to include; see ``list_files``.

        Returns:
            A list of ``(file path as str, extracted text)`` tuples. Files whose
            extraction yields an empty or falsy result are left out.
        """
        extractor = TextExtractor()
        contents = []

        for file in self.list_files(subfolder, extensions):
            text = extractor.extract(file)
            if text:
                contents.append((str(file), text))

        return contents





In [18]:
# text_extractor.py
import textract
import pandas as pd
from pathlib import Path

class TextExtractor:
    """Extract plain text from a file, choosing the reader by file extension."""

    def extract(self, file_path: Path):
        """Return the text content of ``file_path``.

        Supported suffixes (case-insensitive):
            * ``.txt`` - read as UTF-8, ignoring undecodable bytes.
            * ``.docx``, ``.doc``, ``.pdf`` - processed with ``textract``.
            * ``.csv``, ``.xlsx`` - loaded with pandas and rendered with
              ``DataFrame.to_string(index=False)``.

        Args:
            file_path: Path of the file to read.

        Returns:
            The extracted text, or ``""`` when the suffix is not supported or
            extraction fails (the failure is printed, not raised).
        """
        try:
            ext = file_path.suffix.lower()
            if ext == '.txt':
                return file_path.read_text(encoding='utf-8', errors='ignore')
            if ext in ('.docx', '.doc', '.pdf'):
                return textract.process(str(file_path)).decode('utf-8')
            if ext == '.csv':
                return pd.read_csv(file_path).to_string(index=False)
            # Path.suffix always includes the leading dot, so the comparison
            # must be against '.xlsx' (not 'xlsx') for Excel files to match.
            if ext == '.xlsx':
                return pd.read_excel(file_path).to_string(index=False)
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort a
            # whole batch, so report the problem and return an empty result.
            print(f"Failed to extract {file_path}: {e}")
            return ""
        # Unsupported suffix: return the same empty value as the failure path
        # instead of an implicit None.
        return ""

In [19]:
# embedder.py
from sentence_transformers import SentenceTransformer
import numpy as np

class Embedder:
    """Encode text into embedding vectors with a SentenceTransformer model."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Load the SentenceTransformer model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_text(self, texts: "str | list[str]"):
        """Encode ``texts`` with the loaded model.

        Args:
            texts: A single string or a list of strings; it is passed to
                ``model.encode`` unchanged.

        Returns:
            Whatever ``model.encode(texts, convert_to_numpy=True)`` returns.
        """
        # TODO: evaluate whether returning tensors instead of NumPy arrays
        # would perform better for downstream use.
        return self.model.encode(texts, convert_to_numpy=True)

    def embed_from_files(self, data: list[tuple[str, str]]) -> dict:
        """Embed the text of each ``(filename, text)`` pair.

        Args:
            data: ``(filename, text)`` tuples, e.g. the output of
                ``DataLoader.read_all_files``.

        Returns:
            A dict mapping each filename to its embedding. If a filename occurs
            more than once, the last occurrence wins. Empty ``data`` yields an
            empty dict without calling the model.
        """
        if not data:
            # Nothing to encode; skip the model call entirely.
            return {}
        texts = [text for _, text in data]
        embeddings = self.embed_text(texts)
        return {filename: emb for (filename, _), emb in zip(data, embeddings)}

In [24]:
# Root folder whose sub-folders (e.g. "pdf_files") hold the input documents.
# NOTE: machine-specific absolute path; change it to match the local environment.
BASE_FOLDER = "/Users/eshan/PycharmProjects/FastAPIProject2/test_data/"

In [25]:

# from embedding.data_loader import DataLoader
# from embedding.embedder import Embedder


# Load data: extract the text of every .pdf file found under
# BASE_FOLDER/pdf_files as a list of (file path, text) tuples.
loader = DataLoader(BASE_FOLDER)
documents = loader.read_all_files("pdf_files", [".pdf"])

# Generate embeddings: one vector per document, keyed by file path.
embedder = Embedder()
embeddings = embedder.embed_from_files(documents)

# Show results: print each file path with the shape of its embedding vector.
for filename, vector in embeddings.items():
    print(f"{filename} -> vector shape: {vector.shape}")

/Users/eshan/PycharmProjects/FastAPIProject2/test_data/pdf_files/Trends_Artificial_Intelligence.pdf -> vector shape: (384,)
/Users/eshan/PycharmProjects/FastAPIProject2/test_data/pdf_files/The Forrester Wave™_ Data Management For Analytics Platforms, Q2 2025 _ 001a000001AEj6nAAD _ 53381efb.pdf -> vector shape: (384,)


# LLM Part I - Summarization

In [None]:
'''
Separate the code into two parts, one for summarization and one for strategy modules, for:
1. Modularity: upgrade or replace one part without affecting the other
2. Reusability: The summarizer can be reused for other tasks, such as report generation, logs, etc.
3. Focus: Each module can specialize (summarizer: compression; strategist: reasoning)
4. Pipelining: Easy to compose into an end-to-end pipeline:
   - Summarization -> Embedding -> Strategy Module
'''

In [None]:
'''
Input: Raw text (PDF, news, platform texts, etc.)
Output: Key sentences, bullet points, insights, structured results
Model Choice:
    1. OpenAI GPT-3.5/4 -> Prompt-based summarization
    2. HuggingFace Transformers (e.g., BART, T5) -> Fine-tuned for summarization tasks
    3. Custom-trained models (if domain-specific) -> Open-source models or fine-tuned on specific datasets
'''

In [None]:
'''
Example prompts for summarization:
Example 1:
You are an expert financial analyst. Summarize the following text into key insights, bullet points, and structured results. Focus on:
1. Key financial metrics
2. Market trends
3. Company performance
4. Risk factors

Example 2:
Given the following text, extract key sentences and bullet points that highlight the most important information. Focus on:
1. Financial performance
2. Market trends
3. Strategic insights ###
'''

# LLM Part II - Strategy Modules

In [None]:
'''
Input: Summary + Trend Forecast + Market Data
Output: Natural-language strategy (TODO: trading signals, risk assessment)
Model Choice:
    1. OpenAI GPT-3.5/4 -> Prompt-based summary + prediction for strategy generation
    2. Mixtral/Command R -> Open-source models for alternative strategy generation
    3. Custom fine-tuned models -> On business decision-making datasets (Optional, e.g., financial reports, market analysis)
'''

In [None]:
'''
Example prompts for strategy generation:
Example 1:
You are an expert financial strategist. Based on the provided summary and market data, generate a comprehensive strategy for the next quarter. Consider the following factors:
1. Market trends
2. Economic indicators
3. Company performance
4. Risk factors
Generate a detailed strategy that includes:
- Key actions to take
- Risk management strategies
- Expected outcomes

Example 2:
Given the business summary and the forecast above, suggest a short-term and long-term strategy to optimize performance. Consider cost management, regional focus, and product timeline.

'''