In [1]:
# Step 1: Auto-install missing packages
import importlib
import subprocess
import sys

def install_if_missing(package_name, import_name=None):
    try:
        importlib.import_module(import_name if import_name else package_name)
    except ImportError:
        print(f"Installing {package_name}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

# List of required packages
required_packages = [
    ("pandas", "pd"),
    ("numpy", "np"),
    ("PyPDF2", "PyPDF2"),
    ("nltk", "nltk"),
    ("scikit-learn", "sklearn"),
    ("yfinance", "yfinance"),
    ("matplotlib", "matplotlib")
]

for pkg, imp in required_packages:
    install_if_missing(pkg, imp)

# Step 2: Import necessary libraries
import os
import pandas as pd
import numpy as np
import PyPDF2
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import yfinance as yf
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pickle
import warnings
warnings.filterwarnings('ignore')

# Step 3: Clean and re-download NLTK resources
import shutil

# Fully remove nltk_data if it exists (forces a clean download)
nltk_data_path = '/content/nltk_data'
shutil.rmtree(nltk_data_path, ignore_errors=True)
os.makedirs(nltk_data_path, exist_ok=True)

# Set NLTK to look here first
nltk.data.path.clear()
nltk.data.path.append(nltk_data_path)

# Download required corpora
nltk.download('punkt', download_dir=nltk_data_path)
nltk.download('stopwords', download_dir=nltk_data_path)
nltk.download('wordnet', download_dir=nltk_data_path)


Installing pandas...
Installing numpy...
Installing PyPDF2...
Installing yfinance...


[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to /content/nltk_data...


True

In [None]:
from google.colab import files
uploaded = files.upload()


ModuleNotFoundError: No module named 'google.colab'

In [5]:
import os
import shutil

# Create necessary directories
os.makedirs('data/books', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)
os.makedirs('data/knowledge_base', exist_ok=True)
os.makedirs('data/models', exist_ok=True)
os.makedirs('data/results', exist_ok=True)

# Move all uploaded PDFs to data/books/
for filename in uploaded.keys():
    if filename.lower().endswith(".pdf"):
        shutil.move(filename, f"data/books/{filename}")


In [6]:
!ls data/books


 little-book-that-still-beats-the-market-the-joel-greenblatt.pdf
'security-analysis-benjamin-graham-6th-edition-pdf-february-24-2010-12-08-am-3-0-meg (1).pdf'
'the-intelligent-investor (1).pdf'
'TJA-Trading-In-The-Zone-master-the-market-with-confidence-discipline-and-a-winning-attitude-by-Mark-Douglas-Book-Novel-by-www.indianpdf.com_-Download-PDF-Online-Free (1).pdf'
 what-works-on-wall-street.pdf


In [3]:
!pip install PyPDF2
import os, re, PyPDF2


Defaulting to user installation because normal site-packages is not writeable


In [8]:
# Extract text from each PDF book
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                text += page.extract_text()
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

# Clean and preprocess text
def clean_text(text):
    """Clean and preprocess the extracted text."""
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers (but keep periods and other sentence-ending punctuation)
    text = re.sub(r'[^\w\s.,;:!?()\'\"-]', ' ', text)

    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Process all PDF books
def process_all_books():
    """Process all PDF books in the data/books directory."""
    print("Processing PDF books...")
    books_data = {}

    for filename in os.listdir('data/books'):
        if filename.lower().endswith('.pdf'):
            book_name = os.path.splitext(filename)[0]
            file_path = os.path.join('data/books', filename)
            print(f"Extracting text from {book_name}...")

            book_text = extract_text_from_pdf(file_path)

            if book_text:
                # Save raw text
                with open(f"data/processed/{book_name}_raw.txt", 'w', encoding='utf-8') as f:
                    f.write(book_text)

                # Clean text
                book_text = clean_text(book_text)
                books_data[book_name] = book_text

                # Save cleaned text
                with open(f"data/processed/{book_name}_cleaned.txt", 'w', encoding='utf-8') as f:
                    f.write(book_text)

                print(f"Successfully processed {book_name} ({len(book_text)} characters)")
            else:
                print(f"Failed to extract text from {book_name}")

    return books_data

In [9]:
processed_books = process_all_books()

Processing PDF books...
Extracting text from TJA-Trading-In-The-Zone-master-the-market-with-confidence-discipline-and-a-winning-attitude-by-Mark-Douglas-Book-Novel-by-www.indianpdf.com_-Download-PDF-Online-Free (1)...
Successfully processed TJA-Trading-In-The-Zone-master-the-market-with-confidence-discipline-and-a-winning-attitude-by-Mark-Douglas-Book-Novel-by-www.indianpdf.com_-Download-PDF-Online-Free (1) (436374 characters)
Extracting text from security-analysis-benjamin-graham-6th-edition-pdf-february-24-2010-12-08-am-3-0-meg (1)...
Successfully processed security-analysis-benjamin-graham-6th-edition-pdf-february-24-2010-12-08-am-3-0-meg (1) (1706461 characters)
Extracting text from the-intelligent-investor (1)...
Successfully processed the-intelligent-investor (1) (1276740 characters)
Extracting text from little-book-that-still-beats-the-market-the-joel-greenblatt...
Successfully processed little-book-that-still-beats-the-market-the-joel-greenblatt (209046 characters)
Extracting t

In [4]:
# Extract investment principles and rules
def extract_investment_principles(books_data):
    """Extract investment principles and rules from book texts."""
    print("\nExtracting investment principles...")

    # Keywords related to investment principles
    rule_indicators = [
        'rule', 'principle', 'strategy', 'method', 'approach', 'technique',
        'important', 'essential', 'crucial', 'critical', 'key', 'fundamental',
        'always', 'never', 'must', 'should', 'recommend', 'suggest',
        'buy when', 'sell when', 'invest in', 'avoid', 'consider',
        'indicator', 'signal', 'pattern', 'trend', 'analysis', 'valuation',
        'price-to-earnings', 'p/e', 'dividend', 'yield', 'growth', 'value',
        'bullish', 'bearish', 'market', 'stock', 'share', 'investment'
    ]

    all_principles = {}

    for book_name, text in books_data.items():
        print(f"Extracting principles from {book_name}...")
        book_principles = []

        # Split text into sentences
        sentences = sent_tokenize(text)

        # For each sentence, check if it contains rule indicators
        for sentence in sentences:
            sentence = sentence.strip()

            # Skip short sentences
            if len(sentence) < 25:
                continue

            # Check if sentence contains rule indicators
            if any(indicator in sentence.lower() for indicator in rule_indicators):
                # Additional filtering to ensure quality
                # Make sure sentence is complete
                if sentence[-1] in ['.', '!', '?', ':', ';']:
                    book_principles.append(sentence)

        # Remove duplicates and near-duplicates
        book_principles = remove_duplicates(book_principles)

        # Store principles
        all_principles[book_name] = book_principles

        # Save principles to file
        with open(f"data/knowledge_base/{book_name}_principles.txt", 'w', encoding='utf-8') as f:
            for principle in book_principles:
                f.write(principle + "\n\n")

        print(f"Extracted {len(book_principles)} principles from {book_name}")

    return all_principles

# Remove duplicate or very similar principles
def remove_duplicates(principles):
    """Remove duplicate or very similar principles."""
    unique_principles = []

    if not principles:
        return unique_principles

    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    for i, principle in enumerate(principles):
        if i == 0:
            unique_principles.append(principle)
            continue

        # Calculate similarity with existing unique principles
        tfidf_matrix = vectorizer.fit_transform([principle] + unique_principles)
        similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

        # If similarity is below threshold, add to unique principles
        if not any(sim > 0.7 for sim in similarity_matrix[0]):
            unique_principles.append(principle)

    return unique_principles

In [5]:
# Manually load and patch the PunktSentenceTokenizer to avoid the phantom "punkt_tab" error
import nltk.tokenize.punkt
tokenizer_path = os.path.join(nltk_data_path, 'tokenizers/punkt/english.pickle')

try:
    with open(tokenizer_path, 'rb') as f:
        punkt_tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(pickle.load(f))
        nltk.tokenize.sent_tokenize = punkt_tokenizer.tokenize
except Exception as e:
    print(f"Manual punkt patch failed: {e}")

# Download 'punkt_tab' resource explicitly
nltk.download('punkt_tab', download_dir=nltk_data_path)

[nltk_data] Downloading package punkt_tab to /content/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [12]:
investment_principles = extract_investment_principles(processed_books)


Extracting investment principles...
Extracting principles from TJA-Trading-In-The-Zone-master-the-market-with-confidence-discipline-and-a-winning-attitude-by-Mark-Douglas-Book-Novel-by-www.indianpdf.com_-Download-PDF-Online-Free (1)...
Extracted 961 principles from TJA-Trading-In-The-Zone-master-the-market-with-confidence-discipline-and-a-winning-attitude-by-Mark-Douglas-Book-Novel-by-www.indianpdf.com_-Download-PDF-Online-Free (1)
Extracting principles from security-analysis-benjamin-graham-6th-edition-pdf-february-24-2010-12-08-am-3-0-meg (1)...
Extracted 5807 principles from security-analysis-benjamin-graham-6th-edition-pdf-february-24-2010-12-08-am-3-0-meg (1)
Extracting principles from the-intelligent-investor (1)...
Extracted 4202 principles from the-intelligent-investor (1)
Extracting principles from little-book-that-still-beats-the-market-the-joel-greenblatt...
Extracted 666 principles from little-book-that-still-beats-the-market-the-joel-greenblatt
Extracting principles from 

In [13]:
# Categorize principles into relevant groups
def categorize_principles(all_principles):
    """Categorize principles into relevant groups for stock analysis."""
    print("\nCategorizing principles...")

    categories = {
        'value_investing': ['value', 'intrinsic', 'margin of safety', 'undervalued', 'overvalued', 'book value', 'p/e', 'price-to-earnings', 'price to earnings'],
        'growth_investing': ['growth', 'earnings growth', 'revenue growth', 'expanding', 'scalable', 'future potential'],
        'technical_analysis': ['chart', 'pattern', 'trend', 'moving average', 'resistance', 'support', 'volume', 'momentum', 'oscillator'],
        'fundamental_analysis': ['fundamental', 'balance sheet', 'income statement', 'cash flow', 'earnings', 'revenue', 'profit', 'margin'],
        'risk_management': ['risk', 'diversification', 'allocation', 'portfolio', 'loss', 'hedge', 'protection', 'downside'],
        'market_timing': ['timing', 'entry', 'exit', 'buy signal', 'sell signal', 'overbought', 'oversold', 'market condition'],
        'psychological_factors': ['psychology', 'emotion', 'fear', 'greed', 'discipline', 'patience', 'confidence', 'contrarian'],
        'company_quality': ['management', 'competitive advantage', 'moat', 'leadership', 'industry position', 'innovation', 'brand', 'market share']
    }

    categorized_principles = {category: [] for category in categories}

    # Process each book's principles
    for book_name, principles in all_principles.items():
        for principle in principles:
            # Assign to categories based on keywords
            for category, keywords in categories.items():
                if any(keyword in principle.lower() for keyword in keywords):
                    # Add book source to principle
                    categorized_principles[category].append({
                        'principle': principle,
                        'source': book_name
                    })

    # Save categorized principles
    for category, principles in categorized_principles.items():
        if principles:
            with open(f"data/knowledge_base/{category}_principles.txt", 'w', encoding='utf-8') as f:
                for principle_data in principles:
                    f.write(f"SOURCE: {principle_data['source']}\n")
                    f.write(f"PRINCIPLE: {principle_data['principle']}\n\n")

    # Create a unified knowledge base
    create_unified_knowledge_base(categorized_principles)

    return categorized_principles

# Create a unified knowledge base
def create_unified_knowledge_base(categorized_principles):
    """Create a unified knowledge base from all categorized principles."""
    unified_kb = []

    for category, principles in categorized_principles.items():
        for principle_data in principles:
            unified_kb.append({
                'category': category,
                'principle': principle_data['principle'],
                'source': principle_data['source']
            })

    # Save unified knowledge base
    with open("data/knowledge_base/unified_knowledge_base.pkl", 'wb') as f:
        pickle.dump(unified_kb, f)

    # Also save as text
    with open("data/knowledge_base/unified_knowledge_base.txt", 'w', encoding='utf-8') as f:
        for entry in unified_kb:
            f.write(f"CATEGORY: {entry['category']}\n")
            f.write(f"SOURCE: {entry['source']}\n")
            f.write(f"PRINCIPLE: {entry['principle']}\n\n")

    print(f"Created unified knowledge base with {len(unified_kb)} principles")

In [14]:
categorized_principles = categorize_principles(investment_principles)


Categorizing principles...
Created unified knowledge base with 7046 principles


In [15]:
# Load the knowledge base
def load_knowledge_base():
    """Load the unified knowledge base."""
    try:
        with open("data/knowledge_base/unified_knowledge_base.pkl", 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        print("Knowledge base not found. Please run the extraction process first.")
        return []

import yfinance as yf
import pandas as pd

# Example: Get stock data for a ticker (e.g., Apple 'AAPL')
#ticker = 'AAPL'
#ticker_data = yf.download(ticker, period='30d', interval='1d')  # Last 30 days of data

def calculate_financial_metrics(ticker_data, period=7):
    """Calculate key financial metrics for a stock over the given period (in days)."""
    # Make sure we have enough data
    if len(ticker_data) < period:
        print(f"Not enough data points. Need at least {period} days.")
        return None

    # Get the most recent 'period' days of data
    recent_data = ticker_data.tail(period)

    # Debugging: Print the type and structure of 'recent_data' and 'recent_data['Close']'
    print("Recent Data (last 7 days):")
    print(recent_data)

    # Ensure 'Close' is treated as a Series
    close_data = recent_data['Close']
    print("Type of close_data:", type(close_data))
    print("Is close_data empty?", close_data.empty)

    if 'Close' not in recent_data.columns:
        print("Error: 'Close' column is missing from the data.")
        return None

     # Check if 'Close' data contains NaN values
    if close_data.isnull().any().any():  # <--- Change here
        print("Warning: 'Close' data contains NaN values. Please clean the data.")
        return None

    metrics = {}

    # Price metrics
    closing_price = close_data.iloc[-1]  # Get the last row's closing price
    metrics['closing_price'] = closing_price
    metrics['avg_price'] = close_data.mean()
    metrics['price_change'] = closing_price - close_data.iloc[0]  # Get the first row's closing price
    metrics['price_change_pct'] = (metrics['price_change'] / close_data.iloc[0]) * 100

    # Volume metrics
    metrics['volume'] = recent_data['Volume'].iloc[-1]  # Get the last row's volume
    metrics['avg_volume'] = recent_data['Volume'].mean()

    # Volume change percentage
    # Get the last element of the avg_volume Series to use for comparison
    avg_volume_value = metrics['avg_volume']
    # If the avg_volume Series has more than one value, use the last one
    if isinstance(avg_volume_value, pd.Series):
        avg_volume_value = avg_volume_value.iloc[-1]
    metrics['volume_change_pct'] = (
        (metrics['volume'] - metrics['avg_volume']) / metrics['avg_volume']) * 100 if avg_volume_value > 0 else 0


    # Trend metrics (ensure correct scalar comparison)
    metrics['uptrend'] = close_data.iloc[-1] > close_data.iloc[0]  # Compare scalars (individual values)

    # Volatility metrics
    metrics['volatility'] = close_data.std()
    metrics['volatility_pct'] = (metrics['volatility'] / metrics['avg_price']) * 100

    # Technical indicators (Simple Moving Average)
    metrics['sma_5'] = close_data.rolling(window=min(5, period)).mean().iloc[-1]

    # Check for enough data to calculate 20-day SMA
    if len(ticker_data) >= 20:
        metrics['sma_20'] = ticker_data['Close'].rolling(window=20).mean().iloc[-1]
        # Moving average crossover signal
        metrics['ma_crossover_signal'] = metrics['sma_5'] > metrics['sma_20']
    else:
        metrics['sma_20'] = None
        metrics['ma_crossover_signal'] = None

    return metrics

In [20]:
# Load the knowledge base after extraction
knowledge_base = load_knowledge_base()

# Check if knowledge base was successfully loaded
if knowledge_base:
    print(f"Loaded knowledge base with {len(knowledge_base)} principles.")
else:
    print("No knowledge base found. Please extract principles first.")


Loaded knowledge base with 7046 principles.


In [None]:


# Example: Get stock data for a ticker (e.g., Apple 'AAPL')
ticker = 'AAPL'
ticker_data = yf.download(ticker, period='30d', interval='1d')  # Last 30 days of data

# Calculate financial metrics over the last 7 days
metrics = calculate_financial_metrics(ticker_data, period=7)

# Display the calculated metrics
if metrics:
    print(f"Calculated metrics for {ticker}:")
    print(metrics)
else:
    print(f"Not enough data to calculate metrics for {ticker}.")

[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Not enough data points. Need at least 7 days.
Not enough data to calculate metrics for AAPL.


In [22]:
# Function to analyze stock based on knowledge base principles
def analyze_stock_with_principles(ticker_symbol, knowledge_base, period=7):
    """Analyze a stock using principles from the knowledge base."""
    print(f"\nAnalyzing {ticker_symbol} using knowledge-based principles...")

    try:
        # Get stock data for analysis
        end_date = datetime.now()
        start_date = end_date - timedelta(days=period * 2)  # Get more data than needed for calculations

        ticker_data = yf.download(ticker_symbol, start=start_date, end=end_date)

        if ticker_data.empty:
            print(f"No data found for {ticker_symbol}")
            return None

        # Calculate financial metrics
        metrics = calculate_financial_metrics(ticker_data, period)

        if not metrics:
            return None

        # Analyze based on principles in the knowledge base
        analysis_results = {
            'ticker': ticker_symbol,
            'analysis_date': datetime.now().strftime("%Y-%m-%d"),
            'metrics': metrics,
            'insights': [],
            'signals': {'buy': 0, 'hold': 0, 'sell': 0}
        }

        # Apply principles from each category
        for entry in knowledge_base:
            category = entry['category']
            principle = entry['principle']

            # Generate insights based on principles and metrics
            if category == 'technical_analysis':
                if 'trend' in principle.lower() and metrics['uptrend']:
                    analysis_results['insights'].append({
                        'category': category,
                        'principle': "Uptrend detected in recent price movement.",
                        'signal': 'buy'
                    })
                    analysis_results['signals']['buy'] += 1

                if 'moving average' in principle.lower() and metrics['sma_20'] and metrics['ma_crossover_signal']:
                    analysis_results['insights'].append({
                        'category': category,
                        'principle': "Short-term moving average crossed above long-term moving average, suggesting potential upward momentum.",
                        'signal': 'buy'
                    })
                    analysis_results['signals']['buy'] += 1

                if 'volume' in principle.lower() and metrics['volume_change_pct'] > 20:
                    analysis_results['insights'].append({
                        'category': category,
                        'principle': "Significant increase in trading volume, suggesting strong interest.",
                        'signal': 'buy' if metrics['price_change_pct'] > 0 else 'sell'
                    })
                    if metrics['price_change_pct'] > 0:
                        analysis_results['signals']['buy'] += 1
                    else:
                        analysis_results['signals']['sell'] += 1

            elif category == 'value_investing':
                # We don't have P/E and other fundamental data in our simple model
                # But we can look at price trends
                if 'undervalued' in principle.lower() and metrics['price_change_pct'] < -5:
                    analysis_results['insights'].append({
                        'category': category,
                        'principle': "Recent price decline may indicate potential value opportunity if fundamentals are strong.",
                        'signal': 'buy'
                    })
                    analysis_results['signals']['buy'] += 1

            elif category == 'risk_management':
                if 'volatility' in principle.lower() and metrics['volatility_pct'] > 3:
                    analysis_results['insights'].append({
                        'category': category,
                        'principle': "High volatility detected, suggesting increased risk.",
                        'signal': 'hold'
                    })
                    analysis_results['signals']['hold'] += 1

        # Generate additional insights based on metrics
        if metrics['price_change_pct'] > 5:
            analysis_results['insights'].append({
                'category': 'price_momentum',
                'principle': "Strong positive price momentum in the analysis period.",
                'signal': 'buy'
            })
            analysis_results['signals']['buy'] += 1

        if metrics['price_change_pct'] < -5:
            analysis_results['insights'].append({
                'category': 'price_momentum',
                'principle': "Significant price decline in the analysis period.",
                'signal': 'sell'
            })
            analysis_results['signals']['sell'] += 1

        # Calculate probabilities
        total_signals = sum(analysis_results['signals'].values())
        if total_signals > 0:
            analysis_results['probabilities'] = {
                'up': analysis_results['signals']['buy'] / total_signals,
                'neutral': analysis_results['signals']['hold'] / total_signals,
                'down': analysis_results['signals']['sell'] / total_signals
            }
        else:
            analysis_results['probabilities'] = {'up': 0.33, 'neutral': 0.34, 'down': 0.33}

        # Generate recommendation
        if analysis_results['probabilities']['up'] > 0.5:
            analysis_results['recommendation'] = 'BUY'
        elif analysis_results['probabilities']['down'] > 0.5:
            analysis_results['recommendation'] = 'SELL'
        else:
            analysis_results['recommendation'] = 'HOLD'

        # Create explanation
        analysis_results['explanation'] = f"Analysis of {ticker_symbol} based on investment principles suggests a {analysis_results['recommendation']} recommendation. "
        analysis_results['explanation'] += f"The probability of price increase is {analysis_results['probabilities']['up']:.2%}, "
        analysis_results['explanation'] += f"neutral is {analysis_results['probabilities']['neutral']:.2%}, "
        analysis_results['explanation'] += f"and decrease is {analysis_results['probabilities']['down']:.2%}."

        # Save results
        with open(f"data/results/{ticker_symbol}_analysis.pkl", 'wb') as f:
            pickle.dump(analysis_results, f)

        print(f"Analysis complete for {ticker_symbol}. Recommendation: {analysis_results['recommendation']}")
        return analysis_results

    except Exception as e:
        print(f"Error analyzing {ticker_symbol}: {str(e)}")
        return None

In [23]:
# Assuming the knowledge base is already loaded and available
ticker_symbol = 'AAPL'  # Example ticker
period = 7  # Analysis period (default is 7 days)

# Call the function to analyze the stock based on knowledge base principles
analysis_results = analyze_stock_with_principles(ticker_symbol, knowledge_base, period)

# Print the analysis results
if analysis_results:
    print(f"Analysis for {ticker_symbol}:")
    print(analysis_results)
else:
    print(f"No analysis results for {ticker_symbol}.")



Analyzing AAPL using knowledge-based principles...


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


No data found for AAPL
No analysis results for AAPL.


In [24]:
# Function to display analysis results
def display_analysis_results(analysis_results):
    """Display analysis results in a readable format."""
    if not analysis_results:
        print("No analysis results to display.")
        return

    print("\n" + "="*50)
    print(f"STOCK ANALYSIS: {analysis_results['ticker']}")
    print(f"Date: {analysis_results['analysis_date']}")
    print("="*50)

    print("\nMETRICS:")
    metrics = analysis_results['metrics']
    print(f"Current Price: ${metrics['closing_price']:.2f}")
    print(f"Price Change (week): {metrics['price_change_pct']:.2f}%")
    print(f"Average Volume: {metrics['avg_volume']:.0f}")
    print(f"Volatility: {metrics['volatility_pct']:.2f}%")

    print("\nINSIGHTS:")
    for i, insight in enumerate(analysis_results['insights'], 1):
        print(f"{i}. {insight['principle']} ({insight['category'].replace('_', ' ').title()}) - Signal: {insight['signal'].upper()}")

    print("\nPROBABILITIES:")
    print(f"Price Up: {analysis_results['probabilities']['up']:.2%}")
    print(f"Price Neutral: {analysis_results['probabilities']['neutral']:.2%}")
    print(f"Price Down: {analysis_results['probabilities']['down']:.2%}")

    print("\nRECOMMENDATION:")
    print(analysis_results['recommendation'])

    print("\nEXPLANATION:")
    print(analysis_results['explanation'])
    print("="*50)

In [25]:
# Assuming the analysis results are available from the analyze_stock_with_principles function
ticker_symbol = 'AAPL'  # Example ticker
period = 7  # Analysis period (default is 7 days)

# First, analyze the stock based on knowledge base principles
analysis_results = analyze_stock_with_principles(ticker_symbol, knowledge_base, period)

# Now, display the analysis results
display_analysis_results(analysis_results)



Analyzing AAPL using knowledge-based principles...


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


No data found for AAPL
No analysis results to display.


In [26]:
# Main function to run the model
def run_rule_based_model():
    """Run the rule-based stock prediction model."""
    print("\nRunning the Rule-Based Stock Prediction Model (Model 1)...")

    # Check if knowledge base exists
    if not os.path.exists("data/knowledge_base/unified_knowledge_base.pkl"):
        # Process books and extract principles
        books_data = process_all_books()
        all_principles = extract_investment_principles(books_data)
        categorized_principles = categorize_principles(all_principles)

    # Load knowledge base
    knowledge_base = load_knowledge_base()

    if not knowledge_base:
        print("Knowledge base is empty. Please check the extraction process.")
        return

    print(f"Loaded knowledge base with {len(knowledge_base)} principles.")

    # Define stocks to analyze
    stocks_to_analyze = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META']

    # Analyze each stock
    analysis_results = {}
    for ticker in stocks_to_analyze:
        result = analyze_stock_with_principles(ticker, knowledge_base)
        if result:
            analysis_results[ticker] = result
            display_analysis_results(result)

    return analysis_results

# Function to analyze a specific stock (for external usage)
def analyze_specific_stock(ticker_symbol, period=7):
    """Analyze a specific stock using the rule-based model."""
    # Load knowledge base
    knowledge_base = load_knowledge_base()

    if not knowledge_base:
        print("Knowledge base is empty. Please run the full model first.")
        return None

    # Analyze the stock
    result = analyze_stock_with_principles(ticker_symbol, knowledge_base, period)

    # Display results
    if result:
        display_analysis_results(result)

    return result

In [27]:
# Call the run_rule_based_model to analyze a set of stocks
run_rule_based_model()



Running the Rule-Based Stock Prediction Model (Model 1)...
Loaded knowledge base with 7046 principles.

Analyzing AAPL using knowledge-based principles...


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


No data found for AAPL

Analyzing MSFT using knowledge-based principles...


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['MSFT']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


No data found for MSFT

Analyzing GOOGL using knowledge-based principles...


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['GOOGL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


No data found for GOOGL

Analyzing AMZN using knowledge-based principles...


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AMZN']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


No data found for AMZN

Analyzing META using knowledge-based principles...


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['META']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


No data found for META


{}

In [28]:
# Example of calling analyze_specific_stock for a single stock (e.g., 'AAPL')
ticker_symbol = 'AAPL'  # Stock ticker to analyze
period = 7  # Default analysis period is 7 days

result = analyze_specific_stock(ticker_symbol, period)



Analyzing AAPL using knowledge-based principles...


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


No data found for AAPL


In [None]:
# Example usage to analyze a stock report
def analyze_stock_report(ticker_symbol, report_text=None, period=7):
    """
    Analyze a stock report using the rule-based model.
    If report_text is provided, it will be considered in the analysis.
    Otherwise, only historical data will be used.
    """
    # Load knowledge base
    knowledge_base = load_knowledge_base()

    if not knowledge_base:
        print("Knowledge base is empty. Please run the full model first.")
        return None

    # Analyze based on historical data
    result = analyze_stock_with_principles(ticker_symbol, knowledge_base, period)

    # If report text is provided, enhance analysis with it
    if report_text and result:
        # Analyze report for sentiment and key information
        sentiment_score = 0

        # Simple keyword-based sentiment analysis
        positive_keywords = ['growth', 'profit', 'increase', 'exceed', 'beat', 'positive', 'strong', 'success', 'innovation']
        negative_keywords = ['decline', 'loss', 'decrease', 'miss', 'below', 'negative', 'weak', 'failure', 'risk']

        # Count occurrences
        pos_count = sum(report_text.lower().count(word) for word in positive_keywords)
        neg_count = sum(report_text.lower().count(word) for word in negative_keywords)

        # Calculate sentiment score (-1 to 1)
        total_count = pos_count + neg_count
        if total_count > 0:
            sentiment_score = (pos_count - neg_count) / total_count

        # Add report analysis to results
        result['report_analysis'] = {
            'sentiment_score': sentiment_score,
            'sentiment': 'Positive' if sentiment_score > 0.2 else ('Negative' if sentiment_score < -0.2 else 'Neutral')
        }

        # Adjust probabilities based on report sentiment
        if sentiment_score > 0.3:
            result['probabilities']['up'] = min(0.9, result['probabilities']['up'] + 0.2)
            result['probabilities']['down'] = max(0.1, result['probabilities']['down'] - 0.2)
        elif sentiment_score < -0.3:
            result['probabilities']['down'] = min(0.9, result['probabilities']['down'] + 0.2)
            result['probabilities']['up'] = max(0.1, result['probabilities']['up'] - 0.2)

        # Normalize probabilities
        total_prob = sum(result['probabilities'].values())
        for key in result['probabilities']:
            result['probabilities'][key] /= total_prob

        # Update recommendation
        if result['probabilities']['up'] > 0.5:
            result['recommendation'] = 'BUY'
        elif result['probabilities']['down'] > 0.5:
            result['recommendation'] = 'SELL'
        else:
            result['recommendation'] = 'HOLD'

        # Update explanation
        result['explanation'] = f"Analysis of {ticker_symbol} based on investment principles and recent report "
        result['explanation'] += f"suggests a {result['recommendation']} recommendation. "
        result['explanation'] += f"The report sentiment is {result['report_analysis']['sentiment']} ({result['report_analysis']['sentiment_score']:.2f}). "
        result['explanation'] += f"The probability of price increase is {result['probabilities']['up']:.2%}, "
        result['explanation'] += f"neutral is {result['probabilities']['neutral']:.2%}, "
        result['explanation'] += f"and decrease is {result['probabilities']['down']:.2%}."

    # Display results
    if result:
        display_analysis_results(result)

    return result

In [None]:
# Analyze stock without a report (only using historical data)
ticker_symbol = 'AAPL'  # Example stock ticker
period = 7  # Default period (7 days)

# Call the function
result = analyze_stock_report(ticker_symbol, report_text=None, period=period)


[*********************100%***********************]  1 of 1 completed


Analyzing AAPL using knowledge-based principles...
Recent Data (last 7 days):
Price            Close        High         Low        Open     Volume
Ticker            AAPL        AAPL        AAPL        AAPL       AAPL
Date                                                                 
2025-04-14  202.520004  212.940002  201.160004  211.440002  101352900
2025-04-15  202.139999  203.509995  199.800003  201.860001   51343900
2025-04-16  194.270004  200.699997  192.369995  198.360001   59732400
2025-04-17  196.979996  198.830002  194.419998  197.199997   51334300
2025-04-21  193.160004  193.800003  189.809998  193.270004   46742500
2025-04-22  199.740005  201.589996  195.970001  196.119995   52976400
2025-04-23  204.600006  208.000000  202.798996  206.000000   51988230
Type of close_data: <class 'pandas.core.frame.DataFrame'>
Is close_data empty? False
Error analyzing AAPL: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().





In [None]:
# This cell demonstrates how to use the model
# You can uncomment the appropriate part to run

# Run the complete model to process books and analyze stocks
analysis_results = run_rule_based_model()

# OR analyze a specific stock
# result = analyze_specific_stock('TSLA')

# OR analyze a stock with a news report
# example_report = """
# Tesla reported strong quarterly earnings that exceeded analyst expectations.
# Revenue grew by 20% year-over-year, driven by increased vehicle deliveries and growth in the energy business.
# The company also announced plans to expand production capacity in coming quarters.
# """
# result = analyze_stock_report('TSLA', example_report)

print("Model is ready to use! Uncomment the example usage above to test it.")

[*********************100%***********************]  1 of 1 completed


Running the Rule-Based Stock Prediction Model (Model 1)...
Loaded knowledge base with 5951 principles.

Analyzing AAPL using knowledge-based principles...
Recent Data (last 7 days):
Price            Close        High         Low        Open     Volume
Ticker            AAPL        AAPL        AAPL        AAPL       AAPL
Date                                                                 
2025-04-14  202.520004  212.940002  201.160004  211.440002  101352900
2025-04-15  202.139999  203.509995  199.800003  201.860001   51343900
2025-04-16  194.270004  200.699997  192.369995  198.360001   59732400
2025-04-17  196.979996  198.830002  194.419998  197.199997   51334300
2025-04-21  193.160004  193.800003  189.809998  193.270004   46742500
2025-04-22  199.740005  201.589996  195.970001  196.119995   52976400
2025-04-23  204.600006  208.000000  202.798996  206.000000   51988230
Type of close_data: <class 'pandas.core.frame.DataFrame'>
Is close_data empty? False
Error analyzing AAPL: The truth 


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Recent Data (last 7 days):
Price            Close        High         Low        Open    Volume
Ticker            MSFT        MSFT        MSFT        MSFT      MSFT
Date                                                                
2025-04-14  387.809998  394.649994  384.209991  393.220001  19251200
2025-04-15  385.730011  391.890015  384.160004  388.510010  17199900
2025-04-16  371.609985  381.609985  368.000000  380.670013  21967800
2025-04-17  367.779999  374.320007  366.890015  373.750000  20943700
2025-04-21  359.119995  364.480011  355.670013  362.820007  20807300
2025-04-22  366.820007  367.769989  359.859985  363.380005  19485000
2025-04-23  374.390015  380.390015  373.029999  376.059998  20161345
Type of close_data: <class 'pandas.core.frame.DataFrame'>
Is close_data empty? False
Error analyzing MSFT: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

Analyzing GOOGL using knowledge-based principles...
Recent Data (last 7 days):
P

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Recent Data (last 7 days):
Price            Close        High         Low        Open    Volume
Ticker            AMZN        AMZN        AMZN        AMZN      AMZN
Date                                                                
2025-04-14  182.119995  187.440002  179.229996  186.839996  48002500
2025-04-15  179.589996  182.350006  177.929993  181.410004  43642000
2025-04-16  174.330002  179.100006  171.410004  176.289993  51875300
2025-04-17  172.610001  176.210007  172.000000  176.000000  44468400
2025-04-21  167.320007  169.600006  165.289993  169.600006  48126100
2025-04-22  173.179993  176.779999  169.350006  169.850006  56607200
2025-04-23  180.600006  187.380005  180.199997  183.440002  61598390
Type of close_data: <class 'pandas.core.frame.DataFrame'>
Is close_data empty? False
Error analyzing AMZN: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

Analyzing META using knowledge-based principles...
Recent Data (last 7 days):
Pr


