In [14]:
# Install necessary packages
!pip install langchain langchain_community langchain_openai faiss-cpu tiktoken requests beautifulsoup4 tenacity

# Import required libraries
import os
import re
import time
import random
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_exponential
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Set your OpenAI API key
# Do not paste the key into this cell. setdefault keeps a key that is already
# exported in the environment instead of overwriting it with an empty string.
os.environ.setdefault("OPENAI_API_KEY", "")
if not os.environ["OPENAI_API_KEY"]:
    # An empty key only fails later, inside the embedding/LLM calls, so say so now.
    print("Warning: OPENAI_API_KEY is empty; export it before running the OpenAI steps.")

# Define function to fetch and parse SEC filings with retry logic
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=60))
def fetch_sec_filing(url, timeout=30):
    """Download a filing page and return its visible text, one fragment per line.

    The call is wrapped by tenacity's ``retry``: it is attempted up to five
    times with an exponential wait (bounded to 4-60 seconds) between attempts.

    Args:
        url: Address of the HTML filing to download.
        timeout: Seconds to wait on the HTTP request before giving up on the
            attempt. Without a timeout a stalled connection would block
            forever and the retry policy would never get a chance to run.

    Returns:
        The page text with ``<script>``/``<style>`` content removed, every
        fragment stripped of surrounding whitespace, empty fragments dropped,
        and the remainder joined with newlines.

    Raises:
        requests.HTTPError: On any non-200 response (for a single attempt).
            Other ``requests`` errors such as timeouts propagate the same way;
            each failure triggers the retry policy above.
    """
    # Rotate user agents to avoid detection
    # NOTE(review): confirm that browser-style agents comply with the target
    # site's access policy; some sites ask automated clients to identify themselves.
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
    ]

    headers = {
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    # Add random delay to avoid rate limiting (applies to every attempt,
    # on top of the wait configured in the retry decorator)
    time.sleep(random.uniform(1, 3))

    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # A specific exception type (still an Exception subclass) that carries
        # the response for anyone inspecting the failure.
        raise requests.HTTPError(
            f"Failed to fetch the URL: {url}, Status code: {response.status_code}",
            response=response,
        )

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract text content, removing script and style elements
    for tag in soup(["script", "style"]):
        tag.extract()

    text = soup.get_text()

    # Clean up text: strip each line, break lines on double spaces, and keep
    # only the non-empty fragments
    lines = (line.strip() for line in text.splitlines())
    fragments = (phrase.strip() for line in lines for phrase in line.split("  "))
    return '\n'.join(fragment for fragment in fragments if fragment)

# URLs for the 10-K SEC filings
tesla_url = "https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231.htm"
gm_url = "https://www.sec.gov/Archives/edgar/data/1467858/000146785824000031/gm-20231231.htm"

# Fetch and parse the SEC filings
# Each filing is fetched in its own try block, so a failure on one company no
# longer throws away (or skips) the other company's filing.
filing_texts = {}
for company_label, filing_url in (("Tesla", tesla_url), ("GM", gm_url)):
    print(f"Fetching {company_label} 10-K filing...")
    try:
        filing_text = fetch_sec_filing(filing_url)
    except Exception as e:  # best-effort boundary: report the error and continue with a placeholder
        print(f"Error fetching SEC filings: {e}")
        # Fallback to sample text if fetching fails
        filing_text = f"This is a placeholder for {company_label}'s 10-K filing content."
    else:
        print(f"{company_label} 10-K length: {len(filing_text)} characters")
    filing_texts[company_label] = filing_text

# Keep the names the rest of the script reads
tesla_text = filing_texts["Tesla"]
gm_text = filing_texts["GM"]

# Wrap each filing's text in a Document tagged with its source and company
tesla_doc, gm_doc = (
    Document(page_content=body, metadata={"source": source_label, "company": company_name})
    for body, source_label, company_name in (
        (tesla_text, "Tesla 10-K", "Tesla"),
        (gm_text, "GM 10-K", "General Motors"),
    )
)

# Both filings go through the same splitting step
all_docs = [tesla_doc, gm_doc]

# Chunking: 1000-character pieces with 200 characters of overlap, measured with len
text_splitter = RecursiveCharacterTextSplitter(length_function=len, chunk_overlap=200, chunk_size=1000)

all_splits = text_splitter.split_documents(all_docs)
print(f"Split {len(all_docs)} documents into {len(all_splits)} chunks")

# Embed every chunk with OpenAI embeddings and index the vectors in FAISS
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents=all_splits, embedding=embeddings)

# Persist the index to the local "faiss_index" folder for future use
vectorstore.save_local(folder_path="faiss_index")

# Retriever over the index, configured with k=15
retriever = vectorstore.as_retriever(search_kwargs=dict(k=15))

# Prompt text for the analyst persona; {context} and {question} are the two
# placeholders filled in when the chain runs
template = """
You are a financial analyst specializing in the automotive industry. Your task is to provide a detailed comparison of Tesla and GM's approaches to manufacturing and production, with special attention to electric vehicles.

Compare Tesla and GM's approaches to manufacturing and production, particularly for electric vehicles. Analyze their manufacturing locations, production methods, and safety standards.

Use only the following retrieved context to answer the question. If you don't know the answer or if the information is not provided in the context, say that you don't know.

Retrieved context:
{context}

Question: {question}

Your detailed analysis should include:
1. Manufacturing locations for both companies
2. Production methods, especially for electric vehicles
3. Safety standards followed in their vehicles
4. Key differences and similarities in their approaches

Your analysis:
"""

PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

# Build the retrieval QA chain: GPT-4 at temperature 0, the custom prompt, and
# source documents returned alongside the answer
llm = ChatOpenAI(temperature=0, model_name="gpt-4")
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=PROMPT),
    return_source_documents=True,
)

# Define the main question
query = """How do Tesla and GM's approaches to manufacturing and production compare, particularly for electric vehicles?
Where are their vehicles produced? What are the safety standards followed in their vehicles?"""

# Run the query
# Calling the chain object directly (qa_chain({...})) is deprecated in current
# LangChain releases; invoke() is the supported entry point and takes the same
# input dict.
result = qa_chain.invoke({"query": query})

# Print the answer
print("\n\nFinal Answer:")
print(result["result"])

# Print the source documents used for the answer (first five only, with a
# 150-character preview of each chunk)
print("\nSources:")
for source_number, doc in enumerate(result["source_documents"][:5], start=1):
    print(f"Source {source_number}: {doc.metadata['source']} (Company: {doc.metadata['company']})")
    print(f"Content snippet: {doc.page_content[:150]}...\n")


Fetching Tesla 10-K filing...
Tesla 10-K length: 422470 characters
Fetching GM 10-K filing...
GM 10-K length: 487665 characters
Split 2 documents into 1191 chunks


Final Answer:
1. Manufacturing Locations: Tesla's manufacturing facilities are located in California, Nevada, Texas, China, and Germany. They also have plans for future sites such as Mexico. Tesla's active production models are produced at the Fremont Factory, Gigafactory Shanghai, Gigafactory Berlin-Brandenburg, and Gigafactory Texas. On the other hand, GM has announced plans to convert Orion Assembly in Orion Township, Michigan to build electric pickups, with production starting in 2025. GM also has a facility in Canada (CAMI Assembly) and is producing battery cells through a joint venture in Warren, Ohio; Spring Hill, Tennessee; and Lansing, Michigan.

2. Production Methods: Tesla focuses on cost reduction efforts, cost innovation strategies, and localized procurement and manufacturing to make their vehicles affordable. 