### Generate Article Using Groq

In [None]:
import os
import pandas as pd
import time
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from dotenv import load_dotenv

In [None]:
class ArticleGenerator:
    """Generate an article for a title, using similar stored articles as style reference.

    Articles are read from a CSV file, split into chunks, embedded with a
    HuggingFace sentence-transformer model and indexed in a FAISS vector store.
    At generation time the chunks most similar to the requested title are
    retrieved and passed, together with the title, to a Groq-hosted chat model.
    """

    def __init__(self, groq_api_key=None, model_name="Llama3-8b-8192", vector_store_path="vector_db"):
        """Set up the LLM client, the embedding model and the prompt template.

        Args:
            groq_api_key: Groq API key. When omitted, the ``GROQ_API_KEY``
                environment variable is used (a ``.env`` file is loaded first).
            model_name: Name of the Groq chat model to use.
            vector_store_path: Directory the FAISS index is saved to / loaded from.

        Raises:
            ValueError: If no API key is given and none is found in the environment.
        """
        load_dotenv()  # Load environment variables from .env file
        self.groq_api_key = groq_api_key or os.getenv('GROQ_API_KEY')
        if not self.groq_api_key:
            raise ValueError("Groq API key not provided or found.")

        self.llm = ChatGroq(
            groq_api_key=self.groq_api_key,
            model_name=model_name
        )
        # Populated by setup_vector_database() or load_vector_database().
        self.vector_store = None
        self.vector_store_path = vector_store_path
        # Embedding model used both to build the index and to embed queries.
        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

        # {context} receives the retrieved chunks, {input} the requested title.
        self.article_prompt = ChatPromptTemplate.from_template("""
        You are an expert article writer. Generate a well-structured article based on the provided title.
        Use the following similar articles as reference to match the style, tone, and format, but create original content.

        REFERENCE ARTICLES:
        {context}

        TITLE TO GENERATE ARTICLE FOR:
        {input}

        Write a comprehensive, engaging, and well-structured article for the given title. 
        The article should be factually accurate, well-researched, and follow a logical flow.
        Include an introduction, several body paragraphs with relevant subheadings, and a conclusion.
        Aim for approximately 800-1000 words.
        Do not mention that you're using reference articles - write as if you are the original author.
        """)

    def load_article_data(self, file_path):
        """Read the article CSV and check that it has the required columns.

        Args:
            file_path: Path of a CSV file with ``clean_title`` and ``clean_text`` columns.

        Returns:
            The loaded DataFrame, or ``None`` if the file could not be read or a
            required column is missing (the reason is printed).
        """
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            # Deliberate best-effort boundary: callers test for None instead of
            # handling the many error types read_csv can raise.
            print(f"Data loading failed: {e}")
            return None

        # Validate before announcing success so an unusable file is never
        # reported as "loaded".
        if 'clean_title' not in df.columns or 'clean_text' not in df.columns:
            print("Data loading failed: CSV must contain 'clean_title' and 'clean_text' columns")
            return None

        print(f"Loaded {len(df)} articles")
        return df

    def create_vector_embeddings(self, df, sample_size=None):
        """Build a FAISS vector store from the articles in ``df``.

        Rows with a missing title or text are skipped. Each remaining article
        becomes one document ("Title: <title>", blank line, text) that is split
        into overlapping chunks before embedding.

        Args:
            df: DataFrame with ``clean_title`` and ``clean_text`` columns.
            sample_size: If set and smaller than the number of usable articles,
                index only a random sample of this size (fixed seed).

        Returns:
            The FAISS vector store built from the document chunks.

        Raises:
            ValueError: If no article is left to index.
        """
        # Empty CSV fields are read back as NaN and would otherwise be embedded
        # as the literal text "nan".
        usable = df.dropna(subset=['clean_title', 'clean_text'])
        skipped = len(df) - len(usable)
        if skipped:
            print(f"Skipped {skipped} articles with missing title or text")

        if sample_size is not None and len(usable) > sample_size:
            df_sample = usable.sample(sample_size, random_state=42)
            print(f"Using sample of {sample_size} articles for vector DB")
        else:
            df_sample = usable
            print(f"Using all {len(df_sample)} articles for vector DB")

        # Fail with a clear message instead of handing FAISS an empty document list.
        if df_sample.empty:
            raise ValueError("No articles with both a title and text are available to index.")

        documents = [
            Document(page_content=f"Title: {title}\n\n{text}", metadata={"title": title})
            for title, text in zip(df_sample['clean_title'], df_sample['clean_text'])
        ]
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = splitter.split_documents(documents)
        return FAISS.from_documents(chunks, self.embeddings)

    def setup_vector_database(self, file_path, sample_size=None):
        """Load the CSV, build the vector store and save it to ``vector_store_path``.

        Args:
            file_path: Path of the article CSV file.
            sample_size: Optional cap on the number of articles indexed.

        Raises:
            ValueError: If the article data could not be loaded.
        """
        df = self.load_article_data(file_path)
        if df is None:
            raise ValueError("Article data loading failed.")
        print("Creating FAISS vector DB...")
        self.vector_store = self.create_vector_embeddings(df, sample_size)
        self.vector_store.save_local(self.vector_store_path)
        print("Vector DB saved at:", self.vector_store_path)

    def load_vector_database(self):
        """Load the previously saved FAISS vector store from ``vector_store_path``.

        Raises:
            FileNotFoundError: If ``vector_store_path`` does not exist.
        """
        if not os.path.exists(self.vector_store_path):
            raise FileNotFoundError("Vector store not found. Run setup first.")
        try:
            # NOTE: the store is pickle-backed; recent langchain_community
            # releases refuse to load it without this explicit opt-in. Only
            # point vector_store_path at an index you created yourself.
            self.vector_store = FAISS.load_local(
                self.vector_store_path,
                self.embeddings,
                allow_dangerous_deserialization=True,
            )
        except TypeError:
            # Older releases do not know the keyword and load without it.
            self.vector_store = FAISS.load_local(self.vector_store_path, self.embeddings)
        print("Vector DB loaded from disk.")

    def generate_article(self, title, num_similar_articles=3):
        """Generate an article for ``title`` using retrieved reference chunks.

        Args:
            title: Title of the article to write.
            num_similar_articles: Number of entries (``k``) the retriever
                fetches from the vector store as reference material.

        Returns:
            A dict with the keys ``"title"``, ``"article"`` (the generated
            text) and ``"generation_time_seconds"`` (rounded to two decimals).

        Raises:
            ValueError: If no vector store has been built or loaded yet.
        """
        if self.vector_store is None:
            raise ValueError("Vector store not loaded. Call load_vector_database() first.")

        # perf_counter is monotonic, so the duration is immune to clock changes.
        start = time.perf_counter()

        retriever = self.vector_store.as_retriever(search_kwargs={"k": num_similar_articles})
        document_chain = create_stuff_documents_chain(self.llm, self.article_prompt)
        chain = create_retrieval_chain(retriever, document_chain)
        response = chain.invoke({"input": title})
        duration = time.perf_counter() - start

        return {
            "title": title,
            "article": response["answer"],
            "generation_time_seconds": round(duration, 2)
        }

In [None]:
# Create the generator, then build the FAISS index from the CSV (capped at
# 1000 articles) and save it to the generator's vector store path.
generator = ArticleGenerator()
generator.setup_vector_database(
    "final_nlp_data.csv",
    sample_size=1000,
)