### Data Preparation 

In [1]:
import os

def load_articles_from_folder(folder_path):
    """Load every ``.txt`` file located directly inside ``folder_path``.

    Entries are visited in sorted name order so that repeated runs produce
    the same article (and therefore chunk) ordering; ``os.listdir`` on its
    own returns entries in arbitrary order. Entries whose name ends in
    ``.txt`` but which are not regular files (e.g. a directory) are skipped
    rather than failing inside ``open``.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A ``(file_names, texts)`` tuple of two parallel lists, where
        ``texts[i]`` is the UTF-8 decoded content of ``file_names[i]``.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    file_names = []
    texts = []
    for file_name in sorted(os.listdir(folder_path)):
        full_path = os.path.join(folder_path, file_name)
        if not (file_name.endswith(".txt") and os.path.isfile(full_path)):
            continue
        with open(full_path, "r", encoding="utf-8") as file:
            texts.append(file.read())
        # Record the name only after a successful read so both lists stay aligned.
        file_names.append(file_name)
    return file_names, texts

# Folder that holds the source articles. The path is relative, so it is
# resolved against the notebook's current working directory.
folder_path = "articles/"  # Place your .txt files in this folder
# file_names and articles are parallel lists: articles[i] is the text of file_names[i].
file_names, articles = load_articles_from_folder(folder_path)

print(f" Loaded {len(articles)} articles.")


 Loaded 3 articles.


### Preprocess the Text

In [2]:
import re
import nltk

# Fetch the NLTK "punkt" resource; the recorded output shows it is a no-op
# when the package is already up to date.
# NOTE(review): no visible cell calls into NLTK - confirm a later cell needs this.
nltk.download('punkt')

def preprocess_text(text):
    """Clean and normalize the raw text of one article.

    Every character other than ASCII letters, digits, whitespace and the
    punctuation ``. , ! ? ; : '`` is removed, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is trimmed.

    Disallowed characters are removed *before* whitespace is normalized.
    Doing it in the opposite order leaves a double space wherever a removed
    character sat between two spaces, and a stray edge space wherever one
    sat at the start or end of the text.

    Args:
        text: Raw article text.

    Returns:
        The cleaned, single-spaced text.
    """
    # The whitelist is ASCII-only: accented letters, hyphens, dashes and
    # typographic quotes are dropped, not transliterated.
    text = re.sub(r'[^a-zA-Z0-9.,!?;:\'\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run each loaded article through the cleaning step, keeping the original order
cleaned_articles = list(map(preprocess_text, articles))

print(" Text preprocessing completed.")


 Text preprocessing completed.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\radhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Chunk the Text 
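Since the articles may be too long to retrieve from as a whole, we split them into overlapping chunks of at most 500 characters (with a 100-character overlap between neighbouring chunks) for better retrieval.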

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_text(texts, chunk_size=500, overlap=100):
    """Split every text into pieces and return them as one flat list.

    ``chunk_size`` and ``overlap`` are forwarded to
    ``RecursiveCharacterTextSplitter`` as ``chunk_size`` and
    ``chunk_overlap``. A single splitter is shared by all texts, and the
    resulting pieces are concatenated in input order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    return [piece for text in texts for piece in splitter.split_text(text)]

# Apply chunking with chunk_text's defaults (chunk_size=500, overlap=100);
# chunks from all articles end up in one flat list.
text_chunks = chunk_text(cleaned_articles)

print(f" Generated {len(text_chunks)} text chunks.")


 Generated 29 text chunks.


In [4]:
import pickle

# Save text chunks to a file in the current working directory.
# The list is serialized with pickle, so only load this file back from a
# trusted location: unpickling untrusted data can execute arbitrary code.
with open("preprocessed_text_chunks.pkl", "wb") as file:
    pickle.dump(text_chunks, file)
