In [None]:
!pip install --upgrade pip
!pip install haystack
!pip install logging
!pip install --upgrade pydantic
!pip install faiss-cpu
!pip install farm-haystack

!pip install farm-haystack[inference]
!pip install datasets
!pip install nltk==3.2.4

In [None]:
# Install farm-haystack with its `preprocessing` extra, together with NLTK.
!pip install farm-haystack[preprocessing] nltk
import nltk
# Download the NLTK resource named 'punkt'
# (presumably needed for sentence splitting during preprocessing — confirm).
nltk.download('punkt')

In [None]:
import os
import sys
from dataclasses import dataclass
import pickle

from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack.nodes import FARMReader
from haystack.nodes import BM25Retriever
from haystack.pipelines import ExtractiveQAPipeline
import logging

In [None]:
import nltk
# Download the NLTK resources named 'punkt' and 'punkt_tab'.
# NOTE(review): 'punkt' is also downloaded by an earlier cell, so this repeat looks redundant — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
import os

# Directory whose entries are listed below to check which datasets are available.
dataset_path = '/kaggle/input'
# Print the names of the entries directly inside dataset_path (not recursive).
print(os.listdir(dataset_path))

In [None]:
import os
import logging
import pickle
from dataclasses import dataclass

from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack.nodes import FARMReader, BM25Retriever
from haystack.pipelines import ExtractiveQAPipeline

# Configure the root logger to emit records at INFO level and above.
# NOTE(review): basicConfig does nothing when the root logger already has handlers
# (some hosted notebook environments pre-configure it) — confirm INFO messages show up.
logging.basicConfig(level=logging.INFO)

@dataclass
class ModelTrainerConfig:
    """Configuration consumed by ModelTrainer."""
    # Path of the file the pickled QA pipeline is written to.
    trained_pipe_file_path: str
import re
import string

def preprocess_text(text: str) -> str:
    """Normalise raw text before it is indexed.

    Any run of characters beginning at ``http`` or ``www`` and extending to the
    next whitespace is dropped, every ASCII punctuation character is deleted,
    the text is lower-cased, and whitespace runs collapse to single spaces.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    url_free = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # A translation table mapping a code point to None deletes that character.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    lowered = url_free.translate(drop_punctuation).lower()

    # split() without arguments also trims both ends, so the join is single-spaced.
    return ' '.join(lowered.split())

def load_and_preprocess_documents(data_path: str) -> list:
    """Load every ``.txt`` file directly inside ``data_path`` and preprocess it.

    Files are visited in sorted name order so the returned list is reproducible;
    ``os.listdir`` on its own returns entries in arbitrary order. Entries whose
    name ends in ``.txt`` but which are not regular files (e.g. a directory) are
    skipped rather than passed to ``open``.

    Args:
        data_path: Directory containing the text files. Sub-directories are not searched.

    Returns:
        A list of ``{'content': <preprocessed text>}`` dicts, one per file.
        The list is empty (and a warning is logged) when no ``.txt`` file is found.

    Raises:
        OSError: If ``data_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    files_to_index = [
        os.path.join(data_path, name)
        for name in sorted(os.listdir(data_path))
        if name.endswith('.txt') and os.path.isfile(os.path.join(data_path, name))
    ]
    if not files_to_index:
        # Surface the problem early: an empty result otherwise goes unnoticed.
        logging.warning("No .txt files found in %s", data_path)

    documents = []
    for file_path in files_to_index:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # Preprocess after the file handle has been released.
        documents.append({'content': preprocess_text(text)})

    return documents

class ModelTrainer:
    """Builds an extractive QA pipeline over a folder of text files and pickles it."""

    def __init__(self, trained_file_path: str):
        """Store the path the pipeline will be saved to.

        Args:
            trained_file_path: Destination file for the pickled pipeline.
        """
        self.model_trainer_config = ModelTrainerConfig(trained_pipe_file_path=trained_file_path)

    def initiate_model_trainer(self, data_path: str) -> str:
        """Index the documents under ``data_path``, build the QA pipeline and pickle it.

        The reader is loaded from 'deepset/roberta-base-squad2' on CPU and is not
        fine-tuned here; the method indexes documents into an in-memory BM25 store,
        wires reader and retriever into an ExtractiveQAPipeline, and saves that.

        Args:
            data_path: Directory containing the ``.txt`` files to index.

        Returns:
            The path the pickled pipeline was written to.

        Raises:
            ValueError: If no documents were found under ``data_path``.
            Exception: Any other failure is logged with its traceback and re-raised.
        """
        save_path = self.model_trainer_config.trained_pipe_file_path
        try:
            logging.info("Initializing InMemory Document Store")
            document_store = InMemoryDocumentStore(use_bm25=True)

            logging.info("Loading and Preprocessing Documents")
            documents = load_and_preprocess_documents(data_path)
            if not documents:
                # Fail fast instead of saving a pipeline that can never answer anything.
                raise ValueError(f"No documents found to index in {data_path!r}")

            logging.info("Indexing Documents")
            document_store.write_documents(documents)

            logging.info("Initializing Reader & Retriever")
            reader = FARMReader(model_name_or_path='deepset/roberta-base-squad2', use_gpu=False)
            retriever = BM25Retriever(document_store=document_store)

            logging.info("Initializing QA Pipeline")
            pipe = ExtractiveQAPipeline(reader, retriever)

            logging.info(f"Saving Model Pipeline to {save_path}")
            # Make sure the target directory exists; a bare filename has no parent to create.
            save_dir = os.path.dirname(save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            with open(save_path, 'wb') as f:
                pickle.dump(pipe, f)

            logging.info("Model Training Completed")
            return save_path

        except Exception as e:
            logging.exception(e)
            # Bare raise re-raises the active exception without adding a frame.
            raise

# Training pipeline configuration and orchestration
@dataclass
class TrainPipelineConfig:
    """Default file locations used by TrainPipeline."""

    # File the pickled pipeline is written to (relative to the working directory).
    model_save_path: str = 'trained_pipe.pkl'
    # Directory passed to the trainer as the location of the input data.
    clean_data_path: str = os.path.join('/kaggle/input', 'python-dataset')


class TrainPipeline:
    """Connects the default TrainPipelineConfig to a ModelTrainer and runs it."""

    def __init__(self):
        """Create the default configuration and a trainer bound to its save path."""
        config = TrainPipelineConfig()
        self.train_pipeline_config = config
        self.model_trainer = ModelTrainer(config.model_save_path)

    def train(self) -> str:
        """Run the trainer on the configured data directory.

        Returns:
            The path returned by the trainer.

        Raises:
            Exception: Any error raised by the trainer is logged and re-raised.
        """
        try:
            logging.info("Starting Model Training")
            data_dir = self.train_pipeline_config.clean_data_path
            saved_to = self.model_trainer.initiate_model_trainer(data_dir)
            logging.info("Training Completed Successfully")
        except Exception as err:
            logging.exception(err)
            raise err
        else:
            return saved_to

# Run training directly at cell level; the script-style guard below is left disabled.
# if __name__ == "__main__":
train_pipeline = TrainPipeline()
logging.info("Training model...")
# train() returns the path the pipeline was saved to.
trained_model_path = train_pipeline.train()
logging.info(f"Model saved at {trained_model_path}")

In [None]:
import pickle
import logging
from haystack.pipelines import ExtractiveQAPipeline

# Load the pickled QA pipeline from disk.
# NOTE(review): the training cell saves to the relative path 'trained_pipe.pkl', so this
# absolute path only matches when the working directory is /kaggle/working — confirm.
trained_pipe_path = "/kaggle/working/trained_pipe.pkl"

logging.info(f"Loading trained model from {trained_pipe_path}")
# pickle.load can execute arbitrary code: only load files this notebook produced itself.
with open(trained_pipe_path, "rb") as f:
    qa_pipeline = pickle.load(f)

logging.info("Model loaded successfully")

In [None]:
# Sanity check: this question is unrelated to the indexed dataset.
query = "Who is Goku?"
run_params = {"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
prediction = qa_pipeline.run(query=query, params=run_params)

# Show every returned answer together with its score
for candidate in prediction["answers"]:
    print(f"Answer: {candidate.answer}, Score: {candidate.score}")

In [None]:
# query = "numpy?"
# prediction = qa_pipeline.run(query=query, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})

# # Print Answers
# for answer in prediction["answers"]:
#     print(f"Answer: {answer.answer}, Score: {answer.score}")

In [None]:
import torch
# Ask PyTorch to release the GPU memory its caching allocator is holding.
# NOTE(review): the reader is created with use_gpu=False, so this may have no effect — confirm.
torch.cuda.empty_cache()


In [None]:
query = "introduced the sets module?"
run_params = {"Retriever": {"top_k": 5}, "Reader": {"top_k": 3}}
prediction = qa_pipeline.run(query=query, params=run_params)

# Show every returned answer together with its score
for candidate in prediction["answers"]:
    print(f"Answer: {candidate.answer}, Score: {candidate.score}")

In [None]:
# The retriever is BM25 (lexical matching), so a misspelled query term cannot match
# the correctly spelled word in the documents; the spelling here is fixed on purpose.
query = "introduction of numpy?"
prediction = qa_pipeline.run(query=query, params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 3}})

# Print each returned answer together with its score
for answer in prediction["answers"]:
    print(f"Answer: {answer.answer}, Score: {answer.score}")

In [None]:
query = "what is python ?"
run_params = {"Retriever": {"top_k": 5}, "Reader": {"top_k": 3}}
prediction = qa_pipeline.run(query=query, params=run_params)

# Show every returned answer together with its score
for candidate in prediction["answers"]:
    print(f"Answer: {candidate.answer}, Score: {candidate.score}")