In [1]:
import torch

# Check if a GPU is available
if torch.cuda.is_available():
    print("GPU is available.")
else:
    print("GPU is not available.")
print(torch.cuda.is_available())  # Should return True
print(torch.version.cuda)  # To check if CUDA is available in PyTorch
print(torch.cuda.device_count())  # Should show the number of available GPUs


GPU is available.
True
12.4
1


In [2]:
import os
from PyPDF2 import PdfReader
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BartForConditionalGeneration, BartTokenizer, AdamW
from tqdm import tqdm
import spacy

# Set up spaCy for text preprocessing
nlp = spacy.load("en_core_web_sm")

# Load pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    extracted_text = ""
    for page in reader.pages:
        extracted_text += page.extract_text()
    return extracted_text

# Function to preprocess text (remove stop words, lowercase)
def preprocess_text(text):
    doc = nlp(text.lower())
    return ' '.join([token.text for token in doc if not token.is_stop])

# Custom Dataset class for PDFs
class PDFDataset(Dataset):
    def __init__(self, pdf_folder):
        self.pdf_files = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
        self.data = []
        
        for file_path in tqdm(self.pdf_files, desc="Processing PDFs"):
            text = extract_text_from_pdf(file_path)
            if not text.strip():
                print(f"Warning: {file_path} is empty or contains no extractable text.")
                continue
            processed_text = preprocess_text(text)
            summary = ' '.join(processed_text.split()[:50])
            self.data.append((processed_text, summary))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text, summary = self.data[idx]
        inputs = tokenizer(text, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
        targets = tokenizer(summary, max_length=150, truncation=True, padding="max_length", return_tensors="pt")
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}  # Remove batch dimension
        targets = targets['input_ids'].squeeze(0)
        return inputs, targets

# Path to folder containing PDFs
folder_path = r"C:\Users\pv437\Desktop\Data Scince Folder\Pankaj Assignments\Wasserstoff\Deployment\Flask\Downloaded_pdfs"

# Load dataset and create DataLoader
dataset = PDFDataset(folder_path)
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Training settings
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
model.train()

# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    for batch_idx, (inputs, labels) in enumerate(train_loader):
        # Move inputs and labels to device (GPU/CPU)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        if batch_idx % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}")
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}] completed. Average Loss: {avg_loss:.4f}")

# Save model using PyTorch
torch.save(model.state_dict(), "./fine_tuned_summary_model.pth")

# Save tokenizer using Huggingface's native method
tokenizer.save_pretrained("./fine_tuned_summary_tokenizer")


Processing PDFs: 100%|█████████████████████████████████████████████████████████████████| 18/18 [01:49<00:00,  6.08s/it]
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch [1/3], Step [0/9], Loss: 7.1358
Epoch [1/3] completed. Average Loss: 3.4120
Epoch [2/3], Step [0/9], Loss: 0.6857
Epoch [2/3] completed. Average Loss: 0.7172
Epoch [3/3], Step [0/9], Loss: 0.3059
Epoch [3/3] completed. Average Loss: 0.2640


('./fine_tuned_summary_tokenizer\\tokenizer_config.json',
 './fine_tuned_summary_tokenizer\\special_tokens_map.json',
 './fine_tuned_summary_tokenizer\\vocab.json',
 './fine_tuned_summary_tokenizer\\merges.txt',
 './fine_tuned_summary_tokenizer\\added_tokens.json')

In [None]:
import os
import re
import torch
import pymongo
from transformers import BartForConditionalGeneration, BartTokenizer
import pdfplumber
from pymongo import MongoClient

# MongoDB connection
client = MongoClient("mongodb://localhost:27017/")
db = client['pdf_summaries']  # Database name
collection = db['summaries']  # Collection name

# Check if a GPU is available and use it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned model and tokenizer
model_path = './fine_tuned_summary_model'
tokenizer_path = './fine_tuned_summary_tokenizer'

model = BartForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = BartTokenizer.from_pretrained(tokenizer_path)

# Function to extract text from a PDF file and get metadata
def extract_text_and_metadata_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text() or ''  # Handle None return value
        # Get PDF metadata: file size and number of pages
        pdf_info = {
            'name': os.path.basename(pdf_path),
            'size': os.path.getsize(pdf_path),  # File size in bytes
            'pages': len(pdf.pages)
        }
    return text, pdf_info

# Function to detect and remove structured text like logs, dates, and numbers
def filter_structured_text(text):
    # Regular expression to match patterns like dates, times, transaction IDs, etc.
    pattern = r'(\d{1,2}-\d{1,2}-\d{2,4}|\d{1,2}:\d{2}:\d{2}|\d{1,2}/\d{1,2}/\d{2,4}|\d{3,}|[Bb]uy|[Ss]ell|price|log|rate|order id|qty|trd id)'
    filtered_text = re.sub(pattern, '', text)  # Remove matches of the pattern
    return filtered_text.strip()  # Strip leading/trailing whitespace

# Function to chunk large text into smaller pieces
def chunk_text(text, max_tokens=1024):
    tokens = text.split()
    return [' '.join(tokens[i:i + max_tokens]) for i in range(0, len(tokens), max_tokens)]

# Function to tokenize text for model input
def tokenize_text(text, tokenizer):
    return tokenizer(text, max_length=1024, return_tensors='pt', truncation=True).to(device)

# Function to generate a summary for each chunk
def generate_summary(inputs, model, tokenizer):
    summary_ids = model.generate(
        inputs['input_ids'], 
        max_length=150, 
        min_length=30, 
        length_penalty=1.5, 
        num_beams=5, 
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Summarize a single PDF and store summary in MongoDB
def summarize_and_store_pdf(pdf_path, model, tokenizer):
    text, pdf_info = extract_text_and_metadata_from_pdf(pdf_path)
    
    # Check if there's enough text to summarize
    if not text.strip():
        return "No text extracted for summary."
    
    # Apply filtering only for pdf16 and pdf17
    pdf_name = pdf_info['name'].lower()
    if "pdf16" in pdf_name or "pdf17" in pdf_name:
        print(f"Applying structured text filter for {pdf_name}...")
        text = filter_structured_text(text)
    
    # Split the text into chunks if it's too large
    chunks = chunk_text(text, max_tokens=500)

    summaries = []
    for chunk in chunks:
        inputs = tokenize_text(chunk, tokenizer)
        summary = generate_summary(inputs, model, tokenizer)
        summaries.append(summary)
    
    # Combine all chunk summaries
    final_summary = ' '.join(summaries)
    
    # Add the summary and PDF metadata to MongoDB
    pdf_summary_data = {
        'pdf_name': pdf_info['name'],
        'pdf_size_bytes': pdf_info['size'],
        'pdf_pages': pdf_info['pages'],
        'summary': final_summary
    }
    collection.insert_one(pdf_summary_data)
    
    print(f"Summary for {pdf_info['name']} has been stored in MongoDB.")

# Summarize all PDFs in a folder and store them in MongoDB
def summarize_all_pdfs_in_folder_and_store(folder_path, model, tokenizer):
    pdf_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.pdf')]
    
    for pdf_file in pdf_files:
        try:
            print(f"Processing {pdf_file}...")
            summarize_and_store_pdf(pdf_file, model, tokenizer)
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")

# Folder containing the PDFs
folder_path = r"C:\Users\pv437\Desktop\Data Scince Folder\Pankaj Assignments\Wasserstoff\Deployment\Flask\Downloaded_pdfs"

# Summarize all PDFs in the folder and store the results in MongoDB
summarize_all_pdfs_in_folder_and_store(folder_path, model, tokenizer)




Processing C:\Users\pv437\Desktop\Data Scince Folder\Pankaj Assignments\Wasserstoff\Deployment\Flask\Downloaded_pdfs\pdf1.pdf...




Summary for pdf1.pdf has been stored in MongoDB.
Processing C:\Users\pv437\Desktop\Data Scince Folder\Pankaj Assignments\Wasserstoff\Deployment\Flask\Downloaded_pdfs\pdf10.pdf...
Summary for pdf10.pdf has been stored in MongoDB.
Processing C:\Users\pv437\Desktop\Data Scince Folder\Pankaj Assignments\Wasserstoff\Deployment\Flask\Downloaded_pdfs\pdf11.pdf...
Summary for pdf11.pdf has been stored in MongoDB.
Processing C:\Users\pv437\Desktop\Data Scince Folder\Pankaj Assignments\Wasserstoff\Deployment\Flask\Downloaded_pdfs\pdf12.pdf...
Summary for pdf12.pdf has been stored in MongoDB.
Processing C:\Users\pv437\Desktop\Data Scince Folder\Pankaj Assignments\Wasserstoff\Deployment\Flask\Downloaded_pdfs\pdf13.pdf...
Summary for pdf13.pdf has been stored in MongoDB.
Processing C:\Users\pv437\Desktop\Data Scince Folder\Pankaj Assignments\Wasserstoff\Deployment\Flask\Downloaded_pdfs\pdf14.pdf...
Summary for pdf14.pdf has been stored in MongoDB.
Processing C:\Users\pv437\Desktop\Data Scince Folde

In [None]:
import pymongo
from pymongo import MongoClient  # Import MongoClient

# MongoDB connection
client = MongoClient("mongodb://localhost:27017/")
db = client['pdf_summaries']  # Database name
collection = db['summaries']  # Collection name


# Function to get PDF summary by name
def get_pdf_summary(pdf_name):
    # Query the database for the specified PDF name
    pdf_record = collection.find_one({'pdf_name': pdf_name})
    
    if pdf_record:
        print(f"Summary for {pdf_record['pdf_name']}:")
        print(f"File Size: {pdf_record['pdf_size_bytes']} bytes")
        print(f"Number of Pages: {pdf_record['pdf_pages']}")
        print(f"Summary: {pdf_record['summary']}")
    else:
        print(f"No summary found for the PDF named '{pdf_name}'.")

# Get user input for the PDF name
user_input_pdf_name = input("Enter the PDF name (including .pdf extension): ")
get_pdf_summary(user_input_pdf_name)
