In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers accelerate bitsandbytes huggingface_hub datasets pymupdf

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook: it ends up in saved copies and
# version control. Read it from the HF_TOKEN environment variable instead; when
# the variable is unset, token=None makes login() ask for the token
# interactively.
login(token=os.environ.get("HF_TOKEN"))

In [5]:
from transformers import pipeline, BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import json
import time
import pymupdf
import os

In [None]:
# Hugging Face Hub id of the checkpoint used for both the tokenizer and the model.
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Pad on the left: generate_outputs tokenizes whole batches with padding=True,
# and left padding keeps every prompt's final token at the end of its row.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side = "left")
# Reuse the end-of-sequence id as the padding id.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Request 8-bit weight loading through bitsandbytes.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
# device_map="auto" leaves device placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config,
    device_map="auto"
 )
# Token ids collected as stop markers: the tokenizer's EOS id plus the id of the
# "<|eot_id|>" token. Presumably meant for generate(eos_token_id=...) - confirm.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

In [None]:
def pdf_to_text(filepath):
    """Extract the plain text of every page of a PDF.

    Pages whose extraction raises are skipped (with a printed notice) so one
    bad page does not discard the rest of the document.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The concatenated page texts, each followed by a newline, or ``None``
        when the file cannot be opened/read or contains no non-whitespace text.
    """
    page_texts = []
    try:
        # The context manager closes the document even if iteration fails;
        # the previous version never released the file handle.
        with pymupdf.open(filepath) as doc:
            for page in doc:
                try:
                    page_texts.append(page.get_text("text") + "\n")
                except Exception as e:
                    print(f"Skipping problematic page in {filepath}: {e}")
                    continue  # Skip the problematic page and continue processing

    # The exception class lives directly on the pymupdf module. Looking it up
    # through a non-existent `pymupdf.fitz` attribute would itself raise while
    # Python evaluates the except clause, turning any open failure into a crash.
    except pymupdf.FileDataError:
        print(f"Error opening PDF: {filepath} - Possibly corrupted file.")
        return None
    except PermissionError:
        print(f"Permission error or file in use: {filepath}")
        return None
    except Exception as e:
        print(f"Unexpected error in {filepath}: {e}")
        return None

    text = "".join(page_texts)
    return text if text.strip() else None

In [None]:
def process_pdfs(directory):
    """Extract text from every PDF file directly inside ``directory``.

    Files are visited in sorted filename order so the result (and anything
    derived from it, such as CSV row order) is reproducible across runs;
    ``os.listdir`` alone returns entries in arbitrary order. The extension
    check is case-insensitive so ``.PDF`` files are not silently skipped, and
    non-file entries (e.g. a folder named ``x.pdf``) are ignored.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        A list with the extracted text of each PDF for which ``pdf_to_text``
        returned a non-empty result.
    """
    texts = []

    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(".pdf"):
            continue

        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue

        print(f"Processing: {filepath}")

        # Extract text from PDF; unreadable or empty documents yield None
        text = pdf_to_text(filepath)
        if text:
            texts.append(text)

    return texts

In [8]:
def text_to_json(text):
    """Parse a model reply into a metadata dict, tolerating common formatting slips.

    The reply is tried, in order, as:
      1. the whitespace-stripped text itself;
      2. the span from the first ``{`` to the last ``}`` (handles prose or
         code fences around the object);
      3. the text with a missing opening and/or closing brace added.

    Args:
        text: Raw text generated by the model.

    Returns:
        The first candidate that parses to a JSON object (``dict``). If none
        does, or ``text`` is not a string, an "INVALID JSON" notice is printed
        and a placeholder ``{"title": "", "authors": [], "abstract": ""}`` is
        returned.
    """
    fallback = {"title": "", "authors": [], "abstract": ""}

    if not isinstance(text, str):
        print(f"INVALID JSON: {text}")
        return fallback

    # Leading/trailing whitespace is routine in generated text and used to
    # defeat the brace checks below.
    stripped = text.strip()
    candidates = [stripped]

    first_brace = stripped.find("{")
    last_brace = stripped.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        candidates.append(stripped[first_brace:last_brace + 1])

    # Add each brace independently; adding both whenever the opening one was
    # missing produced "}}" for replies that already ended with "}".
    wrapped = stripped
    if not wrapped.startswith("{"):
        wrapped = "{" + wrapped
    if not wrapped.endswith("}"):
        wrapped = wrapped + "}"
    candidates.append(wrapped)

    tried = set()
    for candidate in candidates:
        if candidate in tried:
            continue
        tried.add(candidate)
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    print(f"INVALID JSON: {text}")
    return fallback

In [9]:
def set_prompts(texts):
    """Build one chat conversation per article text.

    Each conversation is a two-message list: the shared system message holding
    the extraction instructions, followed by a user message carrying the
    article text prefixed with "Article:".

    Args:
        texts: Iterable of article texts.

    Returns:
        A list of conversations, in the same order as ``texts``. All
        conversations reference the same system-message dict.
    """
    instructions = "You are an academic document extraction assistant. Your primary objective is to extract specific information from research articles or documents. Extract the following information from the given text:\n1. Title: Extract title exactly as it appears in the text, without any modifications or additions.\n2. Authors: Extract authors exactly as they appear in the text, with no changes to characters or formatting.\n3. Abstract: Extract all of the abstract part exactly as it appears in the text, without modifications, additions, or exclusions. Ensure the abstract text is extracted verbatim without modifications. \nEnsure no additional explanation or commentary is included in the response.\nAlways respond in valid JSON format with the following structure:\n\n{\n\"title\": \"Extracted Title\",\n\"authors\": [\"Author 1\", \"Author 2\", \"Author 3\"],\n\"abstract\": \"Extracted abstract text.\"}"
    system_message = {"role": "system", "content": instructions}

    conversations = []
    for article in texts:
        user_message = {"role": "user", "content": f"Article:\n{article}"}
        conversations.append([system_message, user_message])

    return conversations

In [None]:
def generate_outputs(prompts, batch_size=2):
    """Run the chat model over ``prompts`` in mini-batches and parse each reply.

    Relies on the notebook-level ``tokenizer``, ``model``, ``terminators`` and
    ``text_to_json``.

    Args:
        prompts: List of conversations (lists of role/content message dicts),
            as produced by ``set_prompts``.
        batch_size: Number of conversations generated per forward pass.

    Returns:
        A list with one parsed dict per prompt, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    results = []

    for batch_start in range(0, len(prompts), batch_size):
        start_time = time.time()

        # Create minibatch
        mini_batch = prompts[batch_start:batch_start + batch_size]
        batch_prompts = tokenizer.apply_chat_template(
            mini_batch, add_generation_prompt=True, tokenize=False
        )

        # Tokenize inputs. The rendered template already contains the special
        # tokens, so the tokenizer must not add a second set (duplicate BOS).
        # NOTE(review): truncation=True with no max_length presumably clips at
        # the tokenizer's model max length, which would also drop the trailing
        # generation prompt of an over-long article - confirm and cap inputs.
        inputs = tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            add_special_tokens=False,
        )
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        # Every row is padded to this length, so it marks where new tokens begin.
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1024,
                pad_token_id=tokenizer.eos_token_id,
                # Stop on any of the notebook's terminator ids, not only EOS.
                eos_token_id=terminators,
                do_sample=True,
                temperature=0.3,
                top_p=0.8,
            )

        # Decode only the newly generated tokens. Slicing by token position is
        # exact, unlike cutting the decoded string by the decoded prompt length.
        completions = tokenizer.batch_decode(
            outputs[:, prompt_length:], skip_special_tokens=True
        )

        # Parse and validate outputs
        results.extend(text_to_json(completion) for completion in completions)

        # Drop the tensor references first so the cached GPU memory can
        # actually be released.
        del inputs, outputs
        torch.cuda.empty_cache()

        end_time = time.time()
        print(f"Elapsed time: {end_time - start_time} seconds")

    return results

## Demo of the article metadata extractor

In [None]:
directory = "/content/"
output_csv = "/content/makale.csv"


def _authors_to_text(authors):
    """Render the model's ``authors`` value as one comma-separated string.

    The model is asked for a list of names but may return a bare string,
    ``None`` or a list with non-string items; joining those directly would
    split a string into characters or raise ``TypeError``.
    """
    if authors is None:
        return ""
    if isinstance(authors, str):
        return authors
    if isinstance(authors, (list, tuple)):
        return ", ".join(str(author) for author in authors if author is not None)
    return str(authors)


# Extract metadata from PDFs
texts = process_pdfs(directory)
prompts = set_prompts(texts)
outputs = generate_outputs(prompts)

# Save metadata to CSV
data = [
    {
        "title": metadata.get("title", ""),
        "authors": _authors_to_text(metadata.get("authors", [])),
        "abstract": metadata.get("abstract", ""),
    }
    for metadata in outputs
]
# Fix the column set so the CSV still gets a header row when nothing was extracted.
df = pd.DataFrame(data, columns=["title", "authors", "abstract"])

df.to_csv(output_csv, index=False, encoding="utf-8")
print(f"Metadata saved to {output_csv}")