In [2]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the demo cell further down reads its PDF from /content, not
# from the mounted drive — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install transformers accelerate bitsandbytes huggingface_hub datasets pymupdf

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2023.5.0 (from huggingface_hub)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hDown

In [4]:
import os

from huggingface_hub import login

# Never store an access token in the notebook itself. Read it from the
# HF_TOKEN environment variable; when the variable is unset, token=None makes
# login() ask for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))

In [5]:
from transformers import pipeline, BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import json
import time
import pymupdf

In [6]:
# Checkpoint used for both the tokenizer and the model.
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Batches are padded in generate_outputs; padding on the left keeps every
# prompt directly adjacent to the tokens generated after it.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Token ids that mark the end of a reply: the tokenizer's EOS token and the
# end-of-turn marker "<|eot_id|>".
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.eos_token_id, eot_token_id]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [15]:
def pdf_to_text(filepath):
    """Extract the plain text of every page of a PDF file.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        str | None: The text of all pages in document order, each page
        followed by a newline. ``None`` if the file could not be opened or
        read; the error is printed rather than raised.
    """
    try:
        # The context manager closes the document even when a page fails to
        # parse, so the file handle is never leaked.
        with pymupdf.open(filepath) as doc:
            return "".join(page.get_text() + "\n" for page in doc)
    except Exception as e:
        # Best-effort by design: report the problem and let the caller decide.
        print(f"Error processing PDF: {e}")
        return None

In [8]:
def text_to_json(text):
    """Parse a model reply into a metadata record (a ``dict``).

    The reply is expected to be a JSON object with the keys ``title``,
    ``authors`` and ``abstract``. Replies are repaired before parsing:

    * surrounding whitespace is ignored;
    * a reply without the opening brace is wrapped in ``{...}``;
    * a reply without the closing brace gets one appended;
    * if that still does not parse, the span from the first ``{`` to the last
      ``}`` is tried, which recovers objects wrapped in code fences or
      preceded/followed by commentary.

    Args:
        text: Raw text generated by the model.

    Returns:
        dict: The parsed JSON object, or a fresh empty record
        ``{"title": "", "authors": [], "abstract": ""}`` when nothing could be
        parsed (the offending text is printed).
    """
    def empty_record():
        # Built per call so callers never share the mutable "authors" list.
        return {"title": "", "authors": [], "abstract": ""}

    if not isinstance(text, str):
        print(f"INVALID JSON: {text}")
        return empty_record()

    stripped = text.strip()

    # Brace repair for replies that omit the opening and/or closing brace.
    repaired = stripped
    if not repaired.startswith("{"):
        repaired = "{" + repaired + "}"
    if not repaired.endswith("}"):
        repaired = repaired + "}"

    candidates = [repaired]

    # Fallback: the outermost {...} span of the reply as it was generated.
    first_brace = stripped.find("{")
    last_brace = stripped.rfind("}")
    if 0 <= first_brace < last_brace:
        candidates.append(stripped[first_brace:last_brace + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    print(f"INVALID JSON: {repaired}")
    return empty_record()

In [9]:
def set_prompts(texts):
    """Build one chat conversation per article text.

    Every conversation consists of the shared system message (the extraction
    instructions and the required JSON structure) followed by a user message
    that carries the article text.

    Args:
        texts: Iterable of article texts.

    Returns:
        list: One ``[system_message, user_message]`` pair per input text, in
        input order. The same system message object is reused in every pair.
    """
    instructions = (
        "You are an academic document extraction assistant. Your primary objective is to extract specific information from research articles or documents. Extract the following information from the given text:\n"
        "1. Title: Extract title exactly as it appears in the text, without any modifications or additions.\n"
        "2. Authors: Extract authours exactly as they appear in the text, with no changes to characters or formatting.\n"
        "3. Abstract: Extract all of the abstract part exactly as it appears in the text, without modifications, additions, or exclusions. Ensure the abstract text is extracted verbatim without modifications. \n"
        "Ensure no additional explanation or commentary is included in the response.\n"
        "Always respond in valid JSON format with the following structure:\n\n"
        "{\n"
        "\"title\": \"Extracted Title\",\n"
        "\"authors\": [\"Author 1\", \"Author 2\", \"Author 3\"],\n"
        "\"abstract\": \"Extracted abstract text.\"}"
    )
    system_message = {"role": "system", "content": instructions}

    conversations = []
    for article in texts:
        user_message = {"role": "user", "content": f"Article:\n{article}"}
        conversations.append([system_message, user_message])
    return conversations

In [10]:
def generate_outputs(prompts, batch_size=4):
    """Generate a reply for every conversation and parse it into a record.

    Uses the notebook-level ``tokenizer``, ``model`` and ``terminators``
    objects and the ``text_to_json`` helper defined in earlier cells.

    Args:
        prompts: List of chat conversations (lists of role/content dicts),
            e.g. the value returned by ``set_prompts``.
        batch_size: Number of conversations sent to the model per call of
            ``model.generate``.

    Returns:
        list: One parsed record per conversation, in input order, as returned
        by ``text_to_json``. An empty ``prompts`` list yields ``[]``.
    """
    results = []

    for batch_start in range(0, len(prompts), batch_size):
        start_time = time.time()

        mini_batch = prompts[batch_start:batch_start + batch_size]
        batch_prompts = tokenizer.apply_chat_template(
            mini_batch, add_generation_prompt=True, tokenize=False
        )

        # The rendered chat template already contains the special tokens, so
        # the tokenizer must not add them a second time.
        inputs = tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            add_special_tokens=False,
        )
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Every row of the padded batch has this length, so everything past
        # it in the generated sequences is newly generated text.
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1024,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=terminators,
                do_sample=True,
                temperature=0.3,
                top_p=0.8,
            )

        # Decode only the generated continuation. Slicing token ids is exact,
        # unlike cutting the decoded string by the length of the decoded prompt.
        generated_tokens = outputs[:, prompt_length:]
        decoded_outputs = tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )
        results.extend(text_to_json(reply) for reply in decoded_outputs)

        # Release cached GPU memory between batches.
        torch.cuda.empty_cache()

        end_time = time.time()
        print(f"Elapsed time: {end_time - start_time} seconds")

    return results

## Demo of the article metadata extractor

In [59]:
filepath = "/content/Dogan (2020) Impact of EKC in Europe.pdf"
article_raw = pdf_to_text(filepath)

# pdf_to_text returns None on failure; stop here instead of sending the
# literal text "None" to the model in the following cells.
assert article_raw is not None, f"Could not extract text from {filepath}"

In [60]:
# Display the extracted text to check the extraction by eye.
article_raw

"RESEARCH ARTICLE\nThe impact of economic structure to the environmental Kuznets\ncurve (EKC) hypothesis: evidence from European countries\nEyup Dogan1 & Roula Inglesi-Lotz2\nReceived: 27 November 2019 /Accepted: 27 January 2020\n# Springer-Verlag GmbH Germany, part of Springer Nature 2020\nAbstract\nThe purpose of this study is to examine the role of economic structure of European countries into testing the Environmental\nKuznets Curve (EKC) hypothesis for European countries for the period 1980 to 2014. This study is inspired by the work of Lin\net al. (J Clean Prod 133:712–724, 2016), which made the first effort to investigate the phenomenon looking only at African\ncountries. The main finding of the study is that the overall economic growth is the factor with which CO2 emissions exhibit an\ninverted U-shaped relationship in the studied country group. On the contrary, when using their industrial share as a proxy to\ncapture the countries’ economic structure, the EKC hypothesis is not

In [61]:
# set_prompts works on a list of article texts, so wrap the single demo
# article in a list.
articles = [article_raw]
prompts = set_prompts(articles)

In [62]:
# Run the model over the demo prompts with the default batch size; the
# elapsed time of each batch is printed.
results = generate_outputs(prompts)

Elapsed time: 10.34089469909668 seconds


In [63]:

# One row per parsed record; the columns come from the record keys.
# NOTE(review): the saved output of this cell shows three rows and an
# "Unnamed: 0" column, although the demo above passes a single article —
# presumably stale output from an earlier session; re-run to confirm.
df = pd.DataFrame(results)
df

Unnamed: 0,title,authors,abstract
0,Sustainable development of small- and medium-s...,"[Francesca Bassi, José G. Dias]",This paper focuses on the implementation of ci...
1,The Inﬂuence of Proactive Green Innovation and...,"[Yu-Shan Chen, Tai-Wei Chang, Chun-Yu Lin, Pi-...",This study ﬁlls the research gap in the explor...
2,The impact of economic structure to the enviro...,"[Eyup Dogan, Roula Inglesi-Lotz]",The purpose of this study is to examine the ro...


In [64]:
# Write the extracted metadata to output.csv in the working directory,
# without the DataFrame index column.
df.to_csv("output.csv", index=False)