In [None]:
# Install dependencies
%pip install huggingface_hub PyPDF2 transformers nltk fpdf google

In [2]:
# Get your huggingface token and input it here!
# https://huggingface.co/settings/tokens
import os
os.environ['HF_TOKEN'] = 'HUGGINGFACE_TOKEN'

In [3]:
# Read file
import PyPDF2

with open('ai_future_jobs_report.pdf', 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()

In [4]:
# Check number of Tokens
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn") 
tokens = tokenizer.tokenize(text)

print("Token count (tokenize):", len(tokens))

  from .autonotebook import tqdm as notebook_tqdm


Token count (tokenize): 206779


In [5]:
# Chunk it
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(text)
max_words = 510
max_tokens = 1020

chunks = []
current_chunk = []
current_length = 0

for sentence in sentences:
    tokenized = tokenizer.tokenize(sentence)
    sentence_length = len(tokenized)
    
    if sentence_length > max_tokens:
        # Flush current_chunk
        if current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        # Now split the long sentence
        for i in range(0, sentence_length, max_tokens):
            sub_tokens = tokenized[i:i+max_tokens]
            sub_text = tokenizer.decode(tokenizer.convert_tokens_to_ids(sub_tokens), skip_special_tokens=True)
            chunks.append(sub_text)

    elif current_length + sentence_length <= max_tokens:
        current_chunk.append(sentence)
        current_length += sentence_length
    else:
        chunks.append(" ".join(current_chunk))
        current_chunk = [sentence]
        current_length = sentence_length

if current_chunk:
    chunks.append(" ".join(current_chunk))

print("Chunk count", len(chunks))

max_tokens = 0
for chunk in chunks:
    if max_tokens < len(tokenizer.tokenize(chunk)):
        max_tokens = len(tokenizer.tokenize(chunk))
print("Max token count in a chunk:", max_tokens)

for i, chunk in enumerate(chunks):
    token_count = len(tokenizer.tokenize(chunk))
    print(f"Chunk {i+1}: {token_count} tokens")


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Chunk count 226
Max token count in a chunk: 1020
Chunk 1: 989 tokens
Chunk 2: 1004 tokens
Chunk 3: 1016 tokens
Chunk 4: 135 tokens
Chunk 5: 1020 tokens
Chunk 6: 491 tokens
Chunk 7: 1009 tokens
Chunk 8: 999 tokens
Chunk 9: 997 tokens
Chunk 10: 1000 tokens
Chunk 11: 983 tokens
Chunk 12: 997 tokens
Chunk 13: 987 tokens
Chunk 14: 1001 tokens
Chunk 15: 854 tokens
Chunk 16: 995 tokens
Chunk 17: 870 tokens
Chunk 18: 1013 tokens
Chunk 19: 966 tokens
Chunk 20: 1017 tokens
Chunk 21: 675 tokens
Chunk 22: 1013 tokens
Chunk 23: 997 tokens
Chunk 24: 995 tokens
Chunk 25: 1000 tokens
Chunk 26: 998 tokens
Chunk 27: 987 tokens
Chunk 28: 755 tokens
Chunk 29: 806 tokens
Chunk 30: 1003 tokens
Chunk 31: 831 tokens
Chunk 32: 810 tokens
Chunk 33: 823 tokens
Chunk 34: 625 tokens
Chunk 35: 843 tokens
Chunk 36: 677 tokens
Chunk 37: 979 tokens
Chunk 38: 1001 tokens
Chunk 39: 1016 tokens
Chunk 40: 1012 tokens
Chunk 41: 988 tokens
Chunk 42: 1015 tokens
Chunk 43: 883 tokens
Chunk 44: 977 tokens
Chunk 45: 1011 tokens

In [6]:
# Summarize the PDF
# Use BART
import torch
from transformers import pipeline
# torch.mps.empty_cache() # for Apple Mx Silicon
torch.cuda.empty_cache() # for NVIDIA GPUs, non-CUDA systems

summarizer = pipeline("summarization", model="facebook/bart-large-cnn", framework="pt")

summary_output = summarizer(chunks[0], max_length=300, min_length=100, do_sample=False)
summary_text = summary_output[0]['summary_text']

def extract_summary(x):
    result = summarizer(x, max_length=300, min_length=200, do_sample=False)
    if result[0]['summary_text']:
        return result[0]['summary_text']
    else:
        " "

chunks_summaries = list(map(extract_summary, chunks[0:5]))

print("Przyklad streszczenia \n", chunks_summaries[0])

print("Reszta streszczenia \n")
print(*chunks_summaries, sep = " ")


Device set to use cpu
Your max_length is set to 300, but your input_length is only 137. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=68)


Przyklad streszczenia 
 The 2025 Index is our most comprehensive to date and arrives at an important moment. New in this year’s report are in-depth analyses of the evolving landscape of AI hardware and novel estimates of inference costs. We also introduce fresh data on corporate adoption of responsible AI practices. The Index continues to lead in tracking and interpreting the most critical trends shaping the field. It has been cited in major media outlets such as The New York Times, Bloomberg, and The Guardian. It is referenced in hundreds of academic papers; and used by policymakers and government agencies around the world. We continue to serve as an independent source of insights for the global AI ecosystem. Explore the report and see for yourself this year's edition of the AI Index report. For more information on the report, visit the report's website or read it in its entirety here:  AI Index Report 2025Artificial Intelligence Index Report 20251, 20253, 2026, 2027, 2028, 2029, 2030

In [8]:
# Use Gemini if you have API_KEY
from google import genai
from time import sleep

prompt = "Please translate the following text to German. Return only the translated text without any explanations or additional comments: "

translations = []
client = genai.Client(api_key="API_KEY")
for i, chunk_summary in enumerate(chunks_summaries):
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt + chunk_summary,
    )
    translations.append(response.text)
    sleep(3)

print(F"Translation\n: {translations[0]}")


ModuleNotFoundError: No module named 'google'

In [None]:
# Use local LMStudio model
# Run LMStudio model
# TODO use LMStudio SDK
import requests

def translate_text(text):
    url = "http://localhost:1234/v1/chat/completions"
    headers = {
        "Content-Type": "application/json"
    }

    data = {
        "model": "bielik-7b-instruct-v0.1",
        "messages": [
            {"role": "system", "content": "You are a translation assistant. Translate the given English text to Polish. I want you to give me only translated text, do not add anything else."},
            {"role": "user", "content": f"{text}" }
        ],
        "temperature": 0.7,
        "max_tokens": -1,
        "stream": False
    }

    response = requests.post(url, json=data, headers=headers)
    result = response.json()

    translated_text = result["choices"][0]["message"]["content"]
    # model adds <s> token at the beginning of the translated text so we need too remove it
    translated_text = translated_text.replace("<s>", "").strip()

    return translated_text

bielik_translations = []
for chunk_summary in chunks_summaries:
    bielik_translations.append(translate_text(chunk_summary))

print("Example translation: \n", bielik_translations[0])



In [11]:
# Create PDF with translated summary
from fpdf import FPDF

pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()

pdf.add_font("DejaVu", "", "DejaVuSans.ttf", uni=True)
pdf.set_font("DejaVu", size=12)

# for i, chunk in enumerate(bielik_translations):
for i, chunk in enumerate(chunks_summaries):
    pdf.multi_cell(0, 10, f"\n{chunk}")
    pdf.ln()

pdf.output("article_summary.pdf")

''