In [5]:
import torch
from transformers import BertModel, BertTokenizer
from nltk.tokenize import sent_tokenize
import numpy as np
import nltk

nltk.download('punkt')

def bert_summarization(text, num_sentences=30):
    # Load pre-trained BERT model and tokenizer
    model_name = 'bert-base-uncased'
    model = BertModel.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Encode sentences
    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Generate embeddings for each sentence
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the CLS token embeddings for each sentence
    sentence_embeddings = outputs.last_hidden_state[:, 0, :].numpy()

    # Calculate similarity of each sentence to the mean sentence embedding
    document_embedding = np.mean(sentence_embeddings, axis=0)
    similarities = np.dot(sentence_embeddings, document_embedding)
    
    # Rank sentences by similarity
    ranked_sentence_indices = np.argsort(similarities)[::-1]
    
    # Select top N sentences for the summary
    selected_indices = ranked_sentence_indices[:num_sentences]
    summary_sentences = [sentences[i] for i in selected_indices]
    
    # Join selected sentences into a summary
    summary = ' '.join(summary_sentences)
    return summary

# Example usage

import pymupdf # imports the pymupdf library
text = ""
doc = pymupdf.open("./app/first_chapter.pdf") # open a document
for page in doc: # iterate the document pages
  text += page.get_text() # get plain text encoded as UTF-8

summary = bert_summarization(text)
print("Summary:")
print(summary)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sathi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Summary:
Thanks to our talented, generous and civic-minded contributors from around 
the world who continue to stick with us and share their data and insight, and deep 
appreciation for our very own Verizon Threat Research Advisory Center (VTRAC) 
team (rock stars that they are). Welcome to Verizon’s 2024 Data Breach Investigations Report (DBIR). If you would like to provide people a copy of the 
report, we ask that you provide them a link to verizon.com/dbir rather than the PDF. 2024 DBIR Summary of findings
Figure 1. From the exploitation of well-known 
and far-reaching zero-day vulnerabilities, such as the one that affected MOVEit, to 
the much more mundane but still incredibly effective Ransomware and Denial of 
Service (DoS) attacks, criminals continue to do their utmost to prove the old adage 
“crime does not pay” wrong. The shifting landscape of cyber threats can be confusing and overwhelming. The past year has been a busy one for cybercrime. In short, those are 
breaches an org

In [7]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Sample text and keywords
text = "Your text to be summarized goes here. It should be relatively long to demonstrate the summarization."
keywords = ["keyword1", "keyword2"]

# Split text into sentences
sentences = text.split('. ')

# Encode sentences
def encode_sentence(sentence):
    inputs = tokenizer(sentence, return_tensors='pt')
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

sentence_embeddings = np.array([encode_sentence(sentence) for sentence in sentences])

# Calculate similarity with document embedding (average of all sentence embeddings)
doc_embedding = sentence_embeddings.mean(axis=0)
similarities = cosine_similarity([doc_embedding], sentence_embeddings).flatten()

# Rank sentences by similarity score
sentence_scores = list(zip(sentences, similarities))
sentence_scores.sort(key=lambda x: x[1], reverse=True)

# Ensure sentences with keywords are included
keyword_sentences = [sentence for sentence in sentences if any(keyword in sentence for keyword in keywords)]

# Combine top-ranked sentences and keyword sentences
selected_sentences = {sentence for sentence, _ in sentence_scores[:5]} | set(keyword_sentences)

# Maintain original order for the summary
ordered_summary = [sentence for sentence in sentences if sentence in selected_sentences]

summary = '. '.join(ordered_summary)

print(summary)


ValueError: Found array with dim 3. check_pairwise_arrays expected <= 2.

In [9]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to tokenize and encode sentences
def encode_sentences(sentences):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state.mean(dim=1).numpy()

# Function to perform extractive summarization
def extractive_summarization(text, keywords, top_n=5):
    # Split the document into sentences
    sentences = text.split('. ')
    # Encode sentences
    sentence_embeddings = encode_sentences(sentences)
    
    # Encode keywords
    keyword_embeddings = encode_sentences(keywords)
    
    # Calculate cosine similarity between sentences and keywords
    similarities = []
    for sentence_embedding in sentence_embeddings:
        similarity = max(cosine_similarity([sentence_embedding], keyword_embeddings).flatten())
        similarities.append(similarity)
    
    # Rank sentences based on similarity scores
    ranked_sentences = [sentence for _, sentence in sorted(zip(similarities, sentences), reverse=True)]
    
    # Select top n sentences
    summary = '. '.join(ranked_sentences[:top_n])
    return summary

# Example usage
import pymupdf # imports the pymupdf library
text = ""
doc = pymupdf.open("./app/first_chapter.pdf") # open a document
for page in doc: # iterate the document pages
  text += page.get_text() # get plain text encoded as UTF-8
keywords = ["security", "threat", "crime"]
summary = extractive_summarization(text, keywords)
print(summary)


Select action varieties in Financial motive over time
. Phishing email report rate by click status
Figure 5. In short, those are 
breaches an organization could 
potentially mitigate or prevent by trying 
to select vendors with better security 
track records. Pure Extortion 
attacks have risen over the past year 
and are now a component of 9% of 
all breaches. As one might imagine, the main 
vector for those initial entry points was 
Web applications.
2024 DBIR Summary of findings
Figure 1
