In [4]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load pre-trained model tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def preprocess_text(text):
    sentences = text.split('. ')
    return sentences

def get_sentence_embeddings(sentences, tokenizer, model):
    embeddings = []
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return embeddings

def rank_sentences(sentences, embeddings):
    sentence_scores = []
    for i, emb in enumerate(embeddings):
        score = cosine_similarity([emb], embeddings).mean()
        sentence_scores.append((score, i))
    ranked_sentences = sorted(sentence_scores, reverse=True, key=lambda x: x[0])
    return [sentences[i] for _, i in ranked_sentences]

def keyword_relevant_sentences(sentences, keywords):
    relevant_sentences = []
    for sentence in sentences:
        if any(keyword.lower() in sentence.lower() for keyword in keywords):
            relevant_sentences.append(sentence)
    return relevant_sentences

def summarize(text, keywords, num_sentences=30):
    sentences = preprocess_text(text)
    embeddings = get_sentence_embeddings(sentences, tokenizer, model)
    ranked_sentences = rank_sentences(sentences, embeddings)
    
    summary = ranked_sentences[:num_sentences]
    relevant_sentences = keyword_relevant_sentences(sentences, keywords)
    
    # Ensure unique sentences in the final summary
    final_summary_sentences = list(dict.fromkeys(summary + relevant_sentences))
    
    # Maintain the original order
    final_summary_sentences_sorted = sorted(final_summary_sentences, key=lambda x: sentences.index(x))
    
    return '. '.join(final_summary_sentences_sorted)

# Example usage
# text = """Your input text goes here. It should be a long paragraph with multiple sentences."""
# keywords = ["keyword1", "keyword2"]

# summary = summarize(text, keywords)
# print("Summary:", summary)

# text = """Climate change is one of the most pressing issues of our time. Rising global temperatures have led to a variety of environmental impacts, including more frequent and severe weather events. The polar ice caps are melting at an alarming rate, causing sea levels to rise and threatening coastal communities. Additionally, changing weather patterns are affecting agriculture, making it more difficult for farmers to grow crops. Governments and organizations around the world are working to address climate change through various measures. Renewable energy sources, such as solar and wind power, are being developed to reduce reliance on fossil fuels. International agreements, like the Paris Agreement, aim to unite countries in the fight against climate change. Public awareness and education on the issue are also crucial for driving change. It is essential that everyone takes part in efforts to mitigate the effects of climate change to ensure a sustainable future for generations to come."""
import pymupdf

extracted_text = ""
doc = pymupdf.open("first_chapter.pdf") # open a document
for page in doc: # iterate the document pages
    extracted_text += page.get_text() # get plain text encoded as UTF-8

keywords = ["threat", "ransomware"]

summary = summarize(extracted_text, keywords)
print("Summary:", summary)



Summary: 5
2024 DBIR Introduction
Introduction
Greetings! Welcome to Verizon’s 2024 Data Breach Investigations Report (DBIR). Thanks to our talented, generous and civic-minded contributors from around 
the world who continue to stick with us and share their data and insight, and deep 
appreciation for our very own Verizon Threat Research Advisory Center (VTRAC) 
team (rock stars that they are). These two groups enable us to examine and analyze 
relevant trends in cybercrime that play out on a global stage across organizations of 
all sizes and types.
From year to year, we see new and innovative attacks as well as variations on tried-
and-true attacks that still remain successful. From the exploitation of well-known 
and far-reaching zero-day vulnerabilities, such as the one that affected MOVEit, to 
the much more mundane but still incredibly effective Ransomware and Denial of 
Service (DoS) attacks, criminals continue to do their utmost to prove the old adage 
“crime does not pay” wron

In [3]:
summary.split(".")

['Thanks to our talented, generous and civic-minded contributors from around \nthe world who continue to stick with us and share their data and insight, and deep \nappreciation for our very own Verizon Threat Research Advisory Center (VTRAC) \nteam (rock stars that they are)',
 ' From the exploitation of well-known \nand far-reaching zero-day vulnerabilities, such as the one that affected MOVEit, to \nthe much more mundane but still incredibly effective Ransomware and Denial of \nService (DoS) attacks, criminals continue to do their utmost to prove the old adage \n“crime does not pay” wrong',
 '\nThe shifting landscape of cyber threats can be confusing and overwhelming',
 ' Enterprise floats of all shapes and sizes \ncruising past a large crowd of threat actors who are shouting out gleefully “Throw \nme some creds!” Of course, human nature being what it is, all too often, the folks \non the floats do just that',
 ' We \nanalyzed 30,458 real-world security incidents, of which 10,626 wer

In [11]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

nltk.download('punkt')

def split_into_phrases(sentence):
    # Tokenize the sentence into words
    words = word_tokenize(sentence)
    
    # Detect the subject and verb
    subject_verb = []
    for word in words:
        if word.lower() not in ['and', 'or', ',']:
            subject_verb.append(word)
        else:
            break
    
    # Identify the subject and verb phrase
    subject_verb_phrase = TreebankWordDetokenizer().detokenize(subject_verb)
    
    # Create a list to store phrases
    phrases = []
    
    # Temporary list to store current phrase
    current_phrase = []
    
    # Iterate through words starting after the subject and verb
    for word in words[len(subject_verb):]:
        if word.lower() in ['and', 'or', ',']:
            if current_phrase:
                phrase = subject_verb_phrase + ' ' + TreebankWordDetokenizer().detokenize(current_phrase)
                phrases.append(phrase)
                current_phrase = []
        else:
            current_phrase.append(word)
    
    # Add the last phrase if it exists
    if current_phrase:
        phrase = subject_verb_phrase + ' ' + TreebankWordDetokenizer().detokenize(current_phrase)
        phrases.append(phrase)
    
    return phrases

# Test the function
sentence = "I like dogs and cats."
phrases = split_into_phrases(sentence)
for phrase in phrases:
    print(phrase)


I like dogs cats.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sathi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
