In [17]:
# Required Libraries
# import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
 
# Fetch the tokenizer models and the stopword list (NLTK skips packages that are already up to date).
# 'punkt_tab' is the tokenizer resource looked up by newer NLTK releases (>= 3.8.2);
# 'punkt' is kept so older releases continue to work.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sathi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sathi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:

# # Sample text
# text = """
# Your sample text goes here.
# It can be multiple paragraphs and sentences.
# The longer, the better for a good summary.
# """

# with open('./first_chapter.pdf', 'r') as f:
#     text = f.read()

# import PyPDF2

# text  = " "
# pdf = PyPDF2.PdfReader("first_chapter.pdf")
# for page in pdf.pages:
#     text += page.extract_text()

# print(text)

import pymupdf  # PyMuPDF

# Open the PDF inside a context manager so the document is closed once extraction
# is done, and join the per-page plain text in one pass instead of growing a
# string page by page.
with pymupdf.open("first_chapter.pdf") as doc:
    text = "".join(page.get_text() for page in doc)

In [19]:
# Display the extracted text to sanity-check the PDF extraction.
text

'5\n2024 DBIR Introduction\nIntroduction\nGreetings! Welcome to Verizon’s 2024 Data Breach Investigations Report (DBIR). \nThis year marks the 17th edition of this publication, and we are thrilled to welcome \nback our old friends and say hello to new readers. As always, the aim of the DBIR is \nto shine a light on the various Actor types, the tactics they utilize and the targets they \nchoose. Thanks to our talented, generous and civic-minded contributors from around \nthe world who continue to stick with us and share their data and insight, and deep \nappreciation for our very own Verizon Threat Research Advisory Center (VTRAC) \nteam (rock stars that they are). These two groups enable us to examine and analyze \nrelevant trends in cybercrime that play out on a global stage across organizations of \nall sizes and types.\nFrom year to year, we see new and innovative attacks as well as variations on tried-\nand-true attacks that still remain successful. From the exploitation of well-kn

In [20]:

# Preprocessing Function
def preprocess_text(text):
    """Split ``text`` into sentences and build a cleaned token list for each one.

    Every sentence is lower-cased and word-tokenized; tokens that are not purely
    alphanumeric, or that are English stopwords, are discarded.

    Args:
        text: The raw document text.

    Returns:
        A ``(cleaned_sentences, sentences)`` tuple. ``cleaned_sentences`` is a list
        of token lists and ``sentences`` is the list of original sentence strings;
        the two lists are index-aligned.
    """
    english_stopwords = set(stopwords.words('english'))
    sentences = sent_tokenize(text)

    def keep(token):
        # Drops punctuation/mixed-symbol tokens and stopwords.
        return token.isalnum() and token not in english_stopwords

    cleaned_sentences = [
        [token for token in word_tokenize(sentence.lower()) if keep(token)]
        for sentence in sentences
    ]
    return cleaned_sentences, sentences
 
# Get cleaned sentences (filtered token lists) and original sentences (untouched strings);
# both lists are index-aligned, one entry per sentence found in `text`.
cleaned_sentences, original_sentences = preprocess_text(text)

In [21]:

# Train Word2Vec Model on the cleaned token lists.
# sg=1 selects skip-gram, vector_size=100 sets the embedding width, window=5 the context
# size, and min_count=1 keeps every token, even those seen only once.
# NOTE(review): with workers=4 the learned vectors may differ between runs — confirm
# whether run-to-run reproducibility matters here.
model = Word2Vec(cleaned_sentences, vector_size=100, window=5, min_count=1, workers=4, sg=1)

In [22]:
# Display the trained model object to confirm that training completed.
model

<gensim.models.word2vec.Word2Vec at 0x27666fe70a0>

In [23]:

# Function to get sentence vector
def get_sentence_vector(sentence, model):
    """Embed ``sentence`` as the mean of its in-vocabulary word vectors.

    The sentence is lower-cased and word-tokenized; tokens missing from the
    model's vocabulary are ignored.

    Args:
        sentence: The sentence (or keyword string) to embed.
        model: A trained Word2Vec model providing ``wv`` and ``vector_size``.

    Returns:
        A 1-D array of length ``model.vector_size``: the element-wise mean of the
        known word vectors, or all zeros when no token is in the vocabulary.
    """
    vocabulary = model.wv.key_to_index
    known_vectors = [
        model.wv[token]
        for token in word_tokenize(sentence.lower())
        if token in vocabulary
    ]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # Nothing recognisable in the sentence: fall back to a zero vector.
    return np.zeros(model.vector_size)
 
# Compute sentence vectors: one row per original sentence, in document order,
# each row being that sentence's averaged word embedding.
sentence_vectors = np.array([get_sentence_vector(sentence, model) for sentence in original_sentences])

In [24]:
# Display the sentence embedding matrix for a quick visual check.
sentence_vectors

array([[ 4.4709756e-03, -1.1446456e-03, -2.6393358e-03, ...,
        -3.5235591e-03,  1.6300149e-03,  3.8390057e-03],
       [-5.7068036e-04, -2.0926401e-03,  1.5961392e-03, ...,
        -1.6243136e-03,  1.9475080e-03,  1.4308986e-03],
       [ 1.7785339e-03, -7.2972395e-04,  3.8468701e-04, ...,
        -1.6588650e-03, -6.9630929e-05, -1.9532049e-03],
       ...,
       [-7.7060089e-03,  6.6419081e-03,  2.7411073e-03, ...,
         8.0831554e-03, -1.7807402e-03, -7.3565086e-03],
       [ 2.6426851e-03,  5.3675869e-03, -1.5455144e-03, ...,
         1.2751862e-03,  1.2821684e-04, -1.4449992e-03],
       [ 2.4512855e-03,  1.6525927e-03,  2.3703310e-03, ...,
         2.1292856e-03, -3.4080192e-03,  2.4451914e-03]], dtype=float32)

In [25]:

# Keyword similarity function
def rank_sentences_by_keyword(keyword, sentence_vectors, original_sentences, model):
    """Order the sentences by cosine similarity to the keyword's embedding.

    The keyword string is embedded with ``get_sentence_vector`` and compared
    against every row of ``sentence_vectors``.

    Args:
        keyword: Keyword string to rank against.
        sentence_vectors: 2-D array with one embedding row per sentence.
        original_sentences: Sentence strings, index-aligned with ``sentence_vectors``.
        model: Trained Word2Vec model used to embed the keyword.

    Returns:
        ``(ranked_sentences, ranked_indices)``, most similar first. The indices
        refer to positions in ``original_sentences``.
    """
    query = get_sentence_vector(keyword, model).reshape(1, -1)
    scores = cosine_similarity(query, sentence_vectors).flatten()
    # Stable descending sort: equally scored sentences keep their document order.
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    ranked_sentences = [original_sentences[i] for i in ranked_indices]
    return ranked_sentences, ranked_indices

In [26]:

# Summarize based on keyword
def summarize(text, keyword, num_sentences=5):
    """Build an extractive, keyword-focused summary of ``text``.

    A Word2Vec model is trained on the text itself, every sentence is embedded as
    the mean of its word vectors, and the sentences most similar to ``keyword``
    are returned in their original document order.

    Args:
        text: The document to summarize.
        keyword: Keyword string; it is tokenized and its in-vocabulary words are
            averaged into a single query vector.
        num_sentences: Maximum number of sentences in the summary. Zero or a
            negative value yields an empty summary.

    Returns:
        The selected sentences joined by single spaces, with whitespace inside
        each sentence (e.g. hard line breaks from PDF extraction) collapsed.
        An empty string when there is nothing to summarize.
    """
    # A negative count would otherwise slice "all but the last k" ranked sentences.
    if num_sentences <= 0:
        return ''

    cleaned_sentences, original_sentences = preprocess_text(text)
    # Word2Vec cannot be trained when no sentence contributes any token
    # (empty text, or only stopwords/punctuation), so bail out early.
    if not any(cleaned_sentences):
        return ''

    model = Word2Vec(cleaned_sentences, vector_size=100, window=5, min_count=1, workers=4, sg=1)
    sentence_vectors = np.array([get_sentence_vector(sentence, model) for sentence in original_sentences])
    _, ranked_indices = rank_sentences_by_keyword(keyword, sentence_vectors, original_sentences, model)

    # Keep the best matches but present them in document order for readability.
    top_indices = sorted(ranked_indices[:num_sentences])
    # str.split() with no argument collapses newlines and repeated spaces.
    return ' '.join(' '.join(original_sentences[i].split()) for i in top_indices)

In [27]:

# Example usage
# The keyword string may hold several terms: it is tokenized like a sentence and the
# vectors of the terms found in the model's vocabulary are averaged into one query.
keyword = "threat, malware, ransomware"
summary = summarize(text, keyword)
print(summary)

These attacks were 
primarily leveraged by Ransomware 
and other Extortion-related threat 
actors. Ransomware was 
a top threat across 92% of industries. We see this figure at 
15% this year, a 68% increase from the 
previous year, mostly fueled by the use 
of zero-day exploits for Ransomware 
and Extortion attacks. Over the past three years, the 
combination of Ransomware and 
other Extortion breaches accounted 
for almost two-thirds (fluctuating 
between 59% and 66%) of those 
attacks. According to the FBI’s 
Internet Crime Complaint Center 
(IC3) ransomware complaint data, 
the median loss associated with the 
combination of Ransomware and 
other Extortion breaches has been 
$46,000, ranging between $3 (three 
dollars) and $1,141,467 for 95% of the 
cases.
