In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fetch the NLTK resources used below: the 'punkt' tokenizer models and the
# 'stopwords' corpus. Already-installed packages are simply reported as
# up-to-date.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

from compound_to_simple import compound_to_simple

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\99013031\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\99013031\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def preprocess_text(text):
    """Split ``text`` into sentences and build a cleaned version of each one.

    Each sentence is lower-cased and word-tokenized; tokens that are not
    purely alphanumeric (punctuation, hyphenated words, numbers containing
    separators, ...) or that are English stop words are dropped, and the
    surviving tokens are re-joined with single spaces.

    Args:
        text: The raw document text.

    Returns:
        A ``(sentences, processed_sentences)`` tuple of two equally long
        lists: the original sentences as returned by ``sent_tokenize`` and
        their cleaned counterparts (possibly empty strings).
    """
    original_sentences = sent_tokenize(text)
    english_stop_words = set(stopwords.words('english'))

    def _clean(sentence):
        # Keep only alphanumeric, non-stop-word tokens of the lower-cased sentence.
        kept_tokens = []
        for token in word_tokenize(sentence.lower()):
            if not token.isalnum():
                continue
            if token in english_stop_words:
                continue
            kept_tokens.append(token)
        return ' '.join(kept_tokens)

    cleaned_sentences = [_clean(sentence) for sentence in original_sentences]
    return original_sentences, cleaned_sentences


In [3]:

def compute_cosine_similarity(sentences, processed_sentences):
    """Return the pairwise TF-IDF cosine-similarity matrix of the sentences.

    Args:
        sentences: The original sentences. Not used by the computation; the
            parameter is kept so existing callers keep working.
        processed_sentences: Cleaned sentences (strings) that are vectorized
            with a default ``TfidfVectorizer``.

    Returns:
        A square array of shape ``(n, n)`` with ``n = len(processed_sentences)``
        where entry ``[i, j]`` is the cosine similarity between sentences
        ``i`` and ``j``. If no sentence contributes any vocabulary (no
        sentences at all, or only empty/whitespace strings, or only tokens the
        vectorizer ignores) an all-zero matrix is returned instead of raising.

    Raises:
        ValueError: Propagated from the vectorizer for any reason other than
            an empty vocabulary.
    """
    n_sentences = len(processed_sentences)

    # Nothing to vectorize: TfidfVectorizer would raise "empty vocabulary".
    # All-zero similarities match what cosine_similarity yields for zero rows.
    if not any(sentence.strip() for sentence in processed_sentences):
        return np.zeros((n_sentences, n_sentences))

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(processed_sentences)
    except ValueError as exc:
        # The default token pattern skips single-character tokens, so the
        # vocabulary can still end up empty even for non-blank sentences.
        if "empty vocabulary" not in str(exc):
            raise
        return np.zeros((n_sentences, n_sentences))

    return cosine_similarity(tfidf_matrix, tfidf_matrix)


In [5]:

def extractive_summarization(text, top_n=3):
    """Build an extractive summary from the most "central" sentences of ``text``.

    Each sentence is scored by the sum of its row in the TF-IDF cosine
    similarity matrix (this includes its similarity with itself). The
    ``top_n`` highest-scoring sentences are then emitted in their original
    document order, joined by single spaces.

    Args:
        text: The raw document text.
        top_n: Maximum number of sentences in the summary. Values ``<= 0``
            produce an empty summary; values larger than the number of
            sentences return every sentence.

    Returns:
        The summary as a single string (``''`` when there is nothing to
        summarize).
    """
    sentences, processed_sentences = preprocess_text(text)

    # Guard the degenerate cases up front: the slice ``[-0:]`` would select
    # *every* sentence, and an empty sentence list cannot be vectorized.
    if top_n <= 0 or not sentences:
        return ''

    cosine_sim_matrix = compute_cosine_similarity(sentences, processed_sentences)
    sentence_scores = np.sum(cosine_sim_matrix, axis=1)

    # A stable sort makes the selection deterministic when scores tie.
    ranked_indices = np.argsort(sentence_scores, kind='stable')
    top_sentence_indices = sorted(ranked_indices[-top_n:])

    # Restore document order so the summary reads naturally.
    return ' '.join(sentences[index] for index in top_sentence_indices)


In [6]:

# Example usage: summarize whatever compound_to_simple() returns
# (presumably the simplified document text -- confirm against that module),
# keeping up to 30 sentences.
text = compound_to_simple()
summary = extractive_summarization(text, top_n=30)

print("Summary:", summary, sep="\n")

Summary:
Welcome to Verizon's 2024 Data Breach Investigations Report (DBIR). This is the 17th edition of the report. Data and insights from contributors around the world help us analyze cybercrime trends globally. We see new and innovative attacks as well as variations of older, successful attacks. Criminals continue to exploit vulnerabilities, such as the one that affected MOVEit, and use ransomware and denial-of-service (DoS) attacks. Cybercrime has been very active in the past year. We analyzed 30,458 security incidents, of which 10,626 were data breaches. The structure of the report remains similar, but there are some changes. We encourage new readers to review Appendix A before diving into the report. The 2024 DBIR dataset includes incidents from November 1, 2022, through October 31, 2023. The report focuses primarily on the 2023 data. The time between the data collection and report publication is spent acquiring, anonymizing, aggregating, analyzing, and writing the report. People

In [9]:
# Persist the summary to disk. The encoding is set explicitly so the write
# does not depend on the platform's locale default, which may be unable to
# encode some characters in the summary and would raise UnicodeEncodeError.
with open("summary.txt", "w", encoding="utf-8") as f:
    f.write(summary)