In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fetch the NLTK data packages this notebook asks for: 'punkt' and 'stopwords'.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' rather than 'punkt'
# for sentence tokenization — confirm against the installed NLTK version.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

from compound_to_simple import compound_to_simple

ModuleNotFoundError: No module named 'nltk'

In [10]:

def preprocess_text(text):
    """Split ``text`` into sentences and build a cleaned copy of each one.

    Every sentence is lower-cased and word-tokenized. Tokens that are not
    purely alphanumeric (``str.isalnum``) or that appear in the English
    stop-word list are discarded; the remaining tokens are re-joined with
    single spaces. A sentence whose tokens are all discarded becomes ``''``.

    Args:
        text: The raw input text.

    Returns:
        A ``(sentences, processed_sentences)`` tuple: the sentences exactly as
        returned by ``sent_tokenize`` and the cleaned strings, index-aligned.
    """
    original_sentences = sent_tokenize(text)
    english_stopwords = set(stopwords.words('english'))

    cleaned_sentences = []
    for raw_sentence in original_sentences:
        kept_tokens = []
        for token in word_tokenize(raw_sentence.lower()):
            # Punctuation and hyphenated/mixed tokens fail isalnum() and are dropped.
            if not token.isalnum():
                continue
            if token in english_stopwords:
                continue
            kept_tokens.append(token)
        cleaned_sentences.append(' '.join(kept_tokens))

    return original_sentences, cleaned_sentences


In [11]:

def compute_cosine_similarity(sentences, processed_sentences):
    """Return the pairwise cosine-similarity matrix of the processed sentences.

    ``processed_sentences`` is vectorized with a default ``TfidfVectorizer``
    and the resulting matrix is compared against itself, so entry ``[i, j]``
    is the similarity between processed sentence ``i`` and ``j``.

    Args:
        sentences: Accepted for call compatibility; not used here.
        processed_sentences: Cleaned sentence strings to vectorize.

    Returns:
        The matrix returned by ``cosine_similarity``.
    """
    tfidf_vectors = TfidfVectorizer().fit_transform(processed_sentences)
    return cosine_similarity(tfidf_vectors, tfidf_vectors)


In [12]:

def extractive_summarization(text, top_n=3):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Each sentence is scored by the sum of its row in the pairwise
    cosine-similarity matrix (this includes its similarity to itself). The
    ``top_n`` best-scoring sentences are kept and emitted in their original
    document order, joined by single spaces.

    Args:
        text: The raw input text.
        top_n: Maximum number of sentences to keep. Values <= 0 yield an
            empty summary; values larger than the sentence count keep every
            sentence.

    Returns:
        The summary string, or ``''`` when there is nothing to select.
    """
    # Guard explicitly: with top_n == 0 the slice ``[-0:]`` below would select
    # *every* sentence, and a negative top_n would drop the lowest-scored ones
    # instead of returning nothing.
    if top_n <= 0:
        return ''

    sentences, processed_sentences = preprocess_text(text)
    # No sentences means there is nothing to vectorize or rank.
    if not sentences:
        return ''

    cosine_sim_matrix = compute_cosine_similarity(sentences, processed_sentences)
    # Row sums rank sentences by their total similarity to all sentences.
    sentence_scores = np.sum(cosine_sim_matrix, axis=1)
    # argsort is ascending, so the last top_n positions hold the best scores.
    top_sentence_indices = np.argsort(sentence_scores)[-top_n:]
    # Re-sort the chosen indices so the summary follows document order.
    top_sentences = [sentences[index] for index in sorted(top_sentence_indices)]
    return ' '.join(top_sentences)


In [14]:

# Demo run: the value returned by compound_to_simple() (imported from the
# project module of the same name) is used as the text to summarize.
text = compound_to_simple()
summary = extractive_summarization(text, top_n=30)

# Print a heading line followed by the summary itself.
print("Summary:", summary, sep="\n")

Summary:
## Cybersecurity Simplified: Understanding the 2024 Data Breach Investigations Report

**Welcome to Verizon's 2024 Data Breach Investigations Report (DBIR). ** This report is about cybercrime and its impact on organizations. It's in its 17th year and analyzes real-world security incidents from around the globe. ** These include zero-day vulnerabilities like the one that affected MOVEit, ransomware, and denial-of-service attacks. Cybercriminals continue to find ways to exploit vulnerabilities and steal data. **The report also emphasizes the human element in cybersecurity. **  Poorly protected passwords and human errors make organizations vulnerable to attacks. **The DBIR analyzed over 30,000 security incidents, with over 10,000 confirmed data breaches. **  

**The report wouldn't be possible without the contributions of global security experts and organizations. ** If you'd like to learn more about citing the report, the information is available. ## Cybersecurity: Simple Takeaw

In [None]:
# Persist the summary to disk. The encoding is pinned to UTF-8 so the write
# does not depend on the platform's locale default, which may be unable to
# encode non-ASCII characters present in the summary.
with open("summary_stc", "w", encoding="utf-8") as f:
    f.write(summary)