
## a. BASIC TEXT SUMMARIZATION USING TF-IDF AND COSINE SIMILARITY


In [1]:
# 1. Import Required Libraries
import string

import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the tokenizer models and the stopword list used below.
# Recent NLTK releases (3.9+) load the sentence tokenizer from 'punkt_tab'
# instead of the pickled 'punkt' package, so fetch both; otherwise
# nltk.sent_tokenize raises a LookupError on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# 2. Define Sample Text
# Two-sentence sample passage; note that its hard line breaks fall inside the sentences.
text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural language
data.
Challenges in natural language processing frequently involve speech recognition, natural
language understanding, and natural language generation.
"""

# 3. Preprocess the Text
# Split the text into sentences; `sentences` keeps the original (unfiltered) wording
# and is what the final summary is assembled from
sentences = nltk.sent_tokenize(text)

# Build a set of NLTK's English stopwords; preprocess_sentence tests membership against it
stop_words = set(stopwords.words('english'))

# Function to preprocess each sentence by removing stopwords
def preprocess_sentence(sentence, stop_word_set=None):
    """Return `sentence` with its stopwords removed.

    The sentence is split on whitespace. A token is dropped when it is a
    stopword after lower-casing and after ignoring leading/trailing
    punctuation, so "is," or "(the" are filtered just like "is" and "the".
    Tokens that are kept are emitted unchanged (punctuation included).

    Args:
        sentence: The sentence to filter.
        stop_word_set: Optional collection of lower-case stopwords. When
            omitted, the module-level `stop_words` set is used.

    Returns:
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    active_stop_words = stop_words if stop_word_set is None else stop_word_set
    kept_words = [
        word
        for word in sentence.split()
        # Compare the bare word: TfidfVectorizer strips punctuation later, so a
        # stopword left in as "is," would otherwise still be counted.
        if word.strip(string.punctuation).lower() not in active_stop_words
    ]
    return ' '.join(kept_words)

# Strip stopwords from every sentence before vectorizing
preprocessed_sentences = list(map(preprocess_sentence, sentences))

# 4. Compute TF-IDF Matrix
# Fit the TF-IDF vocabulary on the cleaned sentences and encode them in one step
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)

# 5. Compute Cosine Similarity
# Pairwise sentence-to-sentence similarity; with a single argument the
# matrix is compared against itself
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# 6. Generate Summary
# Function to generate a summary by ranking sentences based on their similarity scores
def generate_summary(sentences, sim_matrix, top_n=2):
    """Build an extractive summary from the most central sentences.

    Each sentence is scored by the sum of its row in `sim_matrix`. The
    `top_n` highest-scoring sentences are selected and then emitted in their
    original document order so the summary reads naturally.

    Args:
        sentences: Sequence of sentence strings.
        sim_matrix: Square similarity matrix whose row i holds the
            similarities of sentence i to every sentence.
        top_n: Number of sentences to keep. Values <= 0 yield an empty
            summary; values larger than the number of sentences keep all.

    Returns:
        The selected sentences joined by single spaces.
    """
    # Guard explicitly: a slice such as [-0:] would select *every* sentence.
    if top_n <= 0 or len(sentences) == 0:
        return ''

    # Flatten so ndarray and np.matrix row sums are handled alike.
    scores = np.asarray(sim_matrix.sum(axis=1)).ravel()

    # Stable sort on the negated scores: highest score first, ties resolved
    # in favour of the sentence that appears earlier in the document.
    top_indices = np.argsort(-scores, kind='stable')[:top_n]

    # Restore document order for the chosen sentences.
    ordered_indices = sorted(top_indices.tolist())
    return ' '.join(sentences[i] for i in ordered_indices)

# Build the extractive summary and show it under a heading line
summary = generate_summary(sentences, cosine_sim_matrix)
print("Summary:", summary, sep="\n")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Summary:

Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural language
data. Challenges in natural language processing frequently involve speech recognition, natural
language understanding, and natural language generation.


## b. ABSTRACTIVE TEXT SUMMARIZATION WITH TRANSFORMERS

In [2]:
! pip install transformers datasets



In [3]:
# complete code 7 week 2 question

# 1. Install required libraries (run this in your environment first)
# !pip install transformers datasets

# 2. Import Required Libraries
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset

# 3. Load the Dataset
# Load configuration '3.0.0' of the CNN/DailyMail dataset, restricted to the
# first 1% of the test split to keep the demonstration small
dataset = load_dataset('cnn_dailymail', '3.0.0', split='test[:1%]')

# 4. Load Pre-trained BART Model and Tokenizer
# Model and tokenizer are loaded from the same 'facebook/bart-large-cnn'
# checkpoint; both are used by summarize() below
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# 5. Summarize Text
# Function to summarize text
def summarize(text):
    """Generate an abstractive summary of `text` with the loaded BART model.

    Uses the module-level `tokenizer` and `model`. Input longer than 1024
    tokens is truncated before generation.

    Args:
        text: The article text to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Feed the raw article: BART takes no task prefix. A "summarize: " prefix
    # is a T5 convention and would simply be treated as part of the article.
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

    # Keep the tensors on the same device as the model (no-op on CPU).
    inputs = inputs.to(model.device)

    # Beam search with the same length settings as before; the attention mask
    # is passed explicitly instead of being inferred by generate().
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    # Decode the best beam back into text
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Sample Input and Output
# Walk through the first three test articles, echoing each one before its summary
for i in range(3):
    record = dataset[i]
    article = record['article']
    print(f"Original Text {i+1}: {article}\n")

    # Produce the abstractive summary for this article and display it
    summary = summarize(article)
    print(f"Summary {i+1}: {summary}\n")


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Original Text 1: (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesd