In [1]:
!pip install gensim



In [2]:
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from transformers import BartForConditionalGeneration, BartTokenizer
import gensim.downloader as api
from gensim.models import Word2Vec,KeyedVectors

In [3]:
word_2_vec = api.load("word2vec-google-news-300")



In [4]:
model_name = "facebook/bart-large-cnn"
tokenizer_t5 = BartTokenizer.from_pretrained(model_name)
model_t5 = BartForConditionalGeneration.from_pretrained(model_name)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [5]:
nltk.download("punkt_tab")
nltk.download("stopwords")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
text = """
  Artificial Intelligence (AI) is rapidly transforming the landscape of modern education by enhancing both teaching and learning experiences. With personalized learning systems, AI can adapt educational content to match individual student needs, pacing, and learning styles. Platforms powered by AI analyze student performance in real-time, offering tailored feedback and identifying areas that need improvement. This not only boosts student engagement but also helps educators allocate attention where it's most needed. AI-driven chatbots and virtual tutors provide 24/7 assistance, ensuring students have support outside traditional classroom hours. In administrative tasks, AI automates scheduling, grading, and student tracking, freeing up valuable time for teachers to focus on pedagogy. Moreover, AI enables predictive analytics to identify at-risk students early, allowing timely interventions that can prevent dropout or failure. Language processing tools assist in improving reading comprehension and writing skills, especially for students learning in non-native languages. While the integration of AI raises concerns about data privacy and over-reliance on technology, when used responsibly, it can democratize access to quality education globally. As educational institutions continue to evolve, AI stands out as a powerful ally in shaping more efficient, inclusive, and personalized learning environments for students at every level.
"""

In [7]:
#Extraction of each sentence
def sentence_tokenize_(text: str):
  sentences = sent_tokenize(text)
  stop_words = set(stopwords.words("english"))
  clean_sentence = []
  for sentence in sentences:
    words = word_tokenize(sentence)
    words = [word for word in words if word.isalnum() and word not in stop_words]
    clean_sentence.append(words)
  return sentences,clean_sentence

#Sentences are converted in vectors
def sentence_vectors(text, model):
  valid_words = [word for word in text if word in model]
  if not valid_words:
    return np.zeros(model.vector_size)
  return np.mean([model[valid_word] for valid_word in valid_words], axis=0)

#Sentences are reorganized using pagerank algo
def scoring(vectors):
  sim_matrix = cosine_similarity(vectors)
  np.fill_diagonal(sim_matrix, 0)
  graph = nx.from_numpy_array(sim_matrix)
  scores = nx.pagerank(graph)
  return scores

#Sentences are reorganized according to the score
def reorganise_text(text,model,top_n = 3):
  sentences,clean_sentence = sentence_tokenize_(text)
  sentence_vec = [sentence_vectors(sentence,model) for sentence in clean_sentence]
  scores = scoring(sentence_vec)
  ranked_sentences = sorted(((scores[i],s) for i, s in enumerate(sentences)),reverse = True)
  selected = [s for _, s in ranked_sentences[:top_n]]
  selected.sort(key=lambda s: sentences.index(s))
  return " ".join(selected)

#Net Summarization of the text
def meta_summarize(summaries, tokenizer, model, max_output_length=1024):
    combined_summaries = " ".join(summaries)
    input_text = "summarize: " + combined_summaries
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(model.device)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_output_length,
        min_length=max(20, int(0.3 * max_output_length)),
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

#Chunk wise summary
def hybrid_summary(text, w2v_model=word_2_vec, tokenizer=tokenizer_t5, summ_model=model_t5, top_n=3, max_output_length=512):
    extracted_text = reorganise_text(text, w2v_model, top_n=top_n)
    # Get chunk-based summaries
    sentences = sent_tokenize(extracted_text)
    chunks, current_chunk = [], ""
    for sentence in sentences:
        if len(tokenizer.encode(current_chunk + sentence.strip())) < 200:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    # Summarize each chunk
    summaries = []
    for chunk in chunks:
        input_text = "summarize: " + chunk
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(summ_model.device)
        summary_ids = summ_model.generate(
            inputs["input_ids"],
            max_length=max_output_length,
            min_length=int(max_output_length/1.5),
            num_beams=4,
            early_stopping=True
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)
    return meta_summarize(summaries, tokenizer, summ_model, max_output_length=max_output_length)

#Sentence wise summary
def sentence_wise_summary(text, w2v_model=word_2_vec, tokenizer=tokenizer_t5, summ_model=model_t5, top_n=3, max_output_length=512):
    extracted_text = reorganise_text(text, w2v_model, top_n=top_n)
    sentences = sent_tokenize(extracted_text)
    summaries = []
    for sentence in sentences:
        input_text = "summarize: " + sentence
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(summ_model.device)
        summary_ids = summ_model.generate(
            inputs["input_ids"],
            max_length=max_output_length,
            min_length=int(max_output_length/1.5),
            num_beams=4,
            early_stopping=True
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)
    return meta_summarize(summaries, tokenizer, summ_model, max_output_length=max_output_length)
#cleans useless info
def clean_summary(summary: str) -> str:
    junk_phrases = [
        "for more information",
        "click here",
        "back to",
        "visit the website",
        "page you came from"
    ]
    return ". ".join(
        sentence for sentence in summary.split(". ")
        if not any(junk in sentence.lower() for junk in junk_phrases)
    )

Hybrid Summary means first the text is reorganized by the meaning of the each sentences then it is summarized by chunks and it sends it into meta summariser which gives the final summary.

Sentence_wise_Summary just takes reorganised sentences summarises it sentence wise and then it sends it to meta summariser which gives the final summary.

In [8]:
length = len(tokenizer_t5.encode(text))
hybrid_summary_result = hybrid_summary(text, top_n=5, max_output_length= int(length/2) + 65 )
sentence_wise_summary_result = sentence_wise_summary(text, top_n=5, max_output_length= int(length/2) + 65)

In [9]:
text`

"\n  Artificial Intelligence (AI) is rapidly transforming the landscape of modern education by enhancing both teaching and learning experiences. With personalized learning systems, AI can adapt educational content to match individual student needs, pacing, and learning styles. Platforms powered by AI analyze student performance in real-time, offering tailored feedback and identifying areas that need improvement. This not only boosts student engagement but also helps educators allocate attention where it's most needed. AI-driven chatbots and virtual tutors provide 24/7 assistance, ensuring students have support outside traditional classroom hours. In administrative tasks, AI automates scheduling, grading, and student tracking, freeing up valuable time for teachers to focus on pedagogy. Moreover, AI enables predictive analytics to identify at-risk students early, allowing timely interventions that can prevent dropout or failure. Language processing tools assist in improving reading compr

In [10]:
hybrid_summary_result

'With personalized learning systems, AI can adapt educational content to match individual student needs, pacing, and learning styles. Platforms powered by AI analyze student performance in real-time, offering tailored feedback and identifying areas that need improvement. AI-driven chatbots and virtual tutors provide 24/7 assistance, ensuring students have support outside traditional classroom hours.'

In [11]:
sentence_wise_summary_result

"With personalized learning systems, AI can match educational content with individual students' needs and styles. AI can also adapt content based on student's age, gender, and other factors. platforms powered by AI analyze student performance in real-time, offering tailored feedback and identifying areas that need improvement. AI-driven chatbots and virtual tutors provide 24/7 assistance."