In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
import spacy   
from nltk.corpus import stopwords 
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import re
import string
from wordcloud import WordCloud
from collections import Counter

In [None]:

# Load the SciTLDR "AIC" configuration once. Without a `split` argument
# load_dataset() returns a mapping keyed by split name, so the individual
# splits are taken from `ds` instead of calling load_dataset() three more times.
ds = load_dataset("allenai/scitldr", "AIC")

train_dataset = ds["train"]
valid_dataset = ds["validation"]
test_dataset = ds["test"]

# Convert each split to a pandas DataFrame for inspection and preprocessing.
train_df = train_dataset.to_pandas()
valid_df = valid_dataset.to_pandas()
test_df = test_dataset.to_pandas()


In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Preview the first rows of the validation split.
valid_df.head()

In [None]:
# Preview the first rows of the test split.
test_df.head()

In [None]:
# Report the (rows, columns) shape of every split.
for _split_label, _split_frame in (
    ('Train', train_df),
    ('Validation', valid_df),
    ('Test', test_df),
):
    print(f'{_split_label} Dataset Shape:', _split_frame.shape)

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Inspect the raw `source` value of the training row labelled 0.
train_df["source"][0]

In [None]:
def _join_parts(column):
    """Return `column` with the strings inside every cell joined by one space."""
    return column.apply(" ".join)


# Flatten the per-example sequences of strings into one string per example.
source_train = _join_parts(train_df["source"])
target_train = _join_parts(train_df["target"])
source_valid = _join_parts(valid_df["source"])
target_valid = _join_parts(valid_df["target"])
source_test = _join_parts(test_df["source"])
target_test = _join_parts(test_df["target"])

In [None]:
# Show the flattened training source texts.
source_train

In [None]:
# Pair each split's flattened source text with its flattened target text.
df_train, df_valid, df_test = (
    pd.DataFrame({'source': _src, 'target': _tgt})
    for _src, _tgt in (
        (source_train, target_train),
        (source_valid, target_valid),
        (source_test, target_test),
    )
)

In [None]:
# Show the paired source/target frame for the validation split.
df_valid

In [None]:
# Load spaCy's small English pipeline with the dependency parser and NER
# disabled; the preprocessing in this notebook only reads token text and lemmas.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
# NLTK ships its stop-word list as a separately downloaded corpus. On a fresh
# environment the lookup raises LookupError, so fetch the corpus once and retry
# to keep "Restart & Run All" working.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk

    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

In [None]:
# Sentence-BERT encoder used later to embed the cleaned source documents.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmas.

    Steps, in order: lowercase, collapse whitespace, drop bracketed spans such
    as ``[1]``, drop URLs, delete ASCII punctuation, collapse whitespace again,
    then run the module-level spaCy pipeline ``nlp`` and keep the lemma of
    every non-whitespace token whose text is not in ``stop_words``.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text, or an empty string when nothing is left after cleaning.
    """
    text = text.lower()
    # First collapse turns newlines into spaces so a bracketed span that wraps
    # across lines is still matched by the non-DOTALL pattern below.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = text.translate(_PUNCTUATION_TABLE)
    # The removals above leave runs of spaces behind. spaCy turns extra
    # whitespace into tokens of its own, which are not stop words and would
    # otherwise leak into the result as stray spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return ""
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_space and token.text not in stop_words
    ]
    return " ".join(tokens)


In [None]:
def _clean_pair(source, target, desc):
    """Apply preprocess_text to a source/target pair under one progress-bar label."""
    tqdm.pandas(desc=desc)
    cleaned_source = source.progress_apply(preprocess_text)
    cleaned_target = target.progress_apply(preprocess_text)
    return cleaned_source, cleaned_target


clean_train_source, clean_train_target = _clean_pair(
    source_train, target_train, "Preprocessing Training Data"
)
clean_valid_source, clean_valid_target = _clean_pair(
    source_valid, target_valid, "Preprocessing Validation Data"
)
clean_test_source, clean_test_target = _clean_pair(
    source_test, target_test, "Preprocessing Testing Data"
)

In [None]:
def plot_text_length_distribution(before, after, title):
    """Overlay word-count histograms of texts before and after cleaning.

    Args:
        before: pandas Series of the original strings.
        after: pandas Series of the cleaned strings.
        title: Title placed on the figure.
    """
    fig, ax = plt.subplots(figsize=(12, 5))
    series_specs = (("Before", "blue", before), ("After", "red", after))
    for label, color, texts in series_specs:
        # Word count = number of whitespace-separated pieces per text.
        word_counts = texts.str.split().map(len)
        sns.histplot(word_counts, bins=30, kde=True, label=label, color=color, ax=ax)
    ax.set_xlabel("Number of Words")
    ax.set_ylabel("Frequency")
    ax.set_title(title)
    ax.legend()
    plt.show()

# Compare raw vs. cleaned word counts for the training sources and targets.
plot_text_length_distribution(source_train, clean_train_source, "Text Length Distribution (Train Source)")
plot_text_length_distribution(target_train, clean_train_target, "Text Length Distribution (Train Target)")


In [None]:
def plot_wordcloud(texts, title):
    """Draw a word cloud built from all of `texts` joined together.

    Args:
        texts: Iterable of strings.
        title: Title placed above the image.
    """
    corpus = " ".join(texts)
    cloud = WordCloud(width=800, height=400, background_color="white")
    cloud.generate(corpus)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(title)
    plt.show()

# Word clouds of the training sources before and after preprocessing.
plot_wordcloud(source_train, "Word Cloud Before Cleaning (Train Source)")
plot_wordcloud(clean_train_source, "Word Cloud After Cleaning (Train Source)")


In [None]:
def plot_most_common_words(texts, title, n=20):
    """Bar-plot the `n` most frequent whitespace-separated words in `texts`.

    Args:
        texts: Iterable of strings.
        title: Title placed on the figure.
        n: Number of top words to show (default 20).

    Raises:
        ValueError: If there is nothing to plot (no words in `texts`, or n < 1).
    """
    # Count per document instead of first concatenating the whole corpus into
    # one large string; whitespace splitting yields the same tokens either way.
    word_counts = Counter(word for text in texts for word in text.split())
    most_common_words = word_counts.most_common(n)
    if not most_common_words:
        # Without this guard the unpacking below fails with an opaque
        # "not enough values to unpack" error.
        raise ValueError("nothing to plot: `texts` contains no words (or n < 1)")
    top_words, top_counts = zip(*most_common_words)

    plt.figure(figsize=(12, 5))
    # NOTE(review): seaborn >= 0.13 warns when `palette` is given without
    # `hue`; left unchanged to stay compatible with older seaborn versions.
    sns.barplot(x=list(top_words), y=list(top_counts), palette="viridis")
    plt.xticks(rotation=45)
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    plt.title(title)
    plt.show()

# Top-20 word frequencies of the training sources before and after preprocessing.
plot_most_common_words(source_train, "Most Common Words Before Cleaning (Train Source)")
plot_most_common_words(clean_train_source, "Most Common Words After Cleaning (Train Source)")

In [None]:
def generate_embeddings(texts, batch_size=32):
    """Encode `texts` with the module-level `sbert_model`, one batch at a time.

    Args:
        texts: Sliceable sequence of strings (e.g. a list).
        batch_size: Number of texts encoded per model call.

    Returns:
        A list with one NumPy vector per input text, in input order.
    """
    vectors = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Generating Embeddings"):
        chunk = texts[start : start + batch_size]
        encoded = sbert_model.encode(chunk, convert_to_tensor=True)
        # Move the tensor to host memory and append its rows individually.
        vectors += list(encoded.cpu().numpy())
    return vectors

# Embed every cleaned training source document (passed as a plain list of strings).
source_train_embeddings = generate_embeddings(clean_train_source.tolist())

In [None]:
# Printing the list itself would emit the repr of every vector, so show only
# how many embeddings there are and the shape of one of them.
len(source_train_embeddings), source_train_embeddings[0].shape

In [None]:
# FAISS (Facebook AI Similarity Search) efficiently searches for similar text embeddings.
import faiss

# Stack the per-document vectors into a single float32 matrix for FAISS.
source_train_embeddings_np = np.asarray(source_train_embeddings, dtype="float32")

# Create a flat (exhaustive-search) FAISS index using L2 (Euclidean) distance.
index = faiss.IndexFlatL2(source_train_embeddings_np.shape[1])
index.add(source_train_embeddings_np)  # Add embeddings to the index

# Example: the 5 nearest neighbours of the first document. The query vector is
# itself in the index, so it is expected to come back as its own closest match.
query_embedding = source_train_embeddings_np[0].reshape(1, -1)
D, I = index.search(query_embedding, 5)  # Returns distances (D) and indices (I)

print("Top 5 similar document indices:", I)
# IndexFlatL2 reports squared L2 distances, not similarity scores: smaller
# values mean more similar documents.
print("Top 5 squared L2 distances (smaller = more similar):", D)



In [None]:
# T5 (Text-to-Text Transfer Transformer) is used to generate summaries from the source text.

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pre-trained "t5-small" model and its matching tokenizer; both are
# used as module-level objects by summarize_text() below.
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")

def summarize_text(text, max_length=50):
    """Summarize `text` with the module-level T5 model and tokenizer.

    Args:
        text: Document to summarize; the input is truncated to 512 tokens.
        max_length: Maximum length passed to generation for the summary.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # T5 selects the task from a textual prefix.
    prompt = "summarize: " + text
    encoded = t5_tokenizer.encode(
        prompt, return_tensors="pt", max_length=512, truncation=True
    )

    # Beam search with 4 beams, stopping once all beams are finished.
    generation_options = {
        "max_length": max_length,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = t5_model.generate(encoded, **generation_options)
    best_sequence = generated[0]
    return t5_tokenizer.decode(best_sequence, skip_special_tokens=True)

# Example: summarize the first training document. The raw (uncleaned) source
# text is used here, not the preprocessed version.
example_summary = summarize_text(source_train[0])
print("Original Text:", source_train[0])
print("Summarized Text:", example_summary)


In [None]:
# BERTopic and LDA are used for topic modeling and document clustering.

# Implementation using BERTopic:
from bertopic import BERTopic

# Initialize and fit the BERTopic model on the cleaned documents.
# BERTopic accepts precomputed embeddings only as a NumPy array (or sparse
# matrix), not as a Python list of vectors, so stack them into one matrix.
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(
    clean_train_source.tolist(),
    np.asarray(source_train_embeddings, dtype="float32"),
)

# Display the topics. Jupyter renders only the last expression of a cell, so
# intermediate results need an explicit display() (provided by the IPython kernel).
display(topic_model.get_topic_info().head(10))

# Visualize the topics
display(topic_model.visualize_barchart(top_n_topics=10))

# Implementation using LDA (Latent Dirichlet Allocation):
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Tokenize the cleaned source text on whitespace.
tokenized_texts = [text.split() for text in clean_train_source.tolist()]

# Build the vocabulary and the bag-of-words corpus.
dictionary = Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

# Train the LDA model. LDA training is randomized; a fixed random_state makes
# the topics reproducible across "Restart & Run All".
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=10,
    random_state=42,
)

# Display the top words for each topic.
for idx, topic in lda_model.show_topics(formatted=True, num_words=10):
    print(f"Topic {idx}: {topic}")


In [None]:
# Use FAISS for fast similarity search over the source document embeddings.

# Use T5 to generate summaries from documents.

# Use BERTopic and LDA for topic modeling and document clustering.