## Setup

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

file = "bbc-news-data.csv"
data = pd.read_csv(file,sep="\t")
X_train, X_test, y_train, y_test = train_test_split(data['content'], data['category'], test_size=0.2, random_state=42)


In [22]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)

In [53]:
import gensim


ImportError: cannot import name 'Iterable' from 'collections' (C:\Users\prpaj\AppData\Local\Programs\Python\Python312\Lib\collections\__init__.py)

In [19]:
import pandas as pd
import gensim
from gensim.summarization import summarize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords') 
nltk.download('wordnet') 

file = "bbc-news-data.csv"
data = pd.read_csv(file, sep="\t")

column_to_summarize = "content"

def process(text):
    stop_words = set(stopwords.words('english'))
    text = text.encode('ascii', 'ignore').decode('ascii') 
    words = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    return ' '.join(lemmatized_words)

def summarize_row(text):
    try:
        processed_text = process(text)
        return summarize(processed_text, ratio=0.2)  
    except ValueError: 
        return "Unable to summarize" 

data['summary'] = data[column_to_summarize].apply(summarize_row)

data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prpaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prpaj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Word2Vec

In [18]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import string
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk import WordNetLemmatizer

def process(text):
    stop_words = set(stopwords.words('english'))
    text = text.encode('ascii', 'ignore').decode('ascii')  
    words = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    return ' '.join(lemmatized_words)

file = "bbc-news-data.csv"
data = pd.read_csv(file, sep="\t")

data['content'] = data['content'].astype(str)
data['content'] = data['content'].apply(process)

sentences = data['content'].apply(sent_tokenize)

all_words = [word_tokenize(sent) for sent in sentences if isinstance(sent, str)] 
model = Word2Vec(all_words, size=100, window=5, min_count=1) 
model.build_vocab(sentences, update=False) 
model.train(all_words, total_examples=model.corpus_count, epochs=model.epochs)

def create_summary(text):
    sentences = sent_tokenize(text)
    sentence_embeddings = [
        sum([model.wv[word] for word in word_tokenize(sent) if word in model.wv]) / len(word_tokenize(sent))
        for sent in sentences
    ]
    similarity_matrix = cosine_similarity(sentence_embeddings)
    sentence_scores = similarity_matrix.diagonal() 
    top_n = int(0.2 * len(sentences)) 
    top_sentence_indices = [idx for idx, score in sorted(enumerate(sentence_scores), key=lambda x: x[1], reverse=True)][:top_n]
    summary = ' '.join([sentences[i] for i in top_sentence_indices])
    return summary

data['summary'] = data['content'].apply(create_summary)
data.head()


RuntimeError: you must first build vocabulary before training the model

# BERT

In [None]:
import pandas as pd
import torch
from transformers import AutoModel, BertTokenizer
import nltk
from nltk import WordNetLemmatizer, sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt') 
nltk.download('stopwords')
nltk.download('wordnet') 

def process(text):
    stop_words = set(stopwords.words('english'))
    text = text.encode('ascii', 'ignore').decode('ascii')
    words = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    return ' '.join(lemmatized_words)

file = "bbc-news-data.csv"
data = pd.read_csv(file,sep="\t") 
data['content'] = data['content'].apply(process)

model_name = "distilbert-base-uncased"  
model = AutoModel.from_pretrained(model_name) 
tokenizer = BertTokenizer.from_pretrained(model_name)  

def create_summary_hybrid(text):
    sentences = sent_tokenize(text) 
    input_ids = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512) 

    with torch.no_grad():
        if 'token_type_ids' in input_ids:
            del input_ids['token_type_ids'] 
        outputs = model(**input_ids)
        embeddings = outputs.last_hidden_state.mean(dim=1)

    similarity_matrix = cosine_similarity(embeddings)
    sentence_scores = similarity_matrix.diagonal() 
    top_n = int(0.2 * len(sentences)) 
    top_sentence_indices = [idx for idx, score in sorted(enumerate(sentence_scores), key=lambda x: x[1], reverse=True)][:top_n]
    top_sentences = [sentences[i] for i in top_sentence_indices]

    summary = ". ".join(top_sentences)
    return summary

data['summary'] = data['content'].apply(create_summary_hybrid)
data.head()
