                                                            * Downloading libraries *                                                                               

In [None]:
!pip3 install nltk rouge-score
!pip3 install torch==2.1.0 torchtext==0.16.0  # Example, use versions compatible with each other
!pip3 install pandas
!pip3 install transformers
!pip3 install sentencepiece
!pip3 install bert-extractive-summarizer
!pip3 install numpy==1.22.4
!pip3 install bert-extractive-summarizer transformers

In [None]:
!pip3 cache purge

                                                                Importing libraries                                                                                 

In [None]:
import torch
import torchtext
import sentencepiece as spm
import torch.nn as nn
import torch.optim as optim
import nltk
import random
import numpy as np
import pandas as pd
from summarizer import Summarizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the NLTK tokenizer resources requested by this notebook.
for resource_name in ('punkt', 'punkt_tab'):
    nltk.download(resource_name)

There are two types of summarization:
1. Abstractive text summarization: The summary usually uses different words and phrases to concisely convey the same meaning as the original text.

2. Extractive summarization: The summary contains the most important sentences from the original input text sentences without any paraphrasing or changes. The sentences deemed unnecessary are discarded.

Models tried for Use Case 3 : 

1. BART (Bidirectional and Auto-Regressive Transformers) Model 
2. T5 (Text-to-Text Transfer Transformer) Model 
3. BERT (Bidirectional Encoder Representations from Transformers)
4. PEGASUS (Pre-training with Extracted Gap-sentences for Abstractive Summarization)

Examples of models for Extractive Text Summarization include : 

1. BERT: bert-base-uncased
2. DistilBERT: distilbert-base-uncased
3. Sentence-BERT (extractive)

                                                            Extractive BERT based pre-trained model                                                                 

In [9]:
# Loading the dataset from csv
def load_data(file_name):
    """Read the CSV at ``file_name`` with pandas and return the DataFrame."""
    return pd.read_csv(file_name)

def print_table(df):
    """Print ``df`` to stdout as a plain-text table with padded columns.

    Every column is left-justified to the width of its widest entry, where the
    header label counts as an entry. Columns are separated by `` | `` and a
    dashed rule as long as the header line is printed beneath the header.

    Args:
        df: pandas DataFrame to render. It may be empty; column labels may be
            of any type (they are rendered with ``str``).

    Returns:
        None. The table is written with ``print``.
    """
    # Render labels and cells to strings once, column by column, so that the
    # width computation and the printed text are based on the same strings.
    labels = [str(col) for col in df.columns]
    columns_as_text = [
        [str(value) for value in df.iloc[:, position]]
        for position in range(len(labels))
    ]

    # Seeding the max() with the label length keeps rows aligned when a header
    # is wider than every value, and avoids max() on an empty sequence when
    # the frame has no rows.
    widths = [
        max([len(label)] + [len(cell) for cell in cells])
        for label, cells in zip(labels, columns_as_text)
    ]

    # Print the table header with padded columns
    header = " | ".join(label.ljust(width) for label, width in zip(labels, widths))
    print(header)
    print("-" * len(header))  # Add a separator line

    # Print each row with padded columns
    for row_cells in zip(*columns_as_text):
        print(" | ".join(cell.ljust(width) for cell, width in zip(row_cells, widths)))
    
# Load the dataset from CSV
df = load_data('test.csv')

def main():
    """Summarize every row of the module-level ``df`` and print a table.

    Reads the ``'text'`` column of each row, runs it through the extractive
    summarizer, and prints one table row per input holding the 1-based
    serial number (``index + 1``), the extractive summary, and the original
    text.

    Returns:
        None. The table is written to stdout via ``print_table``.
    """
    # Step 1: Initialize the Summarizer with the 'distilbert-base-uncased'
    # checkpoint. This is done once, outside the loop: the model does not
    # depend on the row, and constructing it is the expensive step.
    distilbert_model = Summarizer('distilbert-base-uncased')

    results = []

    # Iterate over the rows of the dataframe
    for index, row in df.iterrows():
        text = row['text']

        # Step 2: Perform Extractive Summarization.
        # ratio=0.2 is forwarded to the summarizer unchanged (per the
        # library's README it is the fraction of sentences to keep).
        predicted_summary = distilbert_model(text, ratio=0.2)

        # Step 3: Collect the summary next to its source text
        results.append({
            'S.No': index + 1,
            'Extractive Summary': predicted_summary,
            'Original Text': text,
        })

    # Convert the results list into a DataFrame
    results_df = pd.DataFrame(results)

    # Print the results in a padded tabular format
    print_table(results_df)

if __name__ == "__main__":
    main()


S.No | Extractive Summary                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          | Original Text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

                                                            MODEL CREATED FOR EXTRACTIVE SUMMARIZATION                                                            

In [None]:

    
# Sample Document (You would replace this with multiple documents in a real setting)
text = """
How does the BART summarization model compare to the other summarization models out there? Research groups still compare these models using the old recall-oriented understudy for gisting evaluation (ROUGE) metrics. But ROUGE looks for common words and n-grams between the generated and reference summaries — the more there are, the higher the score. Since abstractive models paraphrase the text, they may not score well, and high scores may not result in good summaries under real-world conditions.
"""

# Step 1: Tokenize the text into sentences
sentences = sent_tokenize(text)

# Step 2: Convert each sentence into an embedding vector (using random embeddings for simplicity)
# In a real scenario, you'd use pre-trained embeddings like GloVe or BERT
embedding_dim = 50
# A seeded generator makes the placeholder embeddings identical on every run,
# so the notebook's outputs are reproducible after a kernel restart.
embedding_rng = np.random.default_rng(42)
sentence_embeddings = embedding_rng.random((len(sentences), embedding_dim))

# Step 3: Create labeled data (for demonstration, let's label the first 2 sentences as "important")
labels = [1 if i < 2 else 0 for i in range(len(sentences))]  # 1: Important, 0: Not Important
labels = torch.tensor(labels, dtype=torch.float32).view(-1, 1)  # Ensure the shape is [batch_size, 1]

# Step 4: Split data into train and test sets
# The split is done on NumPy arrays (labels.numpy()) rather than on a torch
# tensor, so it does not depend on how scikit-learn indexes non-NumPy inputs.
X_train, X_test, y_train, y_test = train_test_split(
    sentence_embeddings, labels.numpy(), test_size=0.2, random_state=42
)

# Convert data to tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)  # Shape: [n_train, 1]
y_test = torch.tensor(y_test, dtype=torch.float32)  # Shape: [n_test, 1]

# Step 5: Define a simple neural network model for scoring sentences
class SentenceScoringModel(nn.Module):
    """Score sentence embeddings with an LSTM followed by a sigmoid unit.

    The embedding goes through a single ``nn.LSTM`` layer; the final hidden
    state is projected to one value by a linear layer and squashed into
    (0, 1) by a sigmoid.

    Args:
        input_dim: Size of each sentence embedding vector.
        hidden_dim: Size of the LSTM hidden state. Defaults to 32, the value
            that was previously hard-coded.
    """

    def __init__(self, input_dim, hidden_dim=32):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one score in (0, 1) per input row.

        Args:
            x: Either a 3-D tensor ``(batch_size, seq_len, input_dim)`` or a
                2-D tensor ``(batch_size, input_dim)``. A 2-D tensor is treated
                as a batch of length-1 sequences.

        Returns:
            Tensor of shape ``(batch_size, 1)``.
        """
        # nn.LSTM interprets a 2-D tensor as ONE unbatched sequence, which
        # would collapse a whole batch of sentences into a single score.
        # Insert the seq_len=1 axis so each row is scored independently.
        if x.dim() == 2:
            x = x.unsqueeze(1)  # Reshape to (batch_size, seq_len=1, input_size)

        # Forward pass through LSTM; only the final hidden state is used
        _, (hidden, _) = self.lstm(x)
        output = self.fc(hidden[-1])  # Feed to fully connected layer
        return self.sigmoid(output)  # Sigmoid output between 0 and 1

# Instantiate model, define loss function and optimizer
torch.manual_seed(42)  # Fix the weight initialisation so training is reproducible
model = SentenceScoringModel(input_dim=embedding_dim)
criterion = nn.BCELoss()  # Binary Cross Entropy for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 6: Train the model
num_epochs = 100
batch_size = 2  # Define batch size

if len(X_train) == 0:
    # Without any batch the loop below would never define `loss`.
    raise ValueError("X_train is empty; there is nothing to train on.")

for epoch in range(num_epochs):
    model.train()

    # Step through the training set in strides of batch_size. Unlike an
    # integer-division batch count, this also visits the final, shorter batch.
    for start_idx in range(0, len(X_train), batch_size):
        end_idx = start_idx + batch_size

        # Select a batch of input and target data
        X_batch = X_train[start_idx:end_idx]  # Shape: [<=batch_size, input_size]
        y_batch = y_train[start_idx:end_idx]  # Shape: [<=batch_size, 1]

        # Ensure input tensor is in the correct shape (3D tensor)
        X_batch = X_batch.unsqueeze(1)  # Reshape to (batch, seq_len=1, input_size)

        # Clear gradients before EVERY step; clearing only once per epoch
        # lets gradients from earlier batches leak into later updates.
        optimizer.zero_grad()

        # Forward pass through the model; -1 lets a short last batch through
        predictions = model(X_batch).view(-1, 1)

        # Calculate the loss and update the weights
        loss = criterion(predictions, y_batch)
        loss.backward()
        optimizer.step()

    if (epoch+1) % 20 == 0:
        # Reports the loss of the last batch processed in this epoch
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Step 7: Evaluate the model on test data
model.eval()
with torch.no_grad():
    # Add the seq_len=1 axis, exactly as the training loop does. A 2-D tensor
    # would be read by nn.LSTM as a single unbatched sequence and yield one
    # prediction for the whole test set instead of one per sentence.
    test_inputs = X_test.unsqueeze(1)  # Shape: [n_test, 1, input_size]
    test_predictions = model(test_inputs).view(-1, 1)  # Ensure test predictions are [n_test, 1]
    test_loss = criterion(test_predictions, y_test)
    print(f'Test Loss: {test_loss.item():.4f}')

# Step 8: Generate Summary
# Run the model on all sentences in the document to score them.
# All sentences are scored in one batched forward pass; the per-sentence
# debug print of tensor shapes has been removed.
model.eval()
with torch.no_grad():
    all_sentence_inputs = torch.tensor(sentence_embeddings, dtype=torch.float32)
    all_sentence_inputs = all_sentence_inputs.unsqueeze(1)  # Shape: [n_sentences, 1, input_size]
    sentence_scores = model(all_sentence_inputs).view(-1).tolist()

# Select the top N sentences with the highest scores for the summary
N = 2  # Number of sentences for the summary
top_sentence_indices = np.argsort(sentence_scores)[-N:][::-1]  # Highest score first

# Emit the selected sentences in their original document order so the
# summary reads coherently, instead of in descending-score order.
summary = ' '.join(sentences[i] for i in sorted(top_sentence_indices))

print("\n text:\n",text)
print("\nExtractive Summary:\n", summary)
