In [2]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AdamW
import pandas as pd

from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

class HindiSummarizationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        article = str(self.data.iloc[index, 0])
        summary = str(self.data.iloc[index, 1])

        article_encoding = self.tokenizer(
            article,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        summary_encoding = self.tokenizer(
            summary,
            max_length=self.max_len // 4,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        labels = summary_encoding['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': article_encoding['input_ids'].flatten(),
            'attention_mask': article_encoding['attention_mask'].flatten(),
            'labels': labels.flatten()
        }

def load_dataset(dataset_path, tokenizer, val_split=0.1):
    dataframe = pd.read_csv(dataset_path)
    dataset = HindiSummarizationDataset(dataframe, tokenizer)

    val_size = int(len(dataset) * val_split)
    train_size = len(dataset) - val_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    return train_dataset, val_dataset

def create_data_loader(dataset, batch_size):
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

def train_model(model, data_loader, optimizer, device, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {total_loss / len(data_loader):.4f}")

def eval_model(model, data_loader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()
    return total_loss / len(data_loader)

def generate_summary(article_text, model, tokenizer, max_input_len=512, max_output_len=128, device='cpu'):
    model.eval()
    inputs = tokenizer(
        article_text,
        max_length=max_input_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    ).to(device)

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_output_len,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def main():
    model_name = 'google/mt5-small'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    train_dataset_path = '/content/drive/MyDrive/Colab Notebooks/cleaned_train.csv'
    test_dataset_path = '/content/drive/MyDrive/Colab Notebooks/cleaned_test.csv'

    train_dataset, val_dataset = load_dataset(train_dataset_path, tokenizer)
    test_dataset, _ = load_dataset(test_dataset_path, tokenizer)

    train_data_loader = create_data_loader(train_dataset, batch_size=8)
    val_data_loader = create_data_loader(val_dataset, batch_size=8)
    test_data_loader = create_data_loader(test_dataset, batch_size=8)

    optimizer = AdamW(model.parameters(), lr=1e-4)

    epochs = 30  # Increase number of epochs

    train_model(model, train_data_loader, optimizer, device, epochs)

    torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/hindi_summarization_model.pth')
    print("Model saved successfully.")

    test_loss = eval_model(model, test_data_loader, device)
    print(f"Test Loss: {test_loss:.4f}")

    model.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/hindi_summarization_model.pth'))
    print("Model loaded successfully.")

    # Example usage of generating summary
    article_text = "प्रधानमंत्री नरेंद्र मोदी पठानकोट एयरबेस पहुंच गए हैं. वे एयरबेस में सुरक्षा के हालात का जायजा ले रहे हैं और वायुसेनाकर्मियों से मिल रहे हैं. सुबह करीब सवा दस बजे प्रधानमंत्री पंजाब के पठानकोट के लिए रवाना हुए. एयरबेस का जायजा लेने के बाद प्रधानमंत्री बॉर्डर इलाकों का हवाई सर्वेक्षण भी करेंगे. पठानकोट एयरबेस पर पिछले हफ्ते आतंकियों ने हमला किया था. पाकिस्तान से आए आतंकियों के हमले को विफल कर दिया गया था. सभी 6 पाकिस्तानी आतंकी मारे गए थे. 7 सुरक्षाबल भी शहीद हुए थे. भारत ने पाकिस्तान को सबूत सौंपते हुए दोषियों के खिलाफ सख्त कार्रवाई करने को कहा है. जानकारी के मुताबिक, प्रधानमंत्री के साथ आर्मी और एयरफोर्स के चीफ भी मौजूद रह सकते हैं. एयरबेस पर पाकिस्तानी आतंकियों ने हफ्ते भर पहले हमला किया था, जिसमें सात जवान शहीद हुए थे. सुरक्षा बलों ने मुठभेड़ में सभी छह आतंकियों को मार गिराया था, जबकि करीब पांच दिनों तक पूरे इलाके में तलाशी अभि‍यान चलाया गया था. पर"

    generated_summary = generate_summary(article_text, model, tokenizer, device=device)
    print("Generated Summary:", generated_summary)

if __name__ == "__main__":
    main()

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Epoch 1/30, Train Loss: 11.0274
Epoch 2/30, Train Loss: 7.0447
Epoch 3/30, Train Loss: 6.2018
Epoch 4/30, Train Loss: 5.7409
Epoch 5/30, Train Loss: 5.4549
Epoch 6/30, Train Loss: 5.2219
Epoch 7/30, Train Loss: 5.0734
Epoch 8/30, Train Loss: 4.9602
Epoch 9/30, Train Loss: 4.7993
Epoch 10/30, Train Loss: 4.7174
Epoch 11/30, Train Loss: 4.6501
Epoch 12/30, Train Loss: 4.5619
Epoch 13/30, Train Loss: 4.4970
Epoch 14/30, Train Loss: 4.4319
Epoch 15/30, Train Loss: 4.3720
Epoch 16/30, Train Loss: 4.3256
Epoch 17/30, Train Loss: 4.2807
Epoch 18/30, Train Loss: 4.2115
Epoch 19/30, Train Loss: 4.1732
Epoch 20/30, Train Loss: 4.1450
Epoch 21/30, Train Loss: 4.1061
Epoch 22/30, Train Loss: 4.0707
Epoch 23/30, Train Loss: 4.0065
Epoch 24/30, Train Loss: 3.9930
Epoch 25/30, Train Loss: 3.9399
Epoch 26/30, Train Loss: 3.9035
Epoch 27/30, Train Loss: 3.8720
Epoch 28/30, Train Loss: 3.8373
Epoch 29/30, Train Loss: 3.8205
Epoch 30/30, Train Loss: 3.7760
Model saved successfully.
Test Loss: 2.9771
Mode

In [None]:
generated_summary = generate_summary(article_text, model, tokenizer, device=device)
    print("Generated Summary:", generated_summary)

if __name__ == "__main__":
    main()


In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd

from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

def generate_summary(article_text, model, tokenizer, max_input_len=512, max_output_len=128, device='cpu'):
    model.eval()

    # Tokenize input text
    inputs = tokenizer(
        article_text,
        max_length=max_input_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    ).to(device)

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Generate summary
    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_output_len,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

    # Decode the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def main():
    model_name = 'google/mt5-small'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    train_dataset_path = '/content/drive/MyDrive/Colab Notebooks/cleaned_train.csv'

    # Load the dataset
    dataframe = pd.read_csv(train_dataset_path)

    # Process the first 5 rows
    for index in range(5):
        article_text = dataframe.iloc[index, 0]  # Assuming the article is in the first column
        generated_summary = generate_summary(article_text, model, tokenizer, device=device)
        print(f"Generated Summary for article {index+1}:", generated_summary)

if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Generated Summary for article 1: <extra_id_0>
Generated Summary for article 2: <extra_id_0> 
Generated Summary for article 3: <extra_id_0> »
Generated Summary for article 4: <extra_id_0>
Generated Summary for article 5: <extra_id_0>.


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def generate_summary(article_text, model, tokenizer, max_input_len=512, max_output_len=150, device='cpu'):
    model.eval()

    inputs = tokenizer.encode(
        article_text,
        return_tensors="pt",
        max_length=max_input_len,
        truncation=True
    ).to(device)

    with torch.no_grad():
        summary_ids = model.generate(
            inputs,
            max_length=max_output_len,
            min_length=20,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd

# Load the test dataset
test_dataset_path = '/content/drive/MyDrive/Colab Notebooks/cleaned_test.csv'
test_dataset = pd.read_csv(test_dataset_path)

# Initialize T5 tokenizer and model
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def generate_summary(article_text, model, tokenizer, max_input_len=512, max_output_len=150, device='cpu'):
    model.eval()

    inputs = tokenizer.encode(
        article_text,
        return_tensors="pt",
        max_length=max_input_len,
        truncation=True
    ).to(device)

    with torch.no_grad():
        summary_ids = model.generate(
            inputs,
            max_length=max_output_len,
            min_length=20,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3,
            top_k=50,
            top_p=0.95,
            temperature=1.0
        )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def main():
    for i in range(5):  # Generating summaries for the first 5 rows
        article_text = test_dataset.iloc[i]['article']
        generated_summary = generate_summary(article_text, model, tokenizer, device=device)
        print(generated_summary)

if __name__ == "__main__":
    main()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


,.     47   ,,.
, ‘   .     175            200  ( 153 )  42 (
,          .          ()           ; : (), ",
.       ,      -          50-55        () -.
,  ‘-’     , ‘    .  Rutherford shoots Pant's foot https://t.co/Vul2Br7OT4 via @ipl — bishwa mohan mishr


In [6]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AdamW
import pandas as pd

from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

class HindiSummarizationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        article = str(self.data.iloc[index, 0])
        summary = str(self.data.iloc[index, 1])

        article_encoding = self.tokenizer(
            article,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        summary_encoding = self.tokenizer(
            summary,
            max_length=self.max_len // 4,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        labels = summary_encoding['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': article_encoding['input_ids'].flatten(),
            'attention_mask': article_encoding['attention_mask'].flatten(),
            'labels': labels.flatten()
        }

def load_dataset(dataset_path, tokenizer, val_split=0.1):
    dataframe = pd.read_csv(dataset_path)
    dataset = HindiSummarizationDataset(dataframe, tokenizer)

    val_size = int(len(dataset) * val_split)
    train_size = len(dataset) - val_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    return train_dataset, val_dataset

def create_data_loader(dataset, batch_size):
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

def train_model(model, data_loader, optimizer, device, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {total_loss / len(data_loader):.4f}")

def eval_model(model, data_loader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()
    return total_loss / len(data_loader)

def generate_summary(article_text, model, tokenizer, max_input_len=512, max_output_len=128, device='cpu'):
    model.eval()
    inputs = tokenizer(
        article_text,
        max_length=max_input_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    ).to(device)

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_output_len,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def main():
    model_name = 'google/mt5-small'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    train_dataset_path = '/content/drive/MyDrive/Colab Notebooks/cleaned_train.csv'
    test_dataset_path = '/content/drive/MyDrive/Colab Notebooks/cleaned_test.csv'

    train_dataset, val_dataset = load_dataset(train_dataset_path, tokenizer)
    test_dataset, _ = load_dataset(test_dataset_path, tokenizer)

    train_data_loader = create_data_loader(train_dataset, batch_size=8)
    val_data_loader = create_data_loader(val_dataset, batch_size=8)
    test_data_loader = create_data_loader(test_dataset, batch_size=8)

    optimizer = AdamW(model.parameters(), lr=1e-4)

    epochs = 20  # Increase number of epochs

    train_model(model, train_data_loader, optimizer, device, epochs)

    torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/hindi_summarization_model.pth')
    print("Model saved successfully.")

    test_loss = eval_model(model, test_data_loader, device)
    print(f"Test Loss: {test_loss:.4f}")

    model.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/hindi_summarization_model.pth'))
    print("Model loaded successfully.")

    # Example usage of generating summary
    article_text = "प्रधानमंत्री नरेंद्र मोदी पठानकोट एयरबेस पहुंच गए हैं. वे एयरबेस में सुरक्षा के हालात का जायजा ले रहे हैं और वायुसेनाकर्मियों से मिल रहे हैं. सुबह करीब सवा दस बजे प्रधानमंत्री पंजाब के पठानकोट के लिए रवाना हुए. एयरबेस का जायजा लेने के बाद प्रधानमंत्री बॉर्डर इलाकों का हवाई सर्वेक्षण भी करेंगे. पठानकोट एयरबेस पर पिछले हफ्ते आतंकियों ने हमला किया था. पाकिस्तान से आए आतंकियों के हमले को विफल कर दिया गया था. सभी 6 पाकिस्तानी आतंकी मारे गए थे. 7 सुरक्षाबल भी शहीद हुए थे. भारत ने पाकिस्तान को सबूत सौंपते हुए दोषियों के खिलाफ सख्त कार्रवाई करने को कहा है. जानकारी के मुताबिक, प्रधानमंत्री के साथ आर्मी और एयरफोर्स के चीफ भी मौजूद रह सकते हैं. एयरबेस पर पाकिस्तानी आतंकियों ने हफ्ते भर पहले हमला किया था, जिसमें सात जवान शहीद हुए थे. सुरक्षा बलों ने मुठभेड़ में सभी छह आतंकियों को मार गिराया था, जबकि करीब पांच दिनों तक पूरे इलाके में तलाशी अभि‍यान चलाया गया था. पर"

    generated_summary = generate_summary(article_text, model, tokenizer, device=device)
    print("Generated Summary:", generated_summary)

if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Epoch 1/20, Train Loss: 11.2116
Epoch 2/20, Train Loss: 7.1601
Epoch 3/20, Train Loss: 6.2313
Epoch 4/20, Train Loss: 5.7600
Epoch 5/20, Train Loss: 5.4526
Epoch 6/20, Train Loss: 5.2397
Epoch 7/20, Train Loss: 5.0480
Epoch 8/20, Train Loss: 4.9436
Epoch 9/20, Train Loss: 4.8922
Epoch 10/20, Train Loss: 4.7287
Epoch 11/20, Train Loss: 4.6199
Epoch 12/20, Train Loss: 4.6017
Epoch 13/20, Train Loss: 4.5148
Epoch 14/20, Train Loss: 4.4462
Epoch 15/20, Train Loss: 4.3739
Epoch 16/20, Train Loss: 4.3212
Epoch 17/20, Train Loss: 4.2719
Epoch 18/20, Train Loss: 4.2464
Epoch 19/20, Train Loss: 4.1890
Epoch 20/20, Train Loss: 4.1491
Model saved successfully.
Test Loss: 3.1006
Model loaded successfully.
Generated Summary: बिहार के मुख्यमंत्री नरेंद्र मोदी के साथ आतंकी और पाकिस्तानी आतंकियों ने हमला कर दिया गया है. पाकिस्तानी आतंकियों ने हमला कर दिया गया है. प्रधानमंत्री नरेंद्र मोदी ने पाकिस्तानी आतंकियों के लिए पाकिस्तानी आतंकियों ने हमला कर दिया गया है. प्रधानमंत्री नरेंद्र मोदी ने पाकिस्तानी 

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AdamW
import pandas as pd

from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

class HindiSummarizationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        article = str(self.data.iloc[index, 0])
        summary = str(self.data.iloc[index, 1])

        article_encoding = self.tokenizer(
            article,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        summary_encoding = self.tokenizer(
            summary,
            max_length=self.max_len // 4,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        labels = summary_encoding['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': article_encoding['input_ids'].flatten(),
            'attention_mask': article_encoding['attention_mask'].flatten(),
            'labels': labels.flatten()
        }

def load_dataset(dataset_path, tokenizer, val_split=0.1):
    dataframe = pd.read_csv(dataset_path)
    dataset = HindiSummarizationDataset(dataframe, tokenizer)

    val_size = int(len(dataset) * val_split)
    train_size = len(dataset) - val_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    return train_dataset, val_dataset

def create_data_loader(dataset, batch_size):
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

def train_model(model, data_loader, optimizer, device, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {total_loss / len(data_loader):.4f}")

def eval_model(model, data_loader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()
    return total_loss / len(data_loader)

def generate_summary(article_text, model, tokenizer, max_input_len=512, max_output_len=128, device='cpu'):
    model.eval()
    inputs = tokenizer(
        article_text,
        max_length=max_input_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    ).to(device)

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_output_len,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def main():
    model_name = 'google/mt5-small'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    train_dataset_path = '/content/drive/MyDrive/Colab Notebooks/cleaned_train.csv'
    test_dataset_path = '/content/drive/MyDrive/Colab Notebooks/cleaned_test.csv'

    train_dataset, val_dataset = load_dataset(train_dataset_path, tokenizer)
    test_dataset, _ = load_dataset(test_dataset_path, tokenizer)

    train_data_loader = create_data_loader(train_dataset, batch_size=8)
    val_data_loader = create_data_loader(val_dataset, batch_size=8)
    test_data_loader = create_data_loader(test_dataset, batch_size=8)

    optimizer = AdamW(model.parameters(), lr=1e-4)

    epochs = 50  # Increase number of epochs

    train_model(model, train_data_loader, optimizer, device, epochs)

    torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/hindi_summarization_model.pth')
    print("Model saved successfully.")

    test_loss = eval_model(model, test_data_loader, device)
    print(f"Test Loss: {test_loss:.4f}")

    model.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/hindi_summarization_model.pth'))
    print("Model loaded successfully.")

    # Example usage of generating summary
    article_text = "प्रधानमंत्री नरेंद्र मोदी पठानकोट एयरबेस पहुंच गए हैं. वे एयरबेस में सुरक्षा के हालात का जायजा ले रहे हैं और वायुसेनाकर्मियों से मिल रहे हैं. सुबह करीब सवा दस बजे प्रधानमंत्री पंजाब के पठानकोट के लिए रवाना हुए. एयरबेस का जायजा लेने के बाद प्रधानमंत्री बॉर्डर इलाकों का हवाई सर्वेक्षण भी करेंगे. पठानकोट एयरबेस पर पिछले हफ्ते आतंकियों ने हमला किया था. पाकिस्तान से आए आतंकियों के हमले को विफल कर दिया गया था. सभी 6 पाकिस्तानी आतंकी मारे गए थे. 7 सुरक्षाबल भी शहीद हुए थे. भारत ने पाकिस्तान को सबूत सौंपते हुए दोषियों के खिलाफ सख्त कार्रवाई करने को कहा है. जानकारी के मुताबिक, प्रधानमंत्री के साथ आर्मी और एयरफोर्स के चीफ भी मौजूद रह सकते हैं. एयरबेस पर पाकिस्तानी आतंकियों ने हफ्ते भर पहले हमला किया था, जिसमें सात जवान शहीद हुए थे. सुरक्षा बलों ने मुठभेड़ में सभी छह आतंकियों को मार गिराया था, जबकि करीब पांच दिनों तक पूरे इलाके में तलाशी अभि‍यान चलाया गया था. पर"

    generated_summary = generate_summary(article_text, model, tokenizer, device=device)
    print("Generated Summary:", generated_summary)

if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
