# Loading in Data

In [None]:
import os
import pandas as pd

# Root directory of the dataset; it holds the 'News Articles' and 'Summaries' trees
base_folder = '/kaggle/input/bbc-news-summary/BBC News Summary'

# Parallel accumulators: entry i of one list pairs with entry i of the other
article_texts, summaries = [], []

In [None]:
# Collect (article, summary) pairs from the mirrored 'News Articles' and
# 'Summaries' directory trees under `base_folder`.
#
# The accumulators are reset here so that re-running this cell does not
# append every document a second time.
article_texts = []
summaries = []

articles_root = os.path.join(base_folder, 'News Articles')
summaries_root = os.path.join(base_folder, 'Summaries')

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/validation/test split) stable between runs.
for folder_name in sorted(os.listdir(articles_root)):
    article_folder = os.path.join(articles_root, folder_name)
    summary_folder = os.path.join(summaries_root, folder_name)

    # Skip stray files sitting next to the sub-folders
    if not os.path.isdir(article_folder):
        continue

    for file_name in sorted(os.listdir(article_folder)):
        # A summary is expected under the same relative path as its article
        article_file = os.path.join(article_folder, file_name)
        summary_file = os.path.join(summary_folder, file_name)

        # Only keep documents for which both halves of the pair exist
        if not (os.path.isfile(article_file) and os.path.isfile(summary_file)):
            continue

        try:
            # Context managers close both handles even if decoding fails
            with open(article_file, 'r', encoding='utf-8') as article_handle, \
                    open(summary_file, 'r', encoding='utf-8') as summary_handle:
                article_text = article_handle.read()
                summary_text = summary_handle.read()
        except UnicodeDecodeError:
            # Drop the whole pair so the two lists stay aligned
            print(f"File with incorrect encoding: {article_file}")
            continue

        # Append only after both reads succeeded
        article_texts.append(article_text)
        summaries.append(summary_text)

# Create a Pandas DataFrame with the collected data
data = {'Article Text': article_texts, 'Summary': summaries}
df = pd.DataFrame(data)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Reuse the DataFrame assembled from the article/summary files
data = df

# Pin the two column names the rest of the notebook looks up
data.columns = ['Article Text', 'Summary']

# Hold out 30% of the rows, then halve that hold-out into validation and test
train_data, temp_data = train_test_split(data, random_state=42, test_size=0.3)
val_data, test_data = train_test_split(temp_data, random_state=42, test_size=0.5)

# Result: train_data (70%), val_data (15%) and test_data (15%),
# ready for preprocessing and model training

# Preparing Data for Training

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from tqdm import tqdm  # Import tqdm

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def convert_to_features(index, row):
    """Tokenize one article/summary pair into model inputs and training labels.

    Args:
        index: Row label supplied by ``DataFrame.iterrows()``; unused, kept so
            existing callers can keep passing it.
        row: Mapping with an 'Article Text' and a 'Summary' string.

    Returns:
        dict with two tensors produced by ``return_tensors="pt"``:
        'input_ids' (article, padded/truncated to 512 tokens) and
        'labels' (summary, padded/truncated to 150 tokens, with padding
        positions replaced by -100).
    """
    # Encode the articles and summaries to the format expected by T5
    input_encodings = tokenizer(row['Article Text'], truncation=True, padding='max_length', max_length=512, return_tensors="pt")
    target_encodings = tokenizer(row['Summary'], truncation=True, padding='max_length', max_length=150, return_tensors="pt")

    # Label positions equal to -100 are ignored by the model's loss. Without
    # this, every padding token in the summary counts as a prediction target.
    target_ids = target_encodings['input_ids']
    labels = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)

    return {
        'input_ids': input_encodings['input_ids'],
        'labels': labels
    }


def _featurize(frame, description):
    """Run convert_to_features over every row of `frame`, showing a progress bar."""
    progress = tqdm(frame.iterrows(), total=len(frame), desc=description)
    return [convert_to_features(row_label, record) for row_label, record in progress]


# Tokenize the training and validation splits row by row
train_features = _featurize(train_data, 'Processing Training Data')
val_features = _featurize(val_data, 'Processing Validation Data')

# Train the Model

## 1. Create a Dataset Class:
Define a PyTorch dataset class to manage your data.

In [None]:
from torch.utils.data import Dataset, DataLoader

class TextSummarizationDataset(Dataset):
    """Thin ``Dataset`` wrapper around an already-built sequence of feature records."""

    def __init__(self, features):
        # Keep a reference to the records; nothing is copied or transformed here
        self.features = features

    def __getitem__(self, idx):
        """Return the record stored at position ``idx``."""
        record = self.features[idx]
        return record

    def __len__(self):
        """Return how many records the dataset holds."""
        return len(self.features)

# Wrap each split's feature list in the dataset class
train_dataset, val_dataset = (
    TextSummarizationDataset(split_features)
    for split_features in (train_features, val_features)
)

## 2. Create Data Loaders:
Set up data loaders to handle batching of data during training and validation.

In [None]:
# Batches of 8 for both splits; only the training split is shuffled
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=8)

## 3. Set Up Training Loop:
Define the optimizer and set up the training loop. No separate loss function is needed: the model returns the loss itself when `labels` are passed.

In [None]:
import torch.optim as optim
from tqdm import tqdm
import torch

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model's parameters to the selected device
model = model.to(device)

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=5e-4)

# Specify the number of epochs
num_epochs = 3


def _batch_loss(batch):
    """Run one forward pass on `batch` and return the model's loss tensor.

    Shared by the training and validation loops so both feed the model the
    same way.
    """
    # Features are tokenized one row at a time with return_tensors="pt", so
    # batching leaves a singleton dimension at position 1; drop it.
    inputs = batch['input_ids'].squeeze(dim=1).to(device)
    labels = batch['labels'].squeeze(dim=1).to(device)

    # Inputs are padded to a fixed length. Without a mask the encoder attends
    # to the padding positions as if they were real tokens.
    attention_mask = inputs.ne(model.config.pad_token_id).long()

    outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
    return outputs.loss


# Training loop
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_dataloader, desc=f'Training Epoch {epoch + 1}'):
        optimizer.zero_grad()
        loss = _batch_loss(batch)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    # Validation loop: evaluation mode, no gradient tracking
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f'Validation Epoch {epoch + 1}'):
            total_val_loss += _batch_loss(batch).item()

    # Report the mean per-batch loss for this epoch
    print(f'Epoch {epoch + 1}, Training Loss: {total_train_loss / len(train_dataloader)}, Validation Loss: {total_val_loss / len(val_dataloader)}')

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same directory
output_dir = 'Model-Files'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Test on a new article here:

In [None]:
# Example of generating a summary
def summarize(text, max_input_length=512, max_summary_length=150, num_beams=4):
    """Generate a summary of `text` with the notebook's `model` and `tokenizer`.

    Args:
        text: The article to summarize.
        max_input_length: Token limit for the input; longer text is truncated.
        max_summary_length: Upper bound on the generated sequence length.
        num_beams: Beam width used for beam search.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Place the encoded input on whichever device the model currently lives on
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    ).to(model.device)

    # generate() already returns tensors on the model's device, so no further
    # device transfer is needed before decoding.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_summary_length,
        num_beams=num_beams,
        length_penalty=2.0,
        early_stopping=True,
        no_repeat_ngram_size=2,  # This parameter can help reduce repetition
    )

    # Only the first (and only) sequence of the batch is decoded
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Sample input for a quick check of the fine-tuned model: a multi-section
# technology article stored as one triple-quoted string, passed to summarize().
article= '''
Title: "Tech Trends Shaping the Future: Innovations, Challenges, and Opportunities"

Introduction

Technology has become an integral part of our daily lives, revolutionizing the way we work, communicate, and interact with the world around us. With each passing year, we witness remarkable advancements in the tech industry that not only change the way we live but also offer new avenues for growth and development. In this article, we will explore some of the most significant tech trends that are shaping the future, as well as the challenges and opportunities they present.

1. Artificial Intelligence (AI) and Machine Learning

Artificial intelligence and machine learning have seen unprecedented growth in recent years. These technologies have the potential to transform industries such as healthcare, finance, and transportation. AI-powered chatbots, autonomous vehicles, and predictive analytics are just a few examples of how AI and machine learning are changing the way we work and live. However, ethical concerns, data privacy issues, and the need for responsible AI development are challenges that must be addressed as these technologies continue to advance.

2. Internet of Things (IoT)

The Internet of Things is connecting everyday objects and devices to the internet, enabling them to communicate and share data. From smart thermostats and wearable fitness trackers to smart cities' infrastructure, IoT is enhancing efficiency, convenience, and sustainability. However, the massive influx of data generated by IoT devices poses security and privacy concerns, requiring robust cybersecurity measures to protect sensitive information.

3. 5G Technology

The rollout of 5G networks is set to revolutionize connectivity, offering faster speeds and lower latency. This will unlock new possibilities for applications such as augmented reality (AR), virtual reality (VR), and the Internet of Things. The increased bandwidth will pave the way for innovations in telemedicine, autonomous vehicles, and remote work. However, concerns about the deployment of 5G infrastructure and potential health risks associated with prolonged exposure to high-frequency electromagnetic radiation are subjects of ongoing debate.

4. Cybersecurity

As technology continues to advance, the threat landscape in the digital world is also evolving. Cybersecurity is a critical concern for individuals, businesses, and governments. The rise of sophisticated cyberattacks, ransomware incidents, and data breaches underscores the need for robust security measures. Organizations must invest in cybersecurity technologies and develop proactive strategies to protect sensitive data and infrastructure.

5. Green Technology and Sustainability

The tech industry is increasingly focused on sustainability and environmental responsibility. Renewable energy sources, energy-efficient data centers, and eco-friendly products are at the forefront of this movement. As consumers become more environmentally conscious, companies are finding that sustainable practices are not only ethical but also profitable. Investing in green technology and adopting eco-friendly practices can lead to cost savings and a positive brand image.

6. Remote Work and Collaboration Tools

The COVID-19 pandemic accelerated the adoption of remote work and collaboration tools. Video conferencing, project management software, and cloud-based services have become essential for businesses to maintain productivity and adapt to remote work environments. While remote work offers flexibility and accessibility, it also presents challenges related to employee well-being, cybersecurity, and maintaining a sense of company culture.

7. Quantum Computing

Quantum computing is on the horizon and has the potential to revolutionize computing as we know it. With the ability to perform complex calculations at speeds unattainable by classical computers, quantum computers could solve problems in fields like cryptography, drug discovery, and climate modeling. However, building and maintaining stable quantum systems remains a significant technological challenge.

Conclusion

The tech landscape is continually evolving, presenting both exciting opportunities and complex challenges. As we embrace innovations in artificial intelligence, IoT, 5G, cybersecurity, sustainability, remote work, and quantum computing, it is crucial to approach these developments with a thoughtful and ethical perspective. Balancing the potential for progress with responsible development and addressing the associated risks will be key to harnessing the full potential of technology in shaping our future. By staying informed and proactive, individuals, businesses, and society as a whole can navigate the ever-changing tech landscape with confidence and resilience.'''
# Generate the summary for the sample article and print it
new_summary = summarize(article)
print(new_summary)