In [1]:
!pip install transformers datasets



In [2]:
import pandas as pd
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

# Define the preprocessing function
def preprocess_text(text):
    if isinstance(text, str):
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = text.lower()
    else:
        text = ''
    return text

# Load the CSV file
file_path = '/content/IndianFinancialNews[1].csv'
df = pd.read_csv(file_path)

# Apply the preprocessing to the news articles column
df['cleaned_text'] = df['Description'].apply(preprocess_text)
# Insert a new column named 'summaries' with empty strings
df['Summary'] = ''

# Display the DataFrame with the new column
print(df.head())


# Convert the DataFrame to a Dataset
dataset = Dataset.from_pandas(df[['cleaned_text', 'Summary']])  # Assuming 'Summary' is the column with summaries

# Load the tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the dataset
def preprocess_data(examples):
    inputs = [doc for doc in examples['cleaned_text']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['Summary'], max_length=150, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_data, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,  # Normally you should split your data into train and eval sets
)

# Train the model
trainer.train()


   Unnamed: 0                    Date  \
0           0   May 26, 2020, Tuesday   
1           1   May 26, 2020, Tuesday   
2           2    May 25, 2020, Monday   
3           3    May 24, 2020, Sunday   
4           4  May 23, 2020, Saturday   

                                               Title  \
0  ATMs to become virtual bank branches, accept d...   
1  IDFC First Bank seniors to forgo 65% of bonus ...   
2  Huge scam in YES Bank for many years, says Enf...   
3  Bank of Maharashtra sanctioned Rs 2,789 cr in ...   
4  DCB Bank's profit before tax declines 37.6% to...   

                                         Description  \
0  Close to 14.6 per cent (or 35,000) of the 240,...   
1  V Vaidyanathan, managing director and chief ex...   
2  Rana Kapoor's wife also charged with abetting ...   
3  The bank said it was now gearing up to extend ...   
4  Net profit for the financial year ended March ...   

                                        cleaned_text Summary  
0  close to 14 6

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



ValueError: expected sequence of length 29 at dim 1 (got 15)

In [None]:
import pandas as pd
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

# Define the preprocessing function
def preprocess_text(text):
    if isinstance(text, str):
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = text.lower()
    else:
        text = ''
    return text

# Load the CSV file
file_path = '/content/IndianFinancialNews[1].csv'
df = pd.read_csv(file_path)

# Apply the preprocessing to the news articles column
df['cleaned_text'] = df['Description'].apply(preprocess_text)

# Generate initial summaries using a summarization pipeline
from transformers import pipeline

summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")

def generate_summary(text):
    if text:
        summary = summarizer(text, max_length=150, min_length=30, do_sample=False)
        return summary[0]['summary_text']
    else:
        return ''

# Generate summaries and add them to the new column
df['summaries'] = df['cleaned_text'].apply(generate_summary)

# Convert the DataFrame to a Dataset
dataset = Dataset.from_pandas(df[['cleaned_text', 'summaries']])

# Load the tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the dataset
def preprocess_data(examples):
    inputs = tokenizer(examples['cleaned_text'], max_length=512, truncation=True, padding=True)
    targets = tokenizer(examples['summaries'], max_length=150, truncation=True, padding=True)

    model_inputs = {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': targets['input_ids'],
    }
    return model_inputs

tokenized_datasets = dataset.map(preprocess_data, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],  # Split your data into train and test sets
)

# Train the model
trainer.train()


Your max_length is set to 150, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 150, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max_length is set to 150, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)
Your max_length is set to 150, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)
Your