<a href="https://colab.research.google.com/github/sukantamitra007/GenAILearning/blob/TransformerModel/Project_Work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Let's install the necessary libraries

In [None]:
pip install transformers[torch]  datasets evaluate rouge_score

# Load BillSum dataset

In [None]:
from datasets import load_dataset

# Keep the notebook fast: the "ca_test" split is cut down to its 1% "train"
# slice, and that slice is then divided 60/40 into train/test.
SPLIT_SEED = 42  # fixed so Restart & Run All selects the same examples every time

billsum = load_dataset("billsum", split="ca_test")
billsum = billsum.train_test_split(test_size=0.99, seed=SPLIT_SEED)
billsum_train = billsum['train']   # the 1% slice, kept to reduce the data volume
billsum = billsum_train.train_test_split(test_size=0.40, seed=SPLIT_SEED)

# Preprocess

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the same value is reused further down
# when the data collator and the seq2seq model are created.
checkpoint = "google-t5/t5-small"
# Tokenizer loaded for that checkpoint; used by preprocess_function and compute_metrics.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# preprocessing function

In [7]:
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents together with their reference summaries.

    Every entry of ``examples["text"]`` gets the module-level ``prefix``
    prepended and is tokenized with truncation at 1024 tokens. Every entry of
    ``examples["summary"]`` is tokenized as the target text with truncation at
    128 tokens.

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under the ``"labels"`` key.
    """
    prompted_docs = []
    for doc in examples["text"]:
        prompted_docs.append(prefix + doc)

    model_inputs = tokenizer(prompted_docs, max_length=1024, truncation=True)

    target_encoding = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [8]:
# batched=True hands preprocess_function multiple elements of the dataset at
# once instead of one example per call.
tokenized_billsum = billsum.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorForSeq2Seq

# Collator that dynamically pads the sentences to the longest length in a
# batch during collation.
# NOTE(review): `model` receives the checkpoint *name* (a str), not a model
# instance -- the model object is only created in a later cell. Confirm this is
# intended; presumably the collator cannot use model-specific helpers this way.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

# Evaluate

In [10]:
import evaluate
# ROUGE metric object; compute_metrics calls rouge.compute(...) on the decoded text.
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
import numpy as np


def compute_metrics(eval_pred):
    """Score generated summaries against the reference summaries with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids.

    Returns:
        dict: Every value returned by ``rouge.compute`` plus ``"gen_len"``
        (mean number of non-padding tokens per prediction), each rounded to
        4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is an "ignore" marker, not a vocabulary id, so it cannot be decoded.
    # Swap it for the pad token in BOTH arrays before decoding; doing so for
    # the predictions also keeps those positions out of gen_len below.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Train

In [12]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq model for `checkpoint`; this is the object the
# trainer fine-tunes below.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [13]:
# Hyperparameters for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # where the run writes its output, and how many checkpoints are kept
    output_dir="my_project",
    save_total_limit=3,
    # optimisation
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    fp16=False,
    # evaluation: once per epoch, scoring generated text
    evaluation_strategy="epoch",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    # push_to_hub=True is intentionally left disabled
)

In [14]:
# Wire the model, tokenized splits, collator and metric function into one trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
)

# Fine-tune, then save under the directory name that the summarization
# pipeline loads from later in the notebook.
trainer.train()
trainer.save_model("SummarizationFineTunedModel")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,4.338615,0.1668,0.0588,0.1396,0.1396,19.0
2,No log,4.304058,0.1668,0.0588,0.1396,0.1396,19.0
3,No log,4.280647,0.1668,0.0588,0.1396,0.1396,19.0
4,No log,4.268909,0.1668,0.0588,0.1396,0.1396,19.0




TrainOutput(global_step=4, training_loss=4.848381042480469, metrics={'train_runtime': 276.8674, 'train_samples_per_second': 0.101, 'train_steps_per_second': 0.014, 'total_flos': 7579140882432.0, 'train_loss': 4.848381042480469, 'epoch': 4.0})

In [68]:
# Sample input for a manual smoke test of the summarizer; in this notebook it
# is only referenced by the commented-out summarizer(...) call in the next cell.
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [None]:
from transformers import pipeline
# Summarization pipeline loaded from the directory written by
# trainer.save_model("SummarizationFineTunedModel").
summarizer = pipeline("summarization", model="SummarizationFineTunedModel")
# Manual smoke test (left disabled): summarizer(text, max_length=51)



Use a text-classification model fine-tuned on financial news. In a real deployment we would fine-tune it on original production customer reviews with the labels predefined for our company. For this project, however, I am using an existing model, since I have already fine-tuned one model above to showcase how fine-tuning is done.

In [None]:
from transformers import pipeline

# Sentiment classifier used by clssification_Of_review.
# NOTE(review): the name `pipe` is rebound to a text-generation pipeline in a
# later cell; a distinct name for one of the two pipelines would avoid the clash.
pipe = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

In [131]:
def clssification_Of_review(feedback, classifier=pipe):
  """Return the sentiment label predicted for a piece of review text.

  Args:
    feedback: Review text handed to the classification pipeline.
    classifier: Pipeline used for the prediction. It defaults to the object
      bound to ``pipe`` at the moment this function is defined. Looking up
      ``pipe`` at call time is not safe here: a later cell rebinds that name
      to a text-generation pipeline, whose results are read through
      ``'generated_text'`` rather than ``'label'``.

  Returns:
    The ``'label'`` value of the first prediction.
  """
  first_prediction = classifier(feedback)[0]
  return first_prediction.get('label')

In [None]:
# Sample negative review for manually exercising the pipeline functions.
bad_review='despite its sleek appearance, the build quality feels cheap and fragile . camera quality is disappointing, with images appearing blurry and pixelated . the advertised battery life is far from accurate, lasting only fraction of the promised time .'

# `prompt` only exists as a local inside product_suggestion, so printing it
# here raised NameError on a fresh kernel; show the review defined above instead.
print(bad_review)


In [162]:
from transformers import pipeline, AutoTokenizer
import torch

# Seed torch's RNG; product_suggestion generates with do_sample=True.
torch.manual_seed(0)
# NOTE(review): the assignments below rebind `model`, `tokenizer` and `pipe`,
# which earlier cells bound to the seq2seq model object, its tokenizer and the
# text-classification pipeline. Code that reads those globals after this cell
# runs sees the text-generation objects instead.
model = "openai-community/gpt2" #"tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model)
# Text-generation pipeline called by product_suggestion.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [159]:
def product_suggestion(review_summary):
  """Generate an improvement idea for the product team from a review summary.

  A few-shot prompt ending in the cue ``Final Output:`` is built around
  ``review_summary`` and passed to the module-level ``pipe`` with sampling
  enabled (``top_k=10``, at most 40 new tokens).

  Args:
    review_summary: Text inserted as the last ``Text:`` entry of the prompt.

  Returns:
    The part of the generated text that follows the first ``Final Output:``
    marker (up to the next such marker, if there is one).
  """
  prompt = f"""You are a product engineering and based on the input text provide a improvement idea to the company.
  Text: The screen is prone to scratches and cracks. I've only had this phone for a week, and it already looks beaten up.
  Improvement idea: Product team need to work on screen quality improvement.
  Text: The build quality is poor. The phone feels flimsy and cheaply made, and it's prone to overheating.
  Improvement idea: Product team need to look into hardware improvement.
  Text: {review_summary}
  Final Output:"""

  generation_options = {"max_new_tokens": 40, "do_sample": True, "top_k": 10}
  generations = pipe(prompt, **generation_options)

  full_text = generations[0].get('generated_text')
  # Index 1 of the split is whatever follows the first occurrence of the cue.
  pieces = full_text.split("Final Output:")
  return pieces[1]


**User Interface for summarization and sentiment object**

In [None]:
pip install gradio

In [None]:
import gradio as gr
def summarizer_new(inputFeedback):
    """Summarize a review, classify its sentiment and suggest an improvement.

    Args:
        inputFeedback: Review text to process (the Gradio textbox value).

    Returns:
        tuple: ``(summary_text, classification_label, suggestion)``.
        ``suggestion`` comes from ``product_suggestion`` when the label is
        ``'negative'``; otherwise it is a fixed "customer is happy" message.
    """
    summary_text = summarizer(inputFeedback, max_length=51)[0].get('summary_text')  # Summarization model
    classification_label = clssification_Of_review(inputFeedback)  # Classification model

    # Exact comparison: `label in 'negative'` is a substring test, which is
    # also true for '' and raises TypeError when the label is None.
    if classification_label == 'negative':
        # The result needs its own name: assigning to `product_suggestion`
        # inside this function would make that name local and turn the call
        # itself into an UnboundLocalError.
        suggestion = product_suggestion(summary_text)  # Product suggestion
    else:
        suggestion = "Customer is happy with the product."

    return summary_text, classification_label, suggestion

# One input textbox; three output textboxes in the same order as the values
# returned by summarizer_new.
demo = gr.Interface(
    fn=summarizer_new,
    inputs=gr.Textbox(),
    outputs=[
        gr.Textbox(label="summary", lines=3),
        gr.Textbox(label="sentiment", lines=3),
        gr.Textbox(label="product suggestion", lines=3),
    ],
)
demo.launch()