In [2]:
import matplotlib.pyplot as plt
import torch
from torch import nn
import numpy as np 
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

import torch.nn.functional as F
import os

# Weights & Biases credentials must come from the environment (for example a
# Kaggle secret exported as WANDB_API_KEY), never from the notebook source.
# SECURITY: a real key used to be hard-coded on this line; it is exposed in the
# notebook history and should be revoked in the W&B account settings.
if not os.environ.get("WANDB_API_KEY"):
    # No key available: keep W&B in offline mode so that training does not
    # stop at an interactive login prompt.
    os.environ.setdefault("WANDB_MODE", "offline")
    print("WANDB_API_KEY is not set; Weights & Biases will run in offline mode.")


In [3]:
!pip install transformers
!pip install rouge-score
!pip install sacrebleu


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=994974cf2d95d9bc20937b22b7ddcaef0b3242c7ce962b43deb8f7b63683ef4d
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90

In [4]:
# CSV locations of the train / test / validation splits on Kaggle.
data_path, test_path, valid_path = (
    "/kaggle/input/pcmag-dataset/train.csv",
    "/kaggle/input/pcmag-dataset/test.csv",
    "/kaggle/input/pcmag-dataset/valid.csv",
)

In [5]:
# Directory passed to the training arguments as `output_dir`.
# NOTE(review): the variable is named "positive" but the path ends in
# "negative" — confirm which of the two is intended.
output_dir_positive = "/kaggle/working/checkpoints/negative"

In [6]:
import pandas as pd

# Training split: read the CSV and replace every missing cell with "NONE".
data = pd.read_csv(data_path)
df = data.fillna("NONE")
print(df.columns)

# Test split, same treatment.
data = pd.read_csv(test_path)
tf = data.fillna("NONE")

# Validation split, same treatment.
data = pd.read_csv(valid_path)
vf = data.fillna("NONE")

Index(['product_name', 'overall_rating', 'review_text', 'positive_comment',
       'negative_comment', 'neural_comment'],
      dtype='object')


In [7]:
def is_numeric(value):
    """Return True when ``value`` can be read as a finite number.

    Used to keep only the rows whose ``overall_rating`` is a real rating
    rather than a placeholder such as the "NONE" fill value.

    Args:
        value: Any object, typically one cell of a DataFrame column.

    Returns:
        bool: True if ``float(value)`` succeeds and the result is finite;
        False for non-numeric text, unsupported types such as None, NaN and
        infinities.
    """
    import math  # local import keeps this cell self-contained

    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError covers None and other objects float() cannot accept;
        # OverflowError covers integers too large for a float.
        return False
    # NaN and +/-inf parse successfully but are not usable ratings.
    return math.isfinite(number)
def _keep_numeric_ratings(frame):
    """Return the rows of ``frame`` whose ``overall_rating`` is numeric.

    The result gets a fresh 0..n-1 index, so positional and label-based row
    access agree after rows were dropped, and the rating column is cast to
    float so every split stores ratings with the same dtype.

    Args:
        frame: DataFrame with an ``overall_rating`` column.

    Returns:
        A new DataFrame; ``frame`` itself is not modified.
    """
    kept = frame[frame["overall_rating"].apply(is_numeric)].reset_index(drop=True)
    return kept.assign(overall_rating=kept["overall_rating"].astype(float))


# Missing cells were already replaced with "NONE" when the CSVs were loaded,
# so no further fillna is needed before filtering.
df = _keep_numeric_ratings(df)
tf = _keep_numeric_ratings(tf)
vf = _keep_numeric_ratings(vf)


In [8]:
# Show the distinct rating values present in the training split.
df['overall_rating'].unique()

array([4. , 3.5, 3. , 4.5, 5. , 2.5, 2. , 1.5, 1. , 0. ])

In [9]:
from sklearn.preprocessing import OneHotEncoder

# Ratings run from 0.0 to 5.0 in steps of 0.5 (11 classes in total).
possible_ratings = [i * 0.5 for i in range(11)]
rating_columns = [f'rating_{r}' for r in possible_ratings]

# scikit-learn renamed `sparse` to `sparse_output` (1.2) and later removed the
# old keyword, so try the new spelling first and fall back for old releases.
# NOTE(review): `encoder` is not used in this cell; it is kept in case a later
# cell relies on it.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)


def one_hot_ratings(frame):
    """Build one 0/1 indicator column per possible rating.

    Args:
        frame: DataFrame with an ``overall_rating`` column.

    Returns:
        DataFrame indexed like ``frame`` with exactly ``rating_columns`` as
        columns. Ratings absent from ``frame`` yield all-zero columns, and
        ``dtype=int`` keeps every column the same integer type.
    """
    dummies = pd.get_dummies(frame['overall_rating'], prefix='rating', dtype=int)
    return dummies.reindex(columns=rating_columns, fill_value=0)


def attach_one_hot(frame, encoded):
    """Return ``frame`` with the ``encoded`` rating columns appended.

    Indicator columns left over from an earlier run of this cell are dropped
    first, so re-running the cell does not duplicate them.
    """
    base = frame.drop(columns=rating_columns, errors='ignore')
    return pd.concat([base, encoded], axis=1)


encoded_df = one_hot_ratings(df)
df = attach_one_hot(df, encoded_df)

encoded_tf = one_hot_ratings(tf)
tf = attach_one_hot(tf, encoded_tf)

# For vf (validation data)
encoded_vf = one_hot_ratings(vf)
vf = attach_one_hot(vf, encoded_vf)

In [10]:
# Confirm that each split (train, test, validation) carries the rating_* columns.
print(df.columns)
print(tf.columns)
print(vf.columns)

Index(['product_name', 'overall_rating', 'review_text', 'positive_comment',
       'negative_comment', 'neural_comment', 'rating_0.0', 'rating_0.5',
       'rating_1.0', 'rating_1.5', 'rating_2.0', 'rating_2.5', 'rating_3.0',
       'rating_3.5', 'rating_4.0', 'rating_4.5', 'rating_5.0'],
      dtype='object')
Index(['product_name', 'overall_rating', 'review_text', 'positive_comment',
       'negative_comment', 'neural_comment', 'rating_0.0', 'rating_0.5',
       'rating_1.0', 'rating_1.5', 'rating_2.0', 'rating_2.5', 'rating_3.0',
       'rating_3.5', 'rating_4.0', 'rating_4.5', 'rating_5.0'],
      dtype='object')
Index(['product_name', 'overall_rating', 'review_text', 'positive_comment',
       'negative_comment', 'neural_comment', 'rating_0.0', 'rating_0.5',
       'rating_1.0', 'rating_1.5', 'rating_2.0', 'rating_2.5', 'rating_3.0',
       'rating_3.5', 'rating_4.0', 'rating_4.5', 'rating_5.0'],
      dtype='object')


In [11]:
# Define custom dataset class
class Custom_Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each review with its summaries and labels.

    Args:
        data: Sequence of review texts.
        pgen: Sequence of positive summaries, aligned with ``data``.
        ngen: Sequence of negative summaries, aligned with ``data``.
        nut_gen: Sequence of neutral summaries, aligned with ``data``.
        labels: Sequence of labels, aligned with ``data``.

    Each argument may be a pandas Series/DataFrame or any container that
    supports integer indexing (list, array, tensor).
    """

    def __init__(self, data,pgen,ngen,nut_gen, labels):
        self.data = data
        self.positive_gen=pgen
        self.negative_gen=ngen
        self.neutral_gen=nut_gen
        self.labels = labels

    @staticmethod
    def _item_at(container, index):
        """Return the ``index``-th entry of ``container`` by position.

        pandas objects are read through ``.iloc`` so the lookup is positional
        even when the index has gaps, and so a DataFrame yields a row rather
        than a column. A row taken from a DataFrame is converted to a list.
        """
        if hasattr(container, "iloc"):
            item = container.iloc[index]
        else:
            item = container[index]
        # Only a pandas row (a Series) still exposes `.iloc`; scalars do not.
        return item.tolist() if hasattr(item, "iloc") else item

    def __getitem__(self, index: int) -> tuple:
        """Return ``(review, positive, negative, neutral, label)`` at ``index``."""
        x = self._item_at(self.data, index)
        pgen = self._item_at(self.positive_gen, index)
        ngen = self._item_at(self.negative_gen, index)
        nut_gen = self._item_at(self.neutral_gen, index)
        y = self._item_at(self.labels, index)
        return (x,pgen,ngen,nut_gen, y)

    def __len__(self) -> int:
        """Number of examples, taken from the review container."""
        return len(self.data)

In [12]:
from sklearn.model_selection import train_test_split  # NOTE(review): unused in this cell; the splits come from separate CSV files

# One indicator column per possible rating; together they form the label of a row.
RATING_LABEL_COLUMNS = [
    'rating_0.0', 'rating_0.5', 'rating_1.0', 'rating_1.5', 'rating_2.0',
    'rating_2.5', 'rating_3.0', 'rating_3.5', 'rating_4.0', 'rating_4.5',
    'rating_5.0',
]


def build_dataset(frame):
    """Wrap one split in a ``Custom_Dataset``.

    Args:
        frame: DataFrame holding ``review_text``, the three ``*_comment``
            columns and the ``rating_*`` indicator columns.

    Returns:
        Custom_Dataset over the reviews, the positive / negative / neutral
        summaries and the rating indicator columns of ``frame``.
    """
    return Custom_Dataset(
        data=frame['review_text'],
        pgen=frame['positive_comment'],
        ngen=frame['negative_comment'],
        nut_gen=frame['neural_comment'],
        labels=frame[RATING_LABEL_COLUMNS],
    )


train_data = build_dataset(df)
test_data = build_dataset(tf)
valid_data = build_dataset(vf)

train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=3, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_data, batch_size=1, shuffle=False)
# The validation split gets its own loader; it used to overwrite `test_loader`,
# which left the test split without any loader.
valid_loader = torch.utils.data.DataLoader(dataset=valid_data, batch_size=1, shuffle=False)

In [13]:
import transformers
from datasets import Dataset
from datasets import load_dataset, load_metric, load_from_disk
# ROUGE scorer used by compute_rouge during evaluation.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm the installed version still provides it.
metric = load_metric('rouge',trust_remote_code=True)
# Hub id of the pretrained checkpoint; both the tokenizer and the model are loaded from it.
model_checkpoints = 'facebook/bart-large-xsum'
# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"



  metric = load_metric('rouge',trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [14]:
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
# Tokenizer for `model_checkpoints`; the preprocess_data_* functions and
# compute_rouge read it as a global.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoints)
def _preprocess_summary(batch, prefix, target_column):
  """Tokenize one batch of reviews together with their reference summaries.

  Args:
    batch: Mapping of column name to list of values, as passed by
      ``Dataset.map(..., batched=True)``. Must contain ``review_text`` and
      ``target_column``.
    prefix: Text prepended to every review so the model is told which kind
      of summary to produce.
    target_column: Name of the column holding the reference summaries.

  Returns:
    The tokenizer output for the prefixed reviews (``input_ids`` and
    ``attention_mask``, padded/truncated to 1024 tokens) with an extra
    ``labels`` entry holding the summary token ids (padded/truncated to 512).
  """
  prompts = [prefix + review for review in batch['review_text']]
  model_inputs = tokenizer(prompts, max_length=1024, padding='max_length', truncation=True)

  # `text_target` is the supported way to tokenize labels; it replaces the
  # deprecated `as_target_tokenizer()` context manager.
  targets = tokenizer(
      text_target=list(batch[target_column]),
      max_length=512,
      padding='max_length',
      truncation=True,
  )

  model_inputs['labels'] = targets['input_ids']
  return model_inputs


def preprocess_data_negative(df):
  """Tokenize reviews for negative-summary generation (see ``_preprocess_summary``)."""
  return _preprocess_summary(df, "negative summary of ", 'negative_comment')


def preprocess_data_positive(df):
  """Tokenize reviews for positive-summary generation (see ``_preprocess_summary``)."""
  return _preprocess_summary(df, "positive summary of ", 'positive_comment')


def preprocess_data_neutral(df):
  """Tokenize reviews for neutral-summary generation (see ``_preprocess_summary``).

  NOTE(review): the prompt prefix reads "neural", matching the dataset's
  ``neural_comment`` column name. It is left unchanged so that inference
  prompts stay consistent with training — confirm whether "neutral" was meant.
  """
  return _preprocess_summary(df, "neural summary of ", 'neural_comment')



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [16]:
# Wrap the cleaned pandas frames as Hugging Face datasets. The tokenization
# cell below reads train_df / valid_df / test_df, so they must be defined on a
# fresh kernel (they used to exist only in commented-out code).
# preserve_index=False keeps the pandas index from becoming an extra column.
train_df = Dataset.from_pandas(df, preserve_index=False)
valid_df = Dataset.from_pandas(vf, preserve_index=False)
test_df = Dataset.from_pandas(tf, preserve_index=False)

Map:   0%|          | 0/13741 [00:00<?, ? examples/s]



Map:   0%|          | 0/1735 [00:00<?, ? examples/s]

Map:   0%|          | 0/1718 [00:00<?, ? examples/s]

In [19]:
# Columns the trainer needs; every other column is dropped before merging.
MODEL_COLUMNS = ['input_ids', 'attention_mask', 'labels']


# Concatenate fields manually to ensure a flat structure
def concatenate_datasets_flat(dataset1, dataset2, dataset3):
    """Stack three tokenized datasets row-wise, keeping only the model columns.

    Args:
        dataset1, dataset2, dataset3: Tokenized datasets that each provide
            ``input_ids``, ``attention_mask`` and ``labels``.

    Returns:
        Dataset with the rows of ``dataset1``, then ``dataset2``, then
        ``dataset3``, restricted to ``MODEL_COLUMNS``.
    """
    from datasets import concatenate_datasets  # local import keeps this cell self-contained

    parts = (dataset1, dataset2, dataset3)
    if not all(hasattr(part, "remove_columns") for part in parts):
        # Plain mappings of lists: merge the columns by list concatenation.
        return Dataset.from_dict(
            {name: dataset1[name] + dataset2[name] + dataset3[name] for name in MODEL_COLUMNS}
        )

    # Let `datasets` do the concatenation instead of turning every column
    # into one large Python list first.
    trimmed = []
    for part in parts:
        extra = [name for name in part.column_names if name not in MODEL_COLUMNS]
        trimmed.append(part.remove_columns(extra))
    return concatenate_datasets(trimmed)


def tokenize_by_sentiment(split):
    """Tokenize ``split`` once per summary type.

    Args:
        split: Dataset with ``review_text`` and the three ``*_comment`` columns.

    Returns:
        Tuple ``(negative, positive, neutral)`` of tokenized datasets.
    """
    negative = split.map(preprocess_data_negative, batched=True, remove_columns=['review_text', 'negative_comment'])
    positive = split.map(preprocess_data_positive, batched=True, remove_columns=['review_text', 'positive_comment'])
    neutral = split.map(preprocess_data_neutral, batched=True, remove_columns=['review_text', 'neural_comment'])
    return negative, positive, neutral


# Process each dataset separately, then concatenate the negative, positive and
# neutral views of the same split.
tokenize_train_negative, tokenize_train_positive, tokenize_train_neutral = tokenize_by_sentiment(train_df)
tokenize_train = concatenate_datasets_flat(tokenize_train_negative, tokenize_train_positive, tokenize_train_neutral)
print(tokenize_train.column_names)

# Repeat the same process for validation and test sets
tokenize_valid_negative, tokenize_valid_positive, tokenize_valid_neutral = tokenize_by_sentiment(valid_df)
tokenize_valid = concatenate_datasets_flat(tokenize_valid_negative, tokenize_valid_positive, tokenize_valid_neutral)

tokenize_test_negative, tokenize_test_positive, tokenize_test_neutral = tokenize_by_sentiment(test_df)
tokenize_test = concatenate_datasets_flat(tokenize_test_negative, tokenize_test_positive, tokenize_test_neutral)

# Now tokenize_train, tokenize_valid, and tokenize_test contain the combined datasets in a flat structure


Map:   0%|          | 0/13741 [00:00<?, ? examples/s]

Map:   0%|          | 0/13741 [00:00<?, ? examples/s]

Map:   0%|          | 0/13741 [00:00<?, ? examples/s]

['input_ids', 'attention_mask', 'labels']


Map:   0%|          | 0/1735 [00:00<?, ? examples/s]

Map:   0%|          | 0/1735 [00:00<?, ? examples/s]

Map:   0%|          | 0/1735 [00:00<?, ? examples/s]

Map:   0%|          | 0/1718 [00:00<?, ? examples/s]

Map:   0%|          | 0/1718 [00:00<?, ? examples/s]

Map:   0%|          | 0/1718 [00:00<?, ? examples/s]

In [None]:
# Write each tokenized split to its own folder so it can be reloaded later
# without repeating the tokenization.
for _split, _target_dir in (
    (tokenize_train, "./tokenized_dataset/train/checkpoint"),
    (tokenize_valid, "./tokenized_dataset/valid/checkpoint"),
    (tokenize_test, "./tokenized_dataset/test/checkpoint"),
):
    _split.save_to_disk(_target_dir)
# tokenized_train = load_from_disk("./tokenized_dataset/train/checkpoint")
# tokenized_valid = load_from_disk("./tokenized_dataset/valid/checkpoint")
# tokenized_test = load_from_disk("./tokenized_dataset/test/checkpoint")

In [20]:
# Load the pretrained seq2seq weights, then move the module to the selected device.
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)
model = model.to(device)
# Collator that assembles batches for this tokenizer / model pair.
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

In [27]:
# print(df.columns)
# df["negative_comment"].head()
def compute_rouge(pred):
  """Compute ROUGE F-measures (in percent) and the mean generated length.

  Args:
    pred: Pair ``(predictions, labels)`` of token-id arrays, as handed to
      ``compute_metrics`` by the trainer.

  Returns:
    dict: one entry per score returned by ``metric.compute`` (mid F-measure
    times 100) plus ``gen_len``, the mean number of non-padding tokens per
    prediction. All values are rounded to 4 decimals.
  """
  predictions, labels = pred
  # Some models return a tuple of outputs; the generated ids come first.
  if isinstance(predictions, tuple):
    predictions = predictions[0]

  # -100 marks ignored / padded positions and is not a valid token id, so
  # swap it for the pad token before decoding.
  pad_id = tokenizer.pad_token_id
  predictions = np.where(predictions != -100, predictions, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  scores = metric.compute(predictions=decoded_predictions, references=decoded_labels, use_stemmer=True)
  # Report the mid F-measure of every score as a percentage.
  res = {key: value.mid.fmeasure * 100 for key, value in scores.items()}

  # Length of each prediction, ignoring padding.
  pred_lens = [np.count_nonzero(row != pad_id) for row in predictions]
  res['gen_len'] = np.mean(pred_lens)

  return {k: round(v, 4) for k, v in res.items()}

In [28]:
from transformers.trainer_utils import get_last_checkpoint

# Model and tokenizer are saved to the same folder so it can be passed to
# from_pretrained as a whole. The tokenizer used to be written to a
# misspelled sibling folder ("nagative").
FINE_TUNED_DIR = "./fine_tuned_model_negative"

args = transformers.Seq2SeqTrainingArguments(
    evaluation_strategy='epoch',  # evaluate once per epoch
    save_strategy='epoch',        # must match the evaluation strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,   # compute_rouge needs generated token ids
    eval_accumulation_steps=1,
    fp16=(device == "cuda"),      # mixed precision only when a GPU is available
    load_best_model_at_end=True,
    output_dir=output_dir_positive,
)

trainer = transformers.Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenize_train,
    eval_dataset=tokenize_valid,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge
)

# Resuming is requested through trainer.train(); the `resume_from_checkpoint`
# training argument alone does not make the trainer resume. Continue from the
# newest checkpoint in the output directory when there is one, otherwise
# start from the pretrained weights.
last_checkpoint = None
if os.path.isdir(output_dir_positive):
    last_checkpoint = get_last_checkpoint(output_dir_positive)
trainer.train(resume_from_checkpoint=last_checkpoint)

model.save_pretrained(FINE_TUNED_DIR)

# Save the tokenizer next to the model weights
tokenizer.save_pretrained(FINE_TUNED_DIR)



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 