In [None]:
import pandas as pd
import re
from gensim.parsing.preprocessing import remove_stopwords, strip_multiple_whitespaces, strip_numeric
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy

# Load spaCy's small English pipeline once at module level; `cleaning` below
# reuses this shared `nlp` object for tokenization and its stop-word / alphabetic flags.
nlp = spacy.load("en_core_web_sm")

def cleaning(s):
    """Normalize raw text into a space-separated string of lowercase lemmas.

    The text is tokenized with the module-level spaCy pipeline ``nlp``. Stop
    words and tokens that are not purely alphabetic (e.g. numbers and
    punctuation) are dropped; the remaining tokens are lowercased and
    lemmatized with NLTK's ``WordNetLemmatizer``.

    Args:
        s: Text to clean. ``None`` and float ``NaN`` (how pandas represents
            missing text) are treated as empty input.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``""`` when the
        input is missing, blank, or contains no usable tokens.
    """
    # Missing values: None, or float NaN (NaN is the only value unequal to itself).
    if s is None or (isinstance(s, float) and s != s):
        return ""
    # Blank strings cannot yield tokens, so skip the spaCy pipeline entirely.
    if isinstance(s, str) and not s.strip():
        return ""

    # Tokenize using spaCy for more accurate tokenization
    doc = nlp(s)

    # Filter, lowercase and lemmatize in a single pass.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token.text.lower())
        for token in doc
        if token.is_alpha and not token.is_stop
    ]

    # Join tokens back into a string
    return ' '.join(lemmas)

# Smoke test: run `cleaning` on a sentence that contains a URL and digits,
# then print the resulting string for a visual check.
text = "This is a sample text containing URLs like http://example.com and non-alphabetic characters 1234."
cleaned_text = cleaning(text)
print(cleaned_text)


In [None]:
import torch
# Report GPU visibility: availability flag first, then the device count, one per line
print(torch.cuda.is_available(), torch.cuda.device_count(), sep="\n")

# Prefer the GPU when one is usable; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 ships without a padding token. Reuse EOS so that padding and
# `generate(pad_token_id=...)` have a valid id, instead of relying on some
# later cell to set it as a side effect.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Initialize the model and move it to the device selected earlier in the notebook
model = GPT2LMHeadModel.from_pretrained('gpt2')
model = model.to(device)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the sampled frame for testing; the fixed seed makes the split repeatable
train_data, test_data = train_test_split(
    df_sample,
    random_state=42,
    test_size=0.25,
)

# Report the (rows, columns) of each partition
print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)

In [None]:
# Define a custom token for summarization task
SUMMARY_TOKEN = " [TL;DR] "

# Build "<text> [TL;DR] <summary>" training strings.
# `.assign` returns a new frame rather than writing a column into the frame
# handed back by train_test_split, which avoids pandas' SettingWithCopyWarning
# and keeps this cell safe to re-run.
train_data = train_data.assign(
    Processed_Input=train_data['Processed_Text'] + SUMMARY_TOKEN + train_data['Processed_Summary']
)

# Create a list containing processed input for training
input_data_train = train_data['Processed_Input'].tolist()

# Show a small sample instead of dumping every training string into the output
print(f"{len(input_data_train)} training examples; first 3:")
print(input_data_train[:3])


In [None]:
# Build inference prompts that end with the same cue the model is trained on.
# Training strings look like "<text> [TL;DR] <summary>", so each prompt must stop
# right after the cue. The trailing space of SUMMARY_TOKEN is stripped because
# GPT-2's BPE attaches a leading space to the *following* word.
# NOTE(review): training concatenates 'Processed_Text' while this uses raw 'Text' — confirm that is intended.
test_data = test_data.assign(Test_Text=test_data['Text'] + SUMMARY_TOKEN.rstrip())
reviews_array_test = test_data['Test_Text'].tolist()

# Show a small sample instead of dumping every prompt into the output
print(f"{len(reviews_array_test)} test prompts; first 3:")
print(reviews_array_test[:3])

In [None]:
import torch
from torch.utils.data import Dataset

# Define a unique token to denote TL;DR
TLDR_TOKEN = "<TLDR>"

# Label value that the cross-entropy loss skips (used for padding positions)
IGNORE_INDEX = -100


class CustomReviewDataset(Dataset):
    """Fixed-length causal-LM training examples built from reviews.

    Each item is tokenized, truncated to ``max_len`` and right-padded to
    exactly ``max_len`` tokens.

    Args:
        reviews: Indexable collection of examples. An example is either a
            mapping with ``'Text'``, ``'TLDR'`` and ``'Summary'`` keys, or an
            already formatted string that is used as-is.
        tokenizer: Tokenizer exposing ``encode``, ``pad_token``,
            ``pad_token_id`` and ``eos_token``. If it has no pad token, EOS is
            installed as the pad token (this mutates the tokenizer).
        max_len: Length, in tokens, of every returned sequence.
    """

    def __init__(self, reviews, tokenizer, max_len):
        self.data = reviews
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Only fill in a pad token when none is configured, so a caller's
        # explicit choice is not overwritten.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tensors for example ``idx``.

        Returns:
            dict: ``input_ids``, ``attention_mask`` and ``labels`` tensors,
            each of length ``max_len``.
        """
        review = self.data[idx]
        if isinstance(review, str):
            # Pre-formatted example: use the string unchanged.
            combined_text = review
        else:
            text, tldr, summary = review['Text'], review['TLDR'], review['Summary']
            # Concatenate text, TL;DR, and summary with unique tokens
            combined_text = f"{text} {TLDR_TOKEN} {tldr} {summary}"

        # Tokenize the combined text
        tokens = self.tokenizer.encode(combined_text, add_special_tokens=True, max_length=self.max_len, truncation=True)

        n_real = len(tokens)
        n_pad = self.max_len - n_real

        # Pad tokens to max_len and record which positions are real
        input_ids = tokens + [self.tokenizer.pad_token_id] * n_pad
        attention_mask = [1] * n_real + [0] * n_pad

        # Labels mirror input_ids: Hugging Face causal-LM heads shift the labels
        # internally, so shifting here as well would train on token t+2.
        # Padding positions are excluded from the loss.
        labels = tokens + [IGNORE_INDEX] * n_pad

        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(labels)
        }


In [None]:
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader

# Imported locally: this cell is the only user of the collator
from transformers import DataCollatorForLanguageModeling

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./training_results',         # output directory
    num_train_epochs=3,                      # number of training epochs
    per_device_train_batch_size=16,          # batch size per device during training
    logging_dir='./training_logs',           # directory for storing logs
    logging_steps=100,                       # log every 100 steps
    save_steps=100                           # save checkpoint every 100 steps
)

# GPT-2 has no padding token by default; the collator needs one to pad batches
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the "<text> [TL;DR] <summary>" strings built earlier.
# `truncation=True` without `max_length` truncates to the tokenizer's model maximum.
train_encodings = tokenizer(input_data_train, truncation=True)
train_dataset = [{'input_ids': ids} for ids in train_encodings['input_ids']]

# Causal-LM collator (mlm=False): pads each batch to its longest sequence,
# builds the attention mask, and copies input_ids into labels with padding
# positions set to -100. The model shifts the labels internally.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define the Trainer for fine-tuning. It builds its own DataLoader and optimizer,
# moves batches to the model's device, and handles logging and checkpointing
# according to `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Fine-tune
trainer.train()

# Save the model
trainer.save_model('./fine_tuned_model')


In [None]:
# Persist the model together with its tokenizer so that this directory can be
# reloaded by both GPT2LMHeadModel.from_pretrained and GPT2Tokenizer.from_pretrained.
model_path = "./results"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Load the saved model
# Forward slashes work on both Windows and POSIX and avoid backslash escape sequences.
# NOTE(review): TrainingArguments above uses output_dir='./training_results' — confirm this checkpoint path exists.
model_path = "./results/checkpoint-100"
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# A freshly loaded GPT-2 tokenizer may have no pad token; generate() needs a valid id
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Truncate from the left so the trailing TL;DR cue of an over-long prompt survives
tokenizer.truncation_side = "left"

MAX_PROMPT_TOKENS = 200   # upper bound on prompt length, in tokens
MAX_NEW_TOKENS = 30       # number of tokens to generate after the prompt

# Evaluate the model
model.eval()
with torch.no_grad():
    for prompt in reviews_array_test:
        # Tokenize one prompt; return_tensors="pt" already yields a batch dimension of 1
        encoded = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_PROMPT_TOKENS
        ).to(device)
        generated_ids = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_new_tokens=MAX_NEW_TOKENS,  # Limit the generated continuation, independent of prompt length
            num_beams=4,     # Set the number of beams for beam search
            length_penalty=2.0,  # Set the length penalty for beam search
            repetition_penalty=2.0,  # Set the repetition penalty for beam search
            pad_token_id=tokenizer.pad_token_id,  # Set the pad token ID
            eos_token_id=tokenizer.eos_token_id,  # Set the end-of-sequence token ID
            early_stopping=True  # Enable early stopping
        )
        # Decode the generated text (prompt followed by the generated continuation)
        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        print("Generated Text:", generated_text)
