<a href="https://colab.research.google.com/github/rishikeshvk/Assessment_12-06-24_RishikeshVK/blob/main/rishikesh_12_06_24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import re

# Load the dataset
df = pd.read_csv("feedback_data.csv")

# Split concatenated column into separate columns
df[['Text', 'Sentiment', 'Source', 'Date/Time', 'User ID', 'Location', 'Confidence Score']] = df['Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score'].str.split(',', expand=True)

# Remove unnecessary columns
df = df[['Text', 'Sentiment', 'Source', 'Date/Time', 'User ID', 'Location', 'Confidence Score']]

# Data cleaning
def clean_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower()  # Convert text to lowercase

# Apply cleaning function to 'Text' column
df['Text'] = df['Text'].apply(clean_text)

# Handle missing values if any
df.dropna(inplace=True)

# Save cleaned dataset
df.to_csv("cleaned_feedback_data.csv", index=False)


In [8]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the cleaned dataset
df = pd.read_csv("cleaned_feedback_data.csv")

# Split dataset into train and test
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], df['Sentiment'], test_size=0.2, random_state=42)

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3) # 3 labels: positive, negative, neutral

# Tokenize input texts
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

# Custom dataset class for BERT
class FeedbackDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create DataLoader for training and testing
train_dataset = FeedbackDataset(train_encodings, train_labels)
test_dataset = FeedbackDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Fine-tune BERT
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

for epoch in range(3):  # Adjust number of epochs as needed
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluate BERT
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: new(): invalid data type 'str'

In [10]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the cleaned dataset
df = pd.read_csv("cleaned_feedback_data.csv")

# Split dataset into train and test
train_texts, test_texts = train_test_split(df['Text'], test_size=0.2, random_state=42)

# Load T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Tokenize input texts
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, return_tensors='pt')

# Custom dataset class for T5
class FeedbackDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create DataLoader for training and testing
train_dataset = FeedbackDataset(train_encodings)
test_dataset = FeedbackDataset(test_encodings)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Fine-tune T5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-4)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

for epoch in range(3):  # Adjust number of epochs as needed
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)  # Labels are set to input_ids for auto-regressive generation
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Generate summaries
model.eval()
generated_summaries = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=50, num_beams=4, early_stopping=True)
        summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        generated_summaries.extend(summaries)

# Print generated summaries
for summary in generated_summaries:
    print(summary)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



Dieses song immer puts me in a nostalgic mood it reminds me of good times.




















In [11]:
from transformers import BertTokenizer, BertForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
import torch

class FeedbackAnalyzer:
    def __init__(self):
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
        self.t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
        self.t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.bert_model.to(self.device)
        self.t5_model.to(self.device)

    def analyze_feedback(self, feedback_text):
        # Sentiment Analysis with BERT
        inputs = self.bert_tokenizer.encode_plus(
            feedback_text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            return_tensors='pt',
            truncation=True
        )
        inputs = {key: val.to(self.device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
        sentiment_label = torch.argmax(outputs.logits).item()

        # Text Summarization with T5
        input_ids = self.t5_tokenizer.encode(
            "summarize: " + feedback_text,
            return_tensors="pt",
            max_length=512,
            truncation=True
        )
        input_ids = input_ids.to(self.device)
        with torch.no_grad():
            summary_ids = self.t5_model.generate(input_ids, num_beams=4, max_length=100, early_stopping=True)
        summary = self.t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        return sentiment_label, summary

# Example usage
feedback_analyzer = FeedbackAnalyzer()
feedback_text = "The product was excellent! I really loved it."
sentiment_label, summary = feedback_analyzer.analyze_feedback(feedback_text)
print("Sentiment Label:", sentiment_label)
print("Summary:", summary)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Sentiment Label: 2
Summary: the product was excellent!


In [13]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [14]:
from sklearn.metrics import accuracy_score
from rouge import Rouge

# Sentiment Analysis Evaluation
def evaluate_sentiment_model(model, dataloader):
    model.eval()
    predictions = []
    labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions.extend(torch.argmax(output.logits, axis=1).cpu().numpy())
            labels.extend(batch['labels'].cpu().numpy())
    accuracy = accuracy_score(labels, predictions)
    return accuracy

# Text Summarization Evaluation
def evaluate_summarization_model(model, dataloader, tokenizer):
    model.eval()
    rouge = Rouge()
    generated_summaries = []
    reference_summaries = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=50, num_beams=4, early_stopping=True)
            summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            generated_summaries.extend(summaries)
            reference_summaries.extend(batch['reference_summary'])
    rouge_scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)
    return rouge_scores

# Example usage for evaluation
sentiment_accuracy = evaluate_sentiment_model(bert_model, test_loader)
print("Sentiment Analysis Accuracy:", sentiment_accuracy)

rouge_scores = evaluate_summarization_model(t5_model, test_loader, t5_tokenizer)
print("ROUGE Scores:", rouge_scores)

NameError: name 'bert_model' is not defined