In [3]:
import pandas as pd

In [4]:
data = pd.read_csv('complaintData.csv')

In [5]:
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Cleaning function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

data['cleaned_reviews'] = data['complaints'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

# Transform the cleaned reviews
tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_reviews'])

In [7]:
from sklearn.decomposition import NMF

# Set the number of topics
num_topics = 4

# Initialize and fit the NMF model
nmf_model = NMF(n_components=num_topics, random_state=42)
nmf_model.fit(tfidf_matrix)

# Get the topic-word matrix
topic_words = nmf_model.components_


In [8]:
# Get the words corresponding to the topic-word matrix
feature_names = tfidf_vectorizer.get_feature_names_out()

# Display the top words for each topic
for topic_idx, topic in enumerate(topic_words):
    print(f"Topic {topic_idx}: ", [feature_names[i] for i in topic.argsort()[-10:]])


Topic 0:  ['debited', 'details', 'purchase', 'provide', 'refund', 'transaction', 'udemy', 'payment', 'account', 'visible']
Topic 1:  ['debited', 'confirmation', 'email', 'udemy', 'learning', 'added', 'got', 'deducted', 'course', 'money']
Topic 2:  ['complete', 'showed', 'purchase', 'support', 'course', 'udemy', 'website', 'present', 'portal', 'educational']
Topic 3:  ['huge', 'scam', 'improve', 'platform', 'feels', 'course', 'available', 'bought', 'isnt', 'ive']


In [9]:
# Assuming `nmf_model` is your fitted NMF model and `tfidf_matrix` is the TF-IDF representation of your reviews
topic_distribution = nmf_model.transform(tfidf_matrix)
import numpy as np

# Get the index of the topic with the highest weight for each review
dominant_topic = np.argmax(topic_distribution, axis=1)
# Define the mapping from topic index to category
topic_to_category = {
    0: 'Payment-related',
    1: 'Service-related',
    2: 'Product-related',
    3: 'Technical issues'
}

# Map the dominant topic index to the corresponding category
predicted_category = [topic_to_category[topic] for topic in dominant_topic]
# Assuming `data` is your original DataFrame containing the reviews
data['predicted_category'] = predicted_category


In [10]:
data['predicted_category']

0      Payment-related
1      Payment-related
2      Service-related
3      Service-related
4      Payment-related
            ...       
295    Payment-related
296    Payment-related
297    Service-related
298    Payment-related
299    Service-related
Name: predicted_category, Length: 300, dtype: object

In [11]:
import torch
print(torch.cuda.is_available())  # Should return True if CUDA is set up correctly
print(torch.version.cuda)          # Should return the CUDA version being used by PyTorch

True
12.1


In [14]:
data.head()

Unnamed: 0,titles,complaints,cleaned_reviews,predicted_category
0,Udemy — Amount deducted via UPI but course not...,I tried purchasing a course on Udemy by payin...,tried purchasing course udemy paying amount pa...,Payment-related
1,Udemy — Did not recieve refund. (complaint),I recently purchased a course on Udemy of val...,recently purchased course udemy value 449 rupe...,Payment-related
2,Udemy — Transaction Failed but money got debit...,Money got debited from my side but on udemy p...,money got debited side udemy page showed trans...,Service-related
3,Udemy — Refund for a course (complaint),I have purchased a course on udemy on complet...,purchased course udemy complete web design449r...,Service-related
4,Udemy — Payment successful but course not purc...,I purchased the following course on Udemy: ht...,purchased following course udemy httpswww udem...,Payment-related


In [15]:
from sklearn.model_selection import train_test_split

# Assuming 'Complaint_Text' is the column with text data and 'predicted_category' contains the labels
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['cleaned_reviews'].values,
    data['predicted_category'].values,
    test_size=0.2,
    random_state=42
)


In [16]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)

# Encode the labels
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)


  from .autonotebook import tqdm as notebook_tqdm


In [17]:
from transformers import BertForSequenceClassification

# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Convert encodings to tensors
train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels_tensor = torch.tensor(train_labels_encoded)

test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])
test_labels_tensor = torch.tensor(test_labels_encoded)

# Create DataLoader objects for training and validation
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels_tensor)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

test_data = TensorDataset(test_inputs, test_masks, test_labels_tensor)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)


In [19]:
from transformers import AdamW

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Set up training parameters
epochs = 3




In [22]:
from tqdm import tqdm

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop
for epoch in range(epochs):
    model.train()
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        batch_inputs, batch_masks, batch_labels = [b.to(device) for b in batch]
        model.zero_grad()

        # Forward pass
        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        loss.backward()

        # Optimization step
        optimizer.step()


Training Epoch 1:   0%|          | 0/15 [00:17<?, ?it/s]


RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Int'

In [21]:
# Convert batch_labels to LongTensor
batch_labels = batch_labels.long()

# Forward pass
outputs = model(input_ids=batch_inputs, attention_mask=batch_masks, labels=batch_labels)
loss = outputs.loss
