In [16]:
import pandas as pd
import numpy as np
import string
import nltk
import torch
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.nn import functional as F
from tqdm import tqdm

In [2]:
print(torch.cuda.is_available()) 
print(torch.version.cuda)    

True
12.1


In [3]:
data = pd.read_csv('complaintData.csv')

In [4]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  #Convert to lowercase
    text = ''.join([char for char in text if char not in string.punctuation])  #Remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words]) #Remove stopwords
    return text

data['cleaned_reviews'] = data['complaints'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#Initialize the TF-IDF Vectorizer
#tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
#tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_reviews'])

In [9]:
# #Set the number of topics
# num_topics = 4

# #Initialize and fit the NMF model
# nmf_model = NMF(n_components=num_topics, random_state=42)
# nmf_model.fit(tfidf_matrix)

# #Get the topic-word matrix
# topic_words = nmf_model.components_

In [10]:
# #Get the words corresponding to the topic-word matrix
# feature_names = tfidf_vectorizer.get_feature_names_out()

# #Display the top words for each topic
# for topic_idx, topic in enumerate(topic_words):
#     print(f"Topic {topic_idx}: ", [feature_names[i] for i in topic.argsort()[-10:]]) #argsort() returns indices in ascending order


Topic 0:  ['debited', 'details', 'purchase', 'provide', 'refund', 'transaction', 'udemy', 'payment', 'account', 'visible']
Topic 1:  ['debited', 'confirmation', 'email', 'udemy', 'learning', 'added', 'got', 'deducted', 'course', 'money']
Topic 2:  ['complete', 'showed', 'purchase', 'support', 'course', 'udemy', 'website', 'present', 'portal', 'educational']
Topic 3:  ['huge', 'scam', 'improve', 'platform', 'feels', 'course', 'available', 'bought', 'isnt', 'ive']


In [19]:
# #nmf_model is fitted NMF model and `tfidf_matrix` is the TF-IDF representation of reviews
# topic_distribution = nmf_model.transform(tfidf_matrix)

# #Get the index of the topic with the highest weight for each review
# dominant_topic = np.argmax(topic_distribution, axis=1)
# #Define the mapping from topic index to category
# topic_to_category = {
#     0: 'Payment-related',
#     1: 'Service-related',
#     2: 'Product-related',
#     3: 'Technical issues'
# }

# #Map the dominant topic index to the corresponding category
# predicted_category = [topic_to_category[topic] for topic in dominant_topic]
# data['predicted_category'] = predicted_category


In [5]:
reviews = data['cleaned_reviews'].astype(str).tolist()  #Convert to list of strings

#Create a Document-Term Matrix using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(reviews)

n_topics = 4  
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(dtm)

#Display the topics and their top words
feature_names = vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx}: ", [feature_names[i] for i in topic.argsort()[-10:]])

Topic 0:  ['learning', 'courses', 'added', 'transaction', 'account', 'paid', 'udemy', 'phonepe', 'payment', 'course']
Topic 1:  ['id', 'learning', 'account', 'email', 'showing', 'got', 'debited', 'money', 'course', 'udemy']
Topic 2:  ['pay', 'transaction', 'purchase', 'purchased', 'visible', 'refund', 'payment', 'account', 'course', 'udemy']
Topic 3:  ['failed', 'refund', 'got', 'rs', 'transaction', 'payment', 'deducted', 'money', 'udemy', 'course']


In [6]:
#Get the dominant topic for each document
doc_topic_dist = lda.transform(dtm)  #Get the topic distribution for each document
dominant_topic = doc_topic_dist.argmax(axis=1)  #Get the index of the dominant topic

#Define topic to category mapping
topic_to_category = {
    0: 'Product-related',
    1: 'Service-related',
    2: 'Payment-related',
    3: 'Technical issues'
}

# Map the dominant topic index to the corresponding category
predicted_category = [topic_to_category[topic] for topic in dominant_topic]
data['predicted_category'] = predicted_category

# Display the DataFrame with predicted categories
data[['cleaned_reviews', 'predicted_category']]

Unnamed: 0,cleaned_reviews,predicted_category
0,tried purchasing course udemy paying amount pa...,Product-related
1,recently purchased course udemy value 449 rupe...,Payment-related
2,money got debited side udemy page showed trans...,Service-related
3,purchased course udemy complete web design449r...,Payment-related
4,purchased following course udemy httpswww udem...,Payment-related
...,...,...
295,made payment via phonepe udemy amount rs 455 d...,Payment-related
296,payment via google pay udemy amount rs 490 deb...,Payment-related
297,hii tried buy course udemy shows transaction f...,Technical issues
298,purchased udemy course linux administration co...,Payment-related


In [7]:
data['predicted_category']

0       Product-related
1       Payment-related
2       Service-related
3       Payment-related
4       Payment-related
             ...       
295     Payment-related
296     Payment-related
297    Technical issues
298     Payment-related
299    Technical issues
Name: predicted_category, Length: 300, dtype: object

In [8]:
data.head()

Unnamed: 0,titles,complaints,cleaned_reviews,predicted_category
0,Udemy — Amount deducted via UPI but course not...,I tried purchasing a course on Udemy by payin...,tried purchasing course udemy paying amount pa...,Product-related
1,Udemy — Did not recieve refund. (complaint),I recently purchased a course on Udemy of val...,recently purchased course udemy value 449 rupe...,Payment-related
2,Udemy — Transaction Failed but money got debit...,Money got debited from my side but on udemy p...,money got debited side udemy page showed trans...,Service-related
3,Udemy — Refund for a course (complaint),I have purchased a course on udemy on complet...,purchased course udemy complete web design449r...,Payment-related
4,Udemy — Payment successful but course not purc...,I purchased the following course on Udemy: ht...,purchased following course udemy httpswww udem...,Payment-related


In [18]:
# Step 1: Encode the labels
label_encoder = LabelEncoder()
data['encoded_category'] = label_encoder.fit_transform(data['predicted_category'])

# Step 2: Tokenization using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 3. Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['cleaned_reviews'].tolist(), data['encoded_category'].tolist(), test_size=0.2, random_state=42
)

# 4. Tokenization using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class ReviewDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create dataset objects
train_dataset = ReviewDataset(train_texts, train_labels)
test_dataset = ReviewDataset(test_texts, test_labels)

# 5. Reduce batch size
batch_size = 8  # Reducing to 8 for smaller memory consumption

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 6. Model and optimizer setup
device = torch.device("cpu")  # Use CPU explicitly
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data['encoded_category'].unique()))
model = model.to(device)  # Ensure the model is on CPU

optimizer = AdamW(model.parameters(), lr=2e-5)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# 7. Training loop
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)  # Move inputs to CPU
        attention_mask = batch['attention_mask'].to(device)  # Move masks to CPU
        labels = batch['label'].to(device)  # Move labels to CPU

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Training Loss: {avg_train_loss:.4f}")

# 8. Evaluation loop
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)  # Move inputs to CPU
        attention_mask = batch['attention_mask'].to(device)  # Move masks to CPU
        labels = batch['label'].to(device)  # Move labels to CPU

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(F.softmax(logits, dim=1), dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Training Epoch 1/3: 100%|██████████| 30/30 [04:23<00:00,  8.77s/it]


Epoch 1 | Training Loss: 1.1920


Training Epoch 2/3: 100%|██████████| 30/30 [04:33<00:00,  9.12s/it]


Epoch 2 | Training Loss: 0.6702


Training Epoch 3/3: 100%|██████████| 30/30 [03:49<00:00,  7.64s/it]


Epoch 3 | Training Loss: 0.3062


Evaluating: 100%|██████████| 8/8 [00:16<00:00,  2.05s/it]

Test Accuracy: 100.00%





In [28]:
import torch
from transformers import BertTokenizer

# Initialize the tokenizer (make sure to use the same model as during training)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def classify_review(review, model):
    # Preprocess the review
    inputs = tokenizer(review, return_tensors='pt', padding=True, truncation=True, max_length=512)
    
    # Move inputs to the specified device (CPU in this case)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    
    # Make predictions
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(torch.softmax(logits, dim=1), dim=1)

    # Convert prediction to category
    predicted_category = predictions.item()  # Get the predicted class index
    return predicted_category

# Example usage
new_review = " Amount detucted from my account but the courses are not available in ""My learning"" section."  # Your review text
predicted_category_index = classify_review(new_review, model)

# Map index to category (assuming you have a list of categories)
categories = ['Payment-related', 'Service-related', 'Product-related', 'Technical issues']
predicted_category = categories[predicted_category_index]

print(f"The review is classified as: {predicted_category}")


The review is classified as: Service-related
