<a href="https://colab.research.google.com/github/preronagit/Complaint-Management-System/blob/main/Complaint_Management_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

latest code


In [None]:
from google.colab import files
import io
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK English stop-word list and keep it as a set so that the
# membership tests performed by preprocess() are constant-time.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Ask the user for the complaints CSV through the Colab upload widget.
# The result is indexed by file name and holds the raw bytes of each file.
uploaded = files.upload()

# The upload dialog can be dismissed without choosing a file, leaving the
# mapping empty. Fail with an explicit message instead of an opaque
# "list index out of range".
if not uploaded:
    raise RuntimeError("No file was uploaded; please upload the complaints CSV.")

# Only the first uploaded file is read; any further files are ignored.
file_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[file_name]))


# Preprocessing
def preprocess(text):
    """Lowercase *text* and remove English stop words.

    The text is split on whitespace, tokens present in the module-level
    ``stop_words`` set are discarded, and the remaining tokens are joined
    back together with single spaces.

    Any value that is not a ``str`` (for example a missing cell) produces
    an empty string.
    """
    if not isinstance(text, str):
        return ''

    tokens = text.lower().split()
    kept = (token for token in tokens if token not in stop_words)
    return ' '.join(kept)

# Rows without a category label cannot be used for supervised training.
df.dropna(subset=['Category'], inplace=True)

# Normalise every complaint description with preprocess(); non-string cells
# come back as empty strings.
df['cleaned_complaint'] = df['Complaint Description'].map(preprocess)

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and seeded so that it is repeatable.
X = df['cleaned_complaint']
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Pre-trained DistilBERT tokenizer and encoder used to embed the complaints.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_distilbert_embeddings(text):
    """Return a mean-pooled DistilBERT embedding for *text*.

    Args:
        text: The text to embed. It is tokenized with the module-level
            ``tokenizer``; input longer than 128 tokens is truncated.

    Returns:
        A NumPy array with one row per input text, obtained by averaging
        the encoder's last hidden state over the token dimension.

    Note:
        The average is taken over every token position. If a batch of
        texts of different lengths is passed, padding positions are
        included in the average.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    # Inference only: disable autograd so no computation graph is recorded
    # for each complaint. This lowers memory use and speeds up the forward
    # pass without changing the resulting values.
    with torch.no_grad():
        outputs = model(**inputs)
        pooled = outputs.last_hidden_state.mean(dim=1)
    return pooled.numpy()

# Embed every complaint individually and stack the resulting rows into one
# matrix per partition.
X_train_embeddings = np.vstack([get_distilbert_embeddings(text) for text in X_train])
X_test_embeddings = np.vstack([get_distilbert_embeddings(text) for text in X_test])

# Dimensionality reduction with PCA.
# n_components can be tuned to balance dimensionality against performance.
# random_state is pinned because the default svd_solver='auto' can select the
# randomized solver, which would otherwise make the projection (and every
# downstream metric) vary from run to run. The split and the classifier are
# already seeded with the same value.
pca = PCA(n_components=50, random_state=42)
X_train_pca = pca.fit_transform(X_train_embeddings)
# The test set is only projected with the components learned on the training
# set, so no information from it leaks into the fit.
X_test_pca = pca.transform(X_test_embeddings)

# Gradient Boosting Classifier trained on the reduced embeddings.
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbc.fit(X_train_pca, y_train)

# Evaluate on the held-out test set.
y_pred = gbc.predict(X_test_pca)
print("Test Accuracy:", accuracy_score(y_test, y_pred)*100, "%")
print(classification_report(y_test, y_pred))

# Function to classify new complaints
def classify_complaint(complaint, model, pca):
    """Predict the category of a single free-text complaint.

    Args:
        complaint: Raw complaint text.
        model: Fitted classifier exposing ``predict``. This parameter only
            names the classifier; the embedding step still uses the
            module-level DistilBERT encoder.
        pca: Fitted transformer exposing ``transform``, applied to the
            embedding before prediction.

    Returns:
        The first element of ``model.predict`` for the complaint.
    """
    embedding = get_distilbert_embeddings(preprocess(complaint))
    predictions = model.predict(pca.transform(embedding))
    return predictions[0]

# Function to get category recommendations based on input
def recommend_categories(input_text, n_recommendations=3):
    """Recommend categories from the training complaints most similar to the input.

    The input is cleaned, embedded and projected with the module-level
    ``pca``; cosine similarity against ``X_train_pca`` then ranks the
    training complaints.

    Args:
        input_text: Raw complaint text.
        n_recommendations: Number of nearest training complaints to inspect.

    Returns:
        The distinct categories of the ``n_recommendations`` most similar
        training complaints, ordered from most to least similar. The list
        may be shorter than ``n_recommendations`` when several neighbours
        share a category.
    """
    input_cleaned = preprocess(input_text)
    input_embedding = get_distilbert_embeddings(input_cleaned)
    input_pca = pca.transform(input_embedding)

    # Rank training complaints by cosine similarity, best match first.
    similarities = cosine_similarity(input_pca, X_train_pca)[0]
    nearest_indices = np.argsort(similarities)[::-1][:n_recommendations]
    neighbour_categories = y_train.iloc[nearest_indices].tolist()

    # De-duplicate with dict.fromkeys rather than set(): a set discards the
    # similarity ranking and yields an arbitrary order that can change
    # between runs, whereas dict keys keep first-seen order.
    return list(dict.fromkeys(neighbour_categories))

# Main loop for user input
def main():
    """Run the interactive complaint categorisation prompt.

    Repeatedly reads a complaint from standard input, then prints the
    categories recommended by ``recommend_categories`` and the category
    predicted by the trained gradient boosting classifier. The loop ends
    when the user types ``exit`` (in any letter case, surrounding
    whitespace ignored) or when the input stream is closed or interrupted.
    """
    print("Welcome to the Complaint Categorization System!")
    print("Type your complaint below (or type 'exit' to quit):")

    while True:
        try:
            # Strip so that inputs such as "exit " or " exit" still quit.
            user_input = input("Complaint: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed or interrupted input: leave cleanly rather than with
            # a traceback.
            print()
            print("Exiting the system. Thank you!")
            break

        if user_input.lower() == 'exit':
            print("Exiting the system. Thank you!")
            break

        if not user_input:
            # An empty complaint carries no information; do not spend a
            # model pass on it or print a meaningless prediction.
            print("Please enter a complaint (or type 'exit' to quit).")
            continue

        # Recommend categories
        recommended_categories = recommend_categories(user_input)
        print("Recommended Categories:", recommended_categories)

        # Classify using the best model
        predicted_category = classify_complaint(user_input, gbc, pca)
        print("Predicted Category:", predicted_category)
        print()


# Start the interactive prompt only when this file is executed directly.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Saving Filtered data.csv to Filtered data.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Test Accuracy: 70.79953650057938 %
                         precision    recall  f1-score   support

                  Civil       0.59      0.67      0.63       242
             Electrical       0.82      0.83      0.82       190
           Housekeeping       0.73      0.56      0.64       109
            Steel Works       0.33      0.15      0.21        13
Water and Plumbing Work       0.80      0.81      0.81       230
               Woodwork       0.56      0.53      0.55        79

               accuracy                           0.71       863
              macro avg       0.64      0.59      0.61       863
           weighted avg       0.71      0.71      0.71       863

Welcome to the Complaint Categorization System!
Type your complaint below (or type 'exit' to quit):
Complaint: my toilet sink is broken
Recommended Categories: ['Civil']
Predicted Category: Civil

Complaint: there is a dead cat on the balcony
Recommended Categories: ['Civil', 'Woodwork', 'Housekeeping']
Predict