In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm.notebook import tqdm
tqdm.pandas()
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=DeprecationWarning)
import nltk
from textblob import TextBlob
import matplotlib.pyplot as plt
import string

Topic Modelling using LDA and LSI:  https://www.kaggle.com/shirshmall/topic-modelling-lda-lsi-custom-complaints

# Import Data

In [2]:
data = pd.read_csv("/kaggle/input/consumer-complaint/complaints.csv")
data = data[data["Issue"].isnull()==False]
data.reset_index(inplace=True, drop=True)

In [3]:
# data["Consumer complaint narrative"][np.random.randint(len(data),size=1)].values
print("Count of unique Issues: ", len(data["Issue"].unique()))
data = data[0:3000]

Count of unique Issues:  165


# Data Cleaning

In [4]:
# data["Consumer complaint narrative"]

# Lowercase
data["Issue"] = data["Issue"].progress_apply(lambda x: " ".join(x.lower() for x in x.split()))

# Removing Punctuation
data["Issue"] = data["Issue"].progress_apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

# # Removal of word having two or more "x" letters
data["Issue"] = data["Issue"].progress_apply(lambda x: " ".join(x for x in x.split() if "x" not in x))

# Removal of numbers and words having numbers
data["Issue"] = data["Issue"].progress_apply(lambda x: " ".join(x for x in x.split() if not any(c.isdigit() for c in x)))

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

In [5]:
data = data[["Issue", "Consumer complaint narrative"]]
data = data[(data["Issue"]!="")] 
data.reset_index(inplace=True, drop=True)

In [6]:
# Save cleaned data
data.to_csv("complaints_cleaned.csv", index=False)

# BERTopic - Topic Modeling using BERT, UMap, HDBSCAN

In [7]:
!pip install -U accelerate
!pip install -U transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install sacrebleu
!pip install evaluate
!pip install bertopic
import transformers
import accelerate
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from sklearn.datasets import fetch_20newsgroups
from gensim import corpora

Collecting accelerate
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.12.0
    Uninstalling accelerate-0.12.0:
      Successfully uninstalled accelerate-0.12.0
Successfully installed accelerate-0.22.0
Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.30.1
    Uninstalling transformers-4.30.1:
      Successfully uninstalled transformers-4.30.1
Successfully installed transformers-4.32.0
[0mCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py

In [8]:
# Load cleaned data
data = pd.read_csv("/kaggle/working/complaints_cleaned.csv")
column = "Issue" #  "Consumer complaint narrative"
data

Unnamed: 0,Issue,Consumer complaint narrative
0,incorrect information on your report,
1,incorrect information on your report,
2,problem with a credit reporting companys inves...,
3,incorrect information on your report,
4,improper use of your report,
...,...,...
2995,improper use of your report,
2996,incorrect information on your report,
2997,improper use of your report,
2998,incorrect information on your report,


In [9]:
# Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Create topic representation
ctfidf_model = ClassTfidfTransformer()

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [10]:
topic_model = BERTopic( embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model,
                      vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, nr_topics=10, min_topic_size = 10)

In [11]:
# Training
topics, probabilities = topic_model.fit_transform(data[column])

In [12]:
documents = pd.DataFrame({"Document": data[column], "ID": range(len(data[column])),"Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, texts=tokens, corpus=corpus, dictionary=dictionary, coherence='u_mass')
u_mass_score = coherence_model.get_coherence()

coherence_model = CoherenceModel(topics=topic_words, texts=tokens, corpus=corpus, dictionary=dictionary, coherence='c_v')
cv_score = coherence_model.get_coherence()
print(" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
print( "\n \n u_mass score: ", u_mass_score, "  |  ", "cv_score: ", cv_score )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

 
 u_mass score:  -0.26619367608106637   |   cv_sco

In [13]:
# Visualizing the generated topics is using a barchart
topic_model.visualize_barchart()

In [14]:
# Results - Interactive graph
topic_model.visualize_topics()

***

In [15]:
print("yo")

yo
