In [None]:
# NOTE(review): pyspark is not imported by any cell visible in this notebook — confirm this install is still needed.
!pip install pyspark




In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')




Mounted at /content/drive


In [None]:
import zipfile

# Archive to unpack and the directory its members are written into.
zip_path = "/content/complaints.csv.zip"
extract_path = "/content/"

# The context manager closes the archive handle once extraction finishes.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)



In [None]:
import pandas as pd
from collections import defaultdict

csv_path = "/content/complaints.csv"
chunksize = 10000
target_column = "Consumer complaint narrative"
label_column = "Product"
sub_product_column = "Sub-product"
issue_column = "Issue"
min_text_length = 50

# Dataset configurations. top_n_classes * max_samples_per_class is only an
# UPPER BOUND on the number of rows written: a class with fewer qualifying
# narratives than max_samples_per_class contributes fewer rows.
configs = {
    "120K": {"top_n_classes": 20, "max_samples_per_class": 6000},
}

# Column names (and order) of the CSV written for each configuration.
output_columns = ["Product", "Sub-product", "Issue", "Complaint Narrative"]

# Read only the label column once to rank classes by frequency.
full_df = pd.read_csv(csv_path, usecols=[label_column])
full_class_counts = full_df[label_column].value_counts()

for label, config in configs.items():
    print(f"\n🚀 Generating {label} dataset...")

    top_classes = full_class_counts.head(config["top_n_classes"]).index.tolist()
    max_per_class = config["max_samples_per_class"]
    class_counts = defaultdict(int)   # rows accepted so far, per class
    cleaned_data = []                 # one filtered DataFrame per chunk

    # Stream the CSV in chunks so the full file never has to fit in memory.
    for chunk in pd.read_csv(csv_path, chunksize=chunksize, usecols=[label_column, sub_product_column, issue_column, target_column]):
        chunk = chunk.dropna(subset=[target_column, label_column, sub_product_column, issue_column])

        # Vectorised equivalent of the per-row filter: a row qualifies when its
        # class is one of the top classes and its stripped text is long enough.
        labels = chunk[label_column]
        narratives = chunk[target_column].astype(str).str.strip()
        eligible = labels.isin(top_classes) & (narratives.str.len() >= min_text_length)
        eligible_labels = labels[eligible]

        if not eligible_labels.empty:
            # Enforce the per-class cap in file order: the k-th eligible row of
            # a class (0-based, within this chunk) is kept only while k is
            # below that class's remaining quota.
            rank_in_class = eligible_labels.groupby(eligible_labels).cumcount()
            remaining_quota = eligible_labels.map(lambda cls: max_per_class - class_counts[cls])
            keep_index = eligible_labels.index[(rank_in_class < remaining_quota).to_numpy()]

            if len(keep_index):
                kept = pd.DataFrame({
                    "Product": labels.loc[keep_index],
                    "Sub-product": chunk.loc[keep_index, sub_product_column],
                    "Issue": chunk.loc[keep_index, issue_column],
                    "Complaint Narrative": narratives.loc[keep_index],
                })
                cleaned_data.append(kept)
                for cls, added in kept["Product"].value_counts().items():
                    class_counts[cls] += int(added)

        # Stop early if all class targets met
        if all(class_counts[c] >= max_per_class for c in top_classes):
            break

    # Save each version. With no qualifying rows, still write the header so
    # the file can be read back with the expected columns.
    if cleaned_data:
        df_cleaned = pd.concat(cleaned_data, ignore_index=True)
    else:
        df_cleaned = pd.DataFrame(columns=output_columns)
    out_path = f"cleaned_complaints_{label.lower()}.csv"
    df_cleaned.to_csv(out_path, index=False)
    print(f"✅ Saved: {out_path} with shape {df_cleaned.shape}")



🚀 Generating 120K dataset...
✅ Saved: cleaned_complaints_120k.csv with shape (94850, 4)


In [None]:
import pandas as pd

In [None]:
# Reload the saved sample from disk and preview its first rows.
df_cleaned = pd.read_csv("/content/cleaned_complaints_120k.csv")
df_cleaned.head()


Unnamed: 0,Product,Sub-product,Issue,Complaint Narrative
0,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,usc section 1681 states that there must be wri...
1,Checking or savings account,Checking account,Managing an account,PNC Bank will not allow me to link my accounts...
2,Checking or savings account,Checking account,Managing an account,My account was suspended without warning or no...
3,Debt collection,Auto debt,Attempts to collect debt not owed,"In my initial letter dated XX/XX/2019, to Hyun..."
4,Debt collection,Credit card debt,False statements or representation,I am the consumer natural person making this r...


In [None]:
# Install the sentence-embedding library imported by the following cells.
# NOTE(review): boto3 is not imported by any cell visible in this notebook.
!pip install -U sentence-transformers
!pip install boto3  # only if you plan to upload to AWS S3


Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

Collecting boto3
  Downloading boto3-1.39.12-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.40.0,>=1.39.12 (from boto3)
  Downloading botocore-1.39.12-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.14.0,>=0.13.0 (from boto3)
  Downloading s3transfer-0.13.1-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.39.12-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.39.12-py3-none-any.whl (13.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.9/13.9 MB[0m [31m121.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.13.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.3/85.3 kB[0m [31m6.7 MB/s[0m eta [36m0

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
import os


In [None]:
# Load your dataset (change path if needed)
df = pd.read_csv('/content/cleaned_complaints_120k.csv')
# Keep exactly one text per CSV row (missing narratives become ""), so that
# row i of the saved embeddings always corresponds to row i of the CSV.
# Dropping NaN rows here would shift every later embedding and break the
# index-based lookups into the embedding matrix done further down.
texts = df['Complaint Narrative'].fillna("").astype(str).tolist()


In [None]:
# Sentence-embedding model shared by embed_batch below.
model = SentenceTransformer('all-MiniLM-L6-v2')

def embed_batch(text_batch):
    """Encode ``text_batch`` with the module-level ``model``.

    Forwards to ``model.encode`` with an inner ``batch_size`` of 32 and
    returns its result unchanged.
    """
    return model.encode(text_batch, batch_size=32)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def batchify(lst, n):
    """Yield consecutive slices of ``lst`` containing at most ``n`` items each."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

batch_size = 1000
batches = [piece for piece in batchify(texts, batch_size)]

# Encode batch by batch, keeping each batch's array and writing it to its own
# file as an intermediate checkpoint.
batch_arrays = []
for batch_no, batch in enumerate(tqdm(batches)):
    batch_emb = embed_batch(batch)
    batch_arrays.append(batch_emb)
    np.save(f'/content/embeddings_batch_{batch_no}.npy', batch_emb)

# Stack every batch into the final embedding array (an empty array when there
# were no texts at all) and persist it.
all_embeddings = np.concatenate(batch_arrays) if batch_arrays else np.array([])
np.save('/content/final_complaint_embeddings.npy', all_embeddings)


100%|██████████| 95/95 [01:07<00:00,  1.40it/s]


In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

# Sentence-embedding model used for the categorical columns.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Cleaned complaints sample.
df = pd.read_csv('/content/cleaned_complaints_120k.csv')

# Which columns get embedded, and how many rows go into each saved batch file.
columns_to_embed = ['Product', 'Issue', 'Sub-product']
batch_size = 5000

# Directory that receives the per-batch and combined .npy files.
os.makedirs("/content/bert_batches", exist_ok=True)

def process_column_in_batches(col_name):
    """Embed one column of ``df`` and save the vectors as .npy files.

    Missing values are embedded as empty strings. One file is written per
    batch of ``batch_size`` rows, plus a combined ``*_full.npy`` file holding
    every row. Reads the module-level ``df``, ``model`` and ``batch_size``.
    """
    print(f"\n🔁 Embedding column: {col_name}")
    texts = df[col_name].fillna("").astype(str).tolist()
    # File-name stem: lower-cased column name with spaces turned into underscores.
    file_stem = col_name.lower().replace(' ', '_')

    all_embeddings = []
    batch_starts = range(0, len(texts), batch_size)
    for batch_no, start in enumerate(tqdm(batch_starts)):
        batch_embeddings = model.encode(
            texts[start:start + batch_size],
            batch_size=32,
            show_progress_bar=False,
        )
        all_embeddings.extend(batch_embeddings)

        # Save individual batch file
        np.save(f"/content/bert_batches/{file_stem}_batch_{batch_no}.npy", batch_embeddings)

    # Save combined embeddings for column
    final_path = f"/content/bert_batches/{file_stem}_full.npy"
    np.save(final_path, all_embeddings)
    print(f"✅ Final saved: {final_path} | Shape: {np.array(all_embeddings).shape}")

# Embed every configured column in turn.
for column in columns_to_embed:
    process_column_in_batches(column)



🔁 Embedding column: Product


100%|██████████| 19/19 [00:17<00:00,  1.08it/s]


✅ Final saved: /content/bert_batches/product_full.npy | Shape: (94850, 384)

🔁 Embedding column: Issue


100%|██████████| 19/19 [00:17<00:00,  1.07it/s]


✅ Final saved: /content/bert_batches/issue_full.npy | Shape: (94850, 384)

🔁 Embedding column: Sub-product


100%|██████████| 19/19 [00:17<00:00,  1.08it/s]


✅ Final saved: /content/bert_batches/sub-product_full.npy | Shape: (94850, 384)


In [None]:
import numpy as np

# Load the combined narrative embeddings from the .npy file on disk.
embeddings = np.load('/content/final_complaint_embeddings.npy')


In [None]:
# Print the first three embedding vectors, each followed by a separator line.
for ordinal in (1, 2, 3):
    print(f"Embedding {ordinal}:")
    print(embeddings[ordinal - 1])
    print("------")


Embedding 1:
[-7.65363034e-03  1.34724928e-02 -1.28066028e-02 -4.10553440e-02
 -1.24950437e-02  2.21050773e-02  4.08740155e-03 -1.30390665e-02
 -3.97963710e-02 -9.48979557e-02 -2.89754439e-02  1.87887549e-02
  6.27515139e-03 -3.10901646e-02 -2.41270624e-02  5.45572769e-03
  1.87357161e-02 -5.98457344e-02 -9.98271536e-03  5.38122058e-02
  2.33006924e-02  1.67579856e-02 -9.03921053e-02  2.49221772e-02
 -4.93298247e-02 -2.87803616e-02  2.78138258e-02  4.07855287e-02
  3.22599970e-02 -2.18287297e-02  4.54926305e-02  4.45146263e-02
  4.24192585e-02  3.89619805e-02  1.18799590e-01 -7.12135062e-02
  4.79552709e-02 -9.27921459e-02 -1.01447878e-02 -5.93474135e-02
 -4.77607474e-02 -3.43499444e-02 -3.38140246e-03  4.63837869e-02
 -3.41693289e-03 -6.88662892e-03 -1.81457140e-02  1.90298762e-02
  1.13734314e-02  6.45489097e-02 -1.05569446e-02 -1.81838702e-02
  2.43690866e-03  5.19127212e-02 -7.40112215e-02 -5.78749888e-02
  4.67807464e-02 -1.41890794e-02 -1.40762962e-02  1.02384919e-02
  1.10074326

In [None]:
import pandas as pd
import numpy as np

# Load BERT embeddings of complaint narratives (expected: one row per CSV row).
narrative_embeddings = np.load('/content/final_complaint_embeddings.npy')

# Load the main dataset and check it lines up with the embeddings row for row.
complaints_raw = pd.read_csv('/content/cleaned_complaints_120k.csv')
if len(narrative_embeddings) != len(complaints_raw):
    raise ValueError(
        f"Embeddings have {len(narrative_embeddings)} rows but the CSV has "
        f"{len(complaints_raw)}; they must be row-aligned."
    )

# Drop incomplete rows, drop the SAME rows from the embeddings, then reset the
# index so that row labels of complaints_df equal row positions in X_narrative.
# Without this, any row removed by dropna() leaves features and labels with
# different lengths.
complaints_df = complaints_raw[['Complaint Narrative', 'Product', 'Sub-product', 'Issue']].dropna()
X_narrative = narrative_embeddings[complaints_df.index.to_numpy()]
complaints_df = complaints_df.reset_index(drop=True)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Turn product names into integer class ids.
le_product = LabelEncoder()
y_product = le_product.fit_transform(complaints_df['Product'])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train_prod, X_test_prod, y_train_prod, y_test_prod = train_test_split(
    X_narrative,
    y_product,
    test_size=0.2,
    random_state=42,
)

# Fit the product-level classifier on the training portion.
clf_product = LogisticRegression(max_iter=1000)
clf_product.fit(X_train_prod, y_train_prod)

# Report per-class precision/recall/F1 on the held-out rows.
print("🎯 Product Classification Report")
y_pred_prod = clf_product.predict(X_test_prod)
print(
    classification_report(
        y_test_prod,
        y_pred_prod,
        target_names=le_product.classes_,
    )
)


🎯 Product Classification Report
                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.58      0.60      0.59      1214
                                                 Checking or savings account       0.54      0.59      0.56      1223
                                                               Consumer Loan       0.46      0.41      0.43      1205
                                                                 Credit card       0.47      0.44      0.45      1194
                                                 Credit card or prepaid card       0.47      0.47      0.47      1211
                         Credit reporting or other personal consumer reports       0.60      0.64      0.62      1164
Credit reporting, credit repair services, or other personal consumer reports       0.57      0.58      0.57      1181
                       

In [None]:
from collections import defaultdict

from sklearn.dummy import DummyClassifier

# One sub-product classifier per product:
#   subproduct_models[product] == {'model': fitted classifier, 'encoder': LabelEncoder}
subproduct_models = defaultdict(dict)

# sort=False keeps products in order of first appearance; one grouping pass
# replaces a full-frame scan per product.
for product, sub_df in complaints_df.groupby('Product', sort=False):
    # Index labels are used as row positions into X_narrative.
    # NOTE(review): assumes complaints_df's index labels match X_narrative row positions — confirm.
    X_sub = X_narrative[sub_df.index]

    le_sub = LabelEncoder()
    y_sub = le_sub.fit_transform(sub_df['Sub-product'])

    if len(le_sub.classes_) < 2:
        # LogisticRegression refuses to fit on a single class; fall back to a
        # constant predictor so this product still gets a usable model.
        clf_sub = DummyClassifier(strategy="most_frequent")
    else:
        clf_sub = LogisticRegression(max_iter=1000)
    clf_sub.fit(X_sub, y_sub)

    subproduct_models[product]['model'] = clf_sub
    subproduct_models[product]['encoder'] = le_sub


In [None]:
from sklearn.dummy import DummyClassifier

# One issue classifier per sub-product:
#   issue_models[sub_product] == {'model': fitted classifier, 'encoder': LabelEncoder}
# Rows are grouped by sub-product name alone, regardless of their product.
issue_models = defaultdict(dict)

# sort=False keeps sub-products in order of first appearance; one grouping
# pass replaces a full-frame scan per sub-product.
for sub_product, issue_df in complaints_df.groupby('Sub-product', sort=False):
    # Index labels are used as row positions into X_narrative.
    # NOTE(review): assumes complaints_df's index labels match X_narrative row positions — confirm.
    X_issue = X_narrative[issue_df.index]

    le_issue = LabelEncoder()
    y_issue = le_issue.fit_transform(issue_df['Issue'])

    if len(le_issue.classes_) < 2:
        # LogisticRegression refuses to fit on a single class; fall back to a
        # constant predictor so this sub-product still gets a usable model.
        clf_issue = DummyClassifier(strategy="most_frequent")
    else:
        clf_issue = LogisticRegression(max_iter=1000)
    clf_issue.fit(X_issue, y_issue)

    issue_models[sub_product]['model'] = clf_issue
    issue_models[sub_product]['encoder'] = le_issue


In [15]:
from sentence_transformers import SentenceTransformer

# Embedding model used to vectorise free-text input at prediction time.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

def _predict_label(entry, embedding):
    """Return the decoded prediction from a {'model', 'encoder'} entry, or None if the entry is unusable."""
    if not entry or 'model' not in entry or 'encoder' not in entry:
        return None
    encoded = entry['model'].predict(embedding)[0]
    return entry['encoder'].inverse_transform([encoded])[0]

def predict_full_pipeline(user_input):
    """Predict product, sub-product and issue for one complaint text.

    The text is embedded once; the product classifier picks a product, the
    sub-product model registered for that product picks a sub-product, and the
    issue model registered for that sub-product picks an issue. Each step is
    printed as it is resolved.

    Args:
        user_input: Complaint text to classify.

    Returns:
        Tuple ``(product, sub_product, issue)``. A level with no trained model
        available is reported as ``"Unknown"``.
    """
    embedding = embed_model.encode([user_input])

    # Predict Product
    pred_prod_enc = clf_product.predict(embedding)[0]
    pred_prod = le_product.inverse_transform([pred_prod_enc])[0]
    print(f"🧭 Product: {pred_prod}")

    # Predict Sub-product. .get() is used instead of [...] because the model
    # registries are defaultdicts: indexing an unseen key would insert an empty
    # entry and then fail with KeyError on 'model'.
    pred_sub = _predict_label(subproduct_models.get(pred_prod), embedding)
    sub_available = pred_sub is not None
    if sub_available:
        print(f"🔍 Sub-product: {pred_sub}")
    else:
        pred_sub = "Unknown"
        print("⚠️ Sub-product prediction not available for this product.")

    # Predict Issue (only meaningful when a sub-product was actually predicted)
    issue_entry = issue_models.get(pred_sub) if sub_available else None
    pred_issue = _predict_label(issue_entry, embedding)
    if pred_issue is not None:
        print(f"❗ Issue: {pred_issue}")
    else:
        pred_issue = "Unknown"
        print("⚠️ Issue prediction not available for this sub-product.")

    return pred_prod, pred_sub, pred_issue


In [16]:
# Run the three-level prediction on a sample complaint; the returned tuple is shown as the cell output.
user_input = "I was charged twice on my credit card and no one helped me resolve it."
predict_full_pipeline(user_input)


🧭 Product: Credit card or prepaid card
🔍 Sub-product: General-purpose credit card or charge card
❗ Issue: Problem with a purchase shown on your statement


('Credit card or prepaid card',
 'General-purpose credit card or charge card',
 'Problem with a purchase shown on your statement')

In [17]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# STEP 1: Load Data and Embeddings
complaints_df = pd.read_csv('/content/cleaned_complaints_120k.csv')
complaints_df = complaints_df[['Complaint Narrative', 'Product', 'Sub-product', 'Issue']].dropna()

# Load pre-generated narrative embeddings
X_narrative = np.load('/content/final_complaint_embeddings.npy')

# Keep only the embedding rows whose CSV row survived dropna, so that
# position i in X_narrative corresponds to complaints_df.iloc[i].
X_narrative = X_narrative[complaints_df.index]

# STEP 2: Apply KMeans Clustering (n_clusters semantic groups)
n_clusters = 50
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(X_narrative)

# STEP 3: For each cluster centre, find the position of the closest complaint
closest_indices, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X_narrative)

# STEP 4: Extract Representative Complaints (one per cluster, in centre order)
top_50_semantic = complaints_df.iloc[closest_indices][['Product', 'Sub-product', 'Issue']].copy()
# Number clusters 1..n_clusters; derived from n_clusters so that changing the
# cluster count above does not cause a length mismatch here.
top_50_semantic['Cluster'] = range(1, n_clusters + 1)

# STEP 5: Save Output
top_50_semantic.to_csv('/content/top_50_semantic_clusters.csv', index=False)
print("✅ Saved: /content/top_50_semantic_clusters.csv")
display(top_50_semantic.head())


✅ Saved: /content/top_50_semantic_clusters.csv


Unnamed: 0,Product,Sub-product,Issue,Cluster
45451,"Money transfer, virtual currency, or money ser...",Mobile or digital wallet,Unauthorized transactions or other transaction...,1
66521,Vehicle loan or lease,Loan,Struggling to pay your loan,2
714,Debt collection,I do not know,Attempts to collect debt not owed,3
49773,Credit card,General-purpose credit card or charge card,Problem with a company's investigation into an...,4
661,"Payday loan, title loan, or personal loan",Installment loan,Charged fees or interest you didn't expect,5
