In [45]:
import logging
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [9]:
# Versioned location of the preprocessed ticket dataset.
data_version = 1
data_path_processed = "../tracking/data/data_processed"
file_name = f"tickets_classification_eng_{data_version}.csv"
file_path = os.path.join(data_path_processed, file_name)

# Read the CSV into a DataFrame and show its (rows, columns) shape.
df = pd.read_csv(file_path)
df.shape

(18963, 3)

In [10]:
# Configure root logging at INFO level and obtain this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Vectorizer that skips terms found in fewer than 2 documents or in more
# than 95% of the documents.
tfidf = TfidfVectorizer(max_df=0.95, min_df=2)

In [17]:
# Keep only the tickets whose processed text is present (drop rows where it is NaN).
df = df.dropna(axis=0, how="any", subset=["processed_text"])

In [18]:
# Report the frame's shape after filtering, then preview a reproducible
# sample of the processed text. The sample has to be the cell's last
# expression: a bare expression on an earlier line is evaluated and then
# silently discarded, so it would never be displayed.
print(df.shape)
df["processed_text"].sample(5, random_state=42)

(18961, 3)

In [19]:
# Fit the vectorizer on the processed ticket text and keep the resulting
# document-term matrix.
dtm = tfidf.fit_transform(df.processed_text)

# Log the size of the learned vocabulary.
len_vocab = len(tfidf.vocabulary_)
logger.info(f"TF-IDF vectorizer fitted with {len_vocab}  unique words")

INFO:__main__:TF-IDF vectorizer fitted with 7167  unique words


In [None]:
# Factorize the document-term matrix into three components; the fixed
# random_state keeps the factorization reproducible between runs.
nmf = NMF(
    n_components=3,
    random_state=123,
)

# W holds one row per document and one column per component;
# H holds one row per component (used later as per-topic term weights).
W = nmf.fit_transform(dtm)
H = nmf.components_

In [52]:
# Show both factor matrices side by side (NumPy abbreviates the large arrays).
(W, H)

(array([[0.02645237, 0.01315789, 0.02177922],
        [0.00856271, 0.01632226, 0.00320532],
        [0.        , 0.06172149, 0.        ],
        ...,
        [0.01893377, 0.02293754, 0.01144488],
        [0.03037479, 0.02187064, 0.00764549],
        [0.00050097, 0.0553455 , 0.07718023]]),
 array([[6.41588415e-04, 2.66501462e-04, 0.00000000e+00, ...,
         3.65930494e-03, 1.24901995e-03, 2.62607675e-04],
        [0.00000000e+00, 4.67068206e-06, 9.14965100e-04, ...,
         6.01930890e-03, 0.00000000e+00, 5.07276639e-04],
        [2.22356154e-04, 3.50283715e-04, 3.53713491e-04, ...,
         5.38955214e-03, 2.74664513e-03, 0.00000000e+00]]))

In [51]:
# Feature names of the fitted vectorizer as a NumPy array, so that terms can
# be looked up by integer position later on.
vocab = np.array(tfidf.get_feature_names_out())
vocab

array(['00', '000', '11', ..., 'zip', 'zone', 'zoom'], dtype=object)

In [53]:
def top_words(t, n_top=15):
    """Return the ``n_top`` highest-weighted vocabulary terms for one topic.

    Parameters
    ----------
    t : array-like
        Weights of a single topic, one entry per position in ``vocab``.
    n_top : int, default 15
        Number of terms to return.

    Returns
    -------
    list
        Terms taken from the notebook-level ``vocab`` array, ordered from the
        largest weight to the smallest.
    """
    # argsort is ascending, so reverse it and keep the first n_top positions.
    strongest = np.argsort(t)[::-1][:n_top]
    return [vocab[i] for i in strongest]


# One list of top terms per row of H, then one space-joined string per topic
# for a compact display.
topic_words = [top_words(t) for t in H]
topics = [" ".join(t) for t in topic_words]
topics

['account bank check money chase deposit fund day claim fee branch call transact number charg',
 'credit card report chase inquiri charg account disput compani score letter author fraud inform use',
 'payment loan chase mortgag month home interest time pay year modif rate amount fee letter']

In [67]:
# Dominant topic of each document: the column of W holding the largest weight.
top_topics = W.argmax(axis=1)

# Labels for the document-topic table: one column per component and one row
# per ticket, numbered by position.
col_names = ["topic" + str(i) for i in range(nmf.n_components)]
tickets_names = ["ticket_" + str(i) for i in range(len(df))]

# Document-topic weights rounded to two decimals, plus the dominant topic id.
df_doc_topics = pd.DataFrame(W.round(2), index=tickets_names, columns=col_names)
df_doc_topics["relevant_topics"] = top_topics

# Attach the dominant topic id to the tickets themselves and preview a few rows.
df["relevant_topics"] = top_topics
df.sample(5, random_state=42)

Unnamed: 0,complaint_what_happened,ticket_classification,processed_text,relevant_topics
10644,On XX/XX/XXXX I sent via Certified mail to JPC...,Mortgage + Conventional home mortgage,mail jpchase research department/archives/prop...,2
17305,XXXX appraisal fee not refunded after closing ...,Bank account or service + Other bank product/s...,fee refund refin attempt bank zip code.th fee ...,0
8250,My car was hit by an uninsured drunk driver wh...,"Credit reporting, credit repair services, or o...",car drunk driver park middl night insur compan...,2
16343,"On XX/XX/2018, at approx XXXX CT, I spoke with...",Credit card or prepaid card + General-purpose ...,2018 ct card servic supervisor telephon center...,1
14018,car was total ins co sent ck over. Bank needs ...,Consumer Loan + Vehicle loan,car total co ck bank need process gap claim wa...,0


In [68]:
# Human-readable label for each topic id produced by the factorization.
dict_mapping = {
    0: "Bank Account Services",
    1: "Credit Report or Prepaid Card",
    2: "Mortgage/Loan",
}

# Fail loudly instead of silently writing NaN for a topic id without a label.
unlabelled = set(np.unique(top_topics).tolist()) - set(dict_mapping)
assert not unlabelled, f"no label defined for topic ids: {sorted(unlabelled)}"

# Build the labels from the integer ids in top_topics rather than from the
# column being overwritten: mapping the column onto itself only works once,
# because on a second run the label strings are not keys of dict_mapping and
# every row would become NaN.
df["relevant_topics"] = pd.Series(top_topics, index=df.index).map(dict_mapping)
df.sample(5, random_state=42)

Unnamed: 0,complaint_what_happened,ticket_classification,processed_text,relevant_topics
10644,On XX/XX/XXXX I sent via Certified mail to JPC...,Mortgage + Conventional home mortgage,mail jpchase research department/archives/prop...,Mortgage/Loan
17305,XXXX appraisal fee not refunded after closing ...,Bank account or service + Other bank product/s...,fee refund refin attempt bank zip code.th fee ...,Bank Account Services
8250,My car was hit by an uninsured drunk driver wh...,"Credit reporting, credit repair services, or o...",car drunk driver park middl night insur compan...,Mortgage/Loan
16343,"On XX/XX/2018, at approx XXXX CT, I spoke with...",Credit card or prepaid card + General-purpose ...,2018 ct card servic supervisor telephon center...,Credit Report or Prepaid Card
14018,car was total ins co sent ck over. Bank needs ...,Consumer Loan + Vehicle loan,car total co ck bank need process gap claim wa...,Bank Account Services
