In [15]:
import datetime
import logging
import json
import os

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag


In [2]:
# Configure the root logger so that INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module; the cell output shows it as "__main__".
logger = logging.getLogger(__name__)

# Bare last expression: displays the logger's repr as the cell output.
logger

<Logger __main__ (INFO)>

In [3]:
# Fetch the NLTK resources used by this notebook, in the same order as before.
# NOTE(review): newer NLTK releases may expect differently named tokenizer/tagger
# resources — confirm against the installed version.
for resource_id in (
    "averaged_perceptron_tagger",
    "stopwords",
    "punkt",
    "wordnet",
):
    nltk.download(resource_id)

# Module-level helpers shared by the text-processing functions.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/legion_5/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/legion_5/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/legion_5/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/legion_5/nltk_data...


In [4]:
# Base name (without extension) of the raw tickets file; ".json" is appended when it is loaded.
file_name = "tickets_classification_eng"

def read_json(path: str, file_name: str):
    """Load a JSON file and flatten it into a DataFrame.

    Args:
        path: Directory that contains the file.
        file_name: Name of the JSON file, including its extension.

    Returns:
        A tuple ``(df_tickets, datos)``: ``datos`` is the object parsed by
        ``json.load`` and ``df_tickets`` is ``pd.json_normalize(datos)``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    file_path = os.path.join(path, file_name)
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which can garble or reject non-ASCII text.
    with open(file_path, "r", encoding="utf-8") as file:
        datos = json.load(file)
    df_tickets = pd.json_normalize(datos)
    return df_tickets, datos

# Relative locations of the raw and processed datasets.
PATH_DATA_RAW = "../tracking/data/data_raw"
PATH_DATA_PROCESSED = "../tracking/data/data_processed"
name_data_input = file_name

# Load the raw tickets: flattened DataFrame plus the parsed JSON object.
df_tickets, json_tickets = read_json(PATH_DATA_RAW, name_data_input + ".json")

In [5]:
# Reproducible peek at five raw rows (fixed random_state).
df_tickets.sample(5, random_state=42)

Unnamed: 0,_index,_type,_id,_score,_source.tags,_source.zip_code,_source.complaint_id,_source.issue,_source.date_received,_source.state,...,_source.company_response,_source.company,_source.submitted_via,_source.date_sent_to_company,_source.company_public_response,_source.sub_product,_source.timely,_source.complaint_what_happened,_source.sub_issue,_source.consumer_consent_provided
35387,complaint-public-v2,complaint,41,0.0,,63031,41,"Loan modification,collection,foreclosure",2012-05-21T12:00:00-05:00,MO,...,Closed with explanation,JPMORGAN CHASE & CO.,Phone,2012-05-29T12:00:00-05:00,,Conventional fixed mortgage,Yes,,,
55952,complaint-public-v2,complaint,2923855,0.0,,110XX,2923855,Problem caused by your funds being low,2018-06-01T12:00:00-05:00,NY,...,Closed with monetary relief,JPMORGAN CHASE & CO.,Web,2018-06-01T12:00:00-05:00,,Checking account,Yes,From XX/XX/XXXX through XX/XX/XXXX Chase Bank ...,Non-sufficient funds and associated fees,Consent provided
6597,complaint-public-v2,complaint,1642171,0.0,,923XX,1642171,Deposits and withdrawals,2015-11-05T12:00:00-05:00,CA,...,Closed with explanation,JPMORGAN CHASE & CO.,Web,2015-11-05T12:00:00-05:00,,Other bank product/service,Yes,XX/XX/XXXX I mailed out a check for {$890.00} ...,,Consent provided
55707,complaint-public-v2,complaint,3041680,0.0,,26651,3041680,Fees or interest,2018-10-09T12:00:00-05:00,WV,...,Closed with explanation,JPMORGAN CHASE & CO.,Referral,2018-10-10T12:00:00-05:00,,General-purpose credit card or charge card,Yes,,Problem with fees,
8277,complaint-public-v2,complaint,3295264,0.0,,78213,3295264,Closing an account,2019-07-01T12:00:00-05:00,TX,...,Closed with explanation,JPMORGAN CHASE & CO.,Referral,2019-07-03T12:00:00-05:00,,Checking account,Yes,,Company closed your account,


In [6]:
# (rows, columns) of the flattened raw frame.
df_tickets.shape

(78313, 22)

In [7]:
# Build the working frame from the raw tickets in one chain: keep the complaint
# text and product fields, derive the label, then drop rows that cannot be used.
df = (
    df_tickets[
        ["_source.complaint_what_happened", "_source.product", "_source.sub_product"]
    ]
    .rename(
        columns={
            "_source.complaint_what_happened": "complaint_what_happened",
            "_source.product": "category",
            "_source.sub_product": "sub_product",
        }
    )
    .assign(
        # Label combines the category and the sub-product.
        ticket_classification=lambda frame: (
            frame["category"] + " + " + frame["sub_product"]
        ),
        # Empty complaint texts become NaN so the dropna() below removes them.
        complaint_what_happened=lambda frame: frame[
            "complaint_what_happened"
        ].replace("", np.nan),
    )
    .drop(["sub_product", "category"], axis=1)
    .dropna(subset=["complaint_what_happened", "ticket_classification"])
    .reset_index(drop=True)
)
logger.info("Data successfully transformed")
df.shape

INFO:__main__:Data successfully transformed


(18963, 2)

In [8]:
# Reproducible peek at five rows of the cleaned frame (fixed random_state).
df.sample(5, random_state=42)

Unnamed: 0,complaint_what_happened,ticket_classification
7459,I have an auto loan with Chase Auto Finance. \...,Vehicle loan or lease + Loan
8700,JP Morgan Chase Bank rejected refinance of cur...,Mortgage + Conventional adjustable mortgage (ARM)
9084,I have been a member with Chase bank almost a ...,Checking or savings account + Other banking pr...
7858,"On XX/XX/2020, I contacted chase regarding a r...",Mortgage + Conventional home mortgage
1328,At the end of XXXX XXXX I called customer serv...,Credit card or prepaid card + General-purpose ...


In [9]:
# The complaint narratives are the text fed to the processing steps below.
column_to_process = df["complaint_what_happened"]
# Bare last expression: displays the Series as the cell output.
column_to_process

0        Good morning my name is XXXX XXXX and I apprec...
1        I upgraded my XXXX XXXX card in XX/XX/2018 and...
2        Chase Card was reported on XX/XX/2019. However...
3        On XX/XX/2018, while trying to book a XXXX  XX...
4        my grand son give me check for {$1600.00} i de...
                               ...                        
18958    My husband passed away. Chase bank put check o...
18959    After being a Chase Card customer for well ove...
18960    On Wednesday, XX/XX/XXXX I called Chas, my XXX...
18961    I am not familiar with XXXX pay and did not un...
18962    I have had flawless credit for 30 yrs. I've ha...
Name: complaint_what_happened, Length: 18963, dtype: object

In [16]:
def tokenize(text: str):
    """Lower-case ``text`` and split it into word tokens.

    Args:
        text: Raw text to tokenize.

    Returns:
        The tokens returned by ``word_tokenize`` for the lower-cased text,
        using the English tokenizer.
    """
    lowered = text.lower()
    return word_tokenize(lowered, language="english")

def remove_stopwords(tokens: list):
    """Drop every token whose lower-cased form is in the global ``stop_words``.

    Args:
        tokens: Tokens to filter.

    Returns:
        A new list with the surviving tokens, in their original order and case.
    """
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

def lemmatize_stemmer(tokens: list):
    """Stem each token with the module-level ``stemmer``.

    Despite the function's name, this applies ``stemmer.stem`` to every
    token; no lemmatizer is involved.

    Args:
        tokens: Tokens to stem.

    Returns:
        A new list with the stemmed form of each token, in order.
    """
    return list(map(stemmer.stem, tokens))

def lemmatize(tokens: list):
    """Lemmatize each token with the module-level ``lemmatizer``.

    Args:
        tokens: Tokens to lemmatize.

    Returns:
        A new list holding ``lemmatizer.lemmatize(token)`` for each token,
        in order.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def pos_tagging(tokens: list, keep_tags=("NN",)):
    """Keep only the tokens carrying selected POS tags and join them with spaces.

    Args:
        tokens: Tokens to tag with ``pos_tag``.
        keep_tags: POS tag (a single string) or iterable of POS tags to
            retain. Defaults to ``("NN",)``, the value that used to be
            hard-coded, so existing callers get the same result.

    Returns:
        A single space-separated string of the retained tokens, in order.
    """
    # A bare string would otherwise be split into characters by frozenset().
    if isinstance(keep_tags, str):
        keep_tags = (keep_tags,)
    allowed = frozenset(keep_tags)
    tagged = pos_tag(tokens)
    return " ".join(word for word, tag in tagged if tag in allowed)

In [11]:
# Step 1: tokenize every complaint; the bare name below displays the result.
tokenized_text = column_to_process.apply(tokenize)
tokenized_text

0        [good, morning, my, name, is, xxxx, xxxx, and,...
1        [i, upgraded, my, xxxx, xxxx, card, in, xx/xx/...
2        [chase, card, was, reported, on, xx/xx/2019, ....
3        [on, xx/xx/2018, ,, while, trying, to, book, a...
4        [my, grand, son, give, me, check, for, {, $, 1...
                               ...                        
18958    [my, husband, passed, away, ., chase, bank, pu...
18959    [after, being, a, chase, card, customer, for, ...
18960    [on, wednesday, ,, xx/xx/xxxx, i, called, chas...
18961    [i, am, not, familiar, with, xxxx, pay, and, d...
18962    [i, have, had, flawless, credit, for, 30, yrs,...
Name: complaint_what_happened, Length: 18963, dtype: object

In [12]:
# Step 2: drop stop words from each token list; the bare name displays the result.
text_without_stopwords = tokenized_text.apply(remove_stopwords)
text_without_stopwords

0        [good, morning, name, xxxx, xxxx, appreciate, ...
1        [upgraded, xxxx, xxxx, card, xx/xx/2018, told,...
2        [chase, card, reported, xx/xx/2019, ., however...
3        [xx/xx/2018, ,, trying, book, xxxx, xxxx, tick...
4        [grand, son, give, check, {, $, 1600.00, }, de...
                               ...                        
18958    [husband, passed, away, ., chase, bank, put, c...
18959    [chase, card, customer, well, decade, ,, offer...
18960    [wednesday, ,, xx/xx/xxxx, called, chas, ,, xx...
18961    [familiar, xxxx, pay, understand, great, risk,...
18962    [flawless, credit, 30, yrs, ., 've, chase, cre...
Name: complaint_what_happened, Length: 18963, dtype: object

In [13]:
# Step 3a: lemmatize the stop-word-free tokens; the bare name displays the result.
text_lemma = text_without_stopwords.apply(lemmatize)
text_lemma

0        [good, morning, name, xxxx, xxxx, appreciate, ...
1        [upgraded, xxxx, xxxx, card, xx/xx/2018, told,...
2        [chase, card, reported, xx/xx/2019, ., however...
3        [xx/xx/2018, ,, trying, book, xxxx, xxxx, tick...
4        [grand, son, give, check, {, $, 1600.00, }, de...
                               ...                        
18958    [husband, passed, away, ., chase, bank, put, c...
18959    [chase, card, customer, well, decade, ,, offer...
18960    [wednesday, ,, xx/xx/xxxx, called, chas, ,, xx...
18961    [familiar, xxxx, pay, understand, great, risk,...
18962    [flawless, credit, 30, yr, ., 've, chase, cred...
Name: complaint_what_happened, Length: 18963, dtype: object

In [14]:
# Step 3b (alternative to lemmatizing): stem the stop-word-free tokens instead.
text_lemma_stemmer = text_without_stopwords.apply(lemmatize_stemmer)
text_lemma_stemmer

0        [good, morn, name, xxxx, xxxx, appreci, could,...
1        [upgrad, xxxx, xxxx, card, xx/xx/2018, told, a...
2        [chase, card, report, xx/xx/2019, ., howev, ,,...
3        [xx/xx/2018, ,, tri, book, xxxx, xxxx, ticket,...
4        [grand, son, give, check, {, $, 1600.00, }, de...
                               ...                        
18958    [husband, pass, away, ., chase, bank, put, che...
18959    [chase, card, custom, well, decad, ,, offer, m...
18960    [wednesday, ,, xx/xx/xxxx, call, chas, ,, xxxx...
18961    [familiar, xxxx, pay, understand, great, risk,...
18962    [flawless, credit, 30, yrs, ., ve, chase, cred...
Name: complaint_what_happened, Length: 18963, dtype: object

In [17]:
# Step 4a: keep only the "NN"-tagged tokens of the lemmatized text, joined into one string per complaint.
pos_tagging_tokens = text_lemma.apply(pos_tagging)
pos_tagging_tokens

0        morning name appreciate chase bank cardmember ...
1        xxxx card agent date agent information order u...
2        chase card application identity consent servic...
3        xx/xx/2018 book ticket offer ticket card infor...
4        son check deposit account fund chase bank acco...
                               ...                        
18958    husband bank check hold rent car insurance nee...
18959    chase card customer decade solicitation credit...
18960    wednesday chas credit card provider claim purc...
18961    pay risk consumer chase bank app year trust de...
18962    flawless credit yr credit card chase freedom x...
Name: complaint_what_happened, Length: 18963, dtype: object

In [18]:
# Step 4b: same "NN"-only filtering, applied to the stemmed tokens for comparison.
pos_tagging_tokens_stemmer = text_lemma_stemmer.apply(pos_tagging)
pos_tagging_tokens_stemmer

0        morn name appreci chase bank cardmemb servic c...
1        card anniversari date inform order account ann...
2        card report howev submit ident consent fraudul...
3        xx/xx/2018 book ticket offer ticket appli card...
4        son check deposit account fund bank account mo...
                               ...                        
18958    husband pass bank check hold rent car insur ne...
18959    card custom solicit credit card bonus airlin h...
18960    wednesday call chas credit card provid claim p...
18961    pay risk provid consum chase bank app year tru...
18962    flawless credit yrs chase credit card chase fr...
Name: complaint_what_happened, Length: 18963, dtype: object

In [23]:
# Preview: strip "x…/" runs with a regex, then remove literal "xxxx" placeholders.
(
    pos_tagging_tokens
    .str.replace(r"x+/", "", regex=True)
    .str.replace("xxxx", "")
)

0        morning name appreciate chase bank cardmember ...
1         card agent date agent information order upgra...
2        chase card application identity consent servic...
3        2018 book ticket offer ticket card information...
4        son check deposit account fund chase bank acco...
                               ...                        
18958    husband bank check hold rent car insurance nee...
18959    chase card customer decade solicitation credit...
18960    wednesday chas credit card provider claim purc...
18961    pay risk consumer chase bank app year trust de...
18962    flawless credit yr credit card chase freedom  ...
Name: complaint_what_happened, Length: 18963, dtype: object

In [24]:
# Shape before the processed_text column is added.
df.shape

(18963, 2)

In [25]:
# Remove the masking placeholders from the noun-only text, then tidy the
# whitespace those removals leave behind (stray leading and doubled spaces).
df["processed_text"] = (
    pos_tagging_tokens
    .str.replace(r"x+/", "", regex=True)    # e.g. "xx/xx/2018" -> "2018"
    .str.replace("xxxx", "", regex=False)   # literal match, independent of the pandas default
    .str.replace(r"\s+", " ", regex=True)   # collapse runs of whitespace
    .str.strip()
)
df.shape

(18963, 3)

In [26]:
# Reproducible peek at five rows, now including processed_text (fixed random_state).
df.sample(5, random_state=42)

Unnamed: 0,complaint_what_happened,ticket_classification,processed_text
7459,I have an auto loan with Chase Auto Finance. \...,Vehicle loan or lease + Loan,auto loan auto finance instruction loan payoff...
8700,JP Morgan Chase Bank rejected refinance of cur...,Mortgage + Conventional adjustable mortgage (ARM),jp bank refinance predatory jp bank loan mortg...
9084,I have been a member with Chase bank almost a ...,Checking or savings account + Other banking pr...,member chase bank year account organization ar...
7858,"On XX/XX/2020, I contacted chase regarding a r...",Mortgage + Conventional home mortgage,2020 chase cash equity document step emailstoc...
1328,At the end of XXXX XXXX I called customer serv...,Credit card or prepaid card + General-purpose ...,end customer service fee credit card stateme...


In [29]:
# Sanity check: rows whose processed_text is missing (NaN) would be dropped here.
# NOTE(review): texts that ended up as empty strings are not NaN, so this check
# would not flag them — confirm whether such rows matter downstream.
df.dropna(subset=["processed_text"]).shape

(18963, 3)

In [35]:
# Spot-check a previously saved dataset with the same seed as the samples above.
# The path is built from PATH_DATA_PROCESSED instead of repeating the directory literal.
pd.read_csv(
    os.path.join(PATH_DATA_PROCESSED, "tickets_classification_eng_1.csv")
).sample(5, random_state=42)

Unnamed: 0,complaint_what_happened,ticket_classification,processed_text
7459,I have an auto loan with Chase Auto Finance. \...,Vehicle loan or lease + Loan,auto loan auto financ instruct loan payoff amo...
8700,JP Morgan Chase Bank rejected refinance of cur...,Mortgage + Conventional adjustable mortgage (ARM),jp bank reject predatori jp bank loan payment ...
9084,I have been a member with Chase bank almost a ...,Checking or savings account + Other banking pr...,member chase bank year account funer year amou...
7858,"On XX/XX/2020, I contacted chase regarding a r...",Mortgage + Conventional home mortgage,2020 chase regard refi cash equiti sign wait s...
1328,At the end of XXXX XXXX I called customer serv...,Credit card or prepaid card + General-purpose ...,end call custom fee credit card statement age...


In [37]:
# Spot-check the saved lemmatized dataset with the same seed as the samples above.
# The path is built from PATH_DATA_PROCESSED instead of repeating the directory literal.
pd.read_csv(
    os.path.join(PATH_DATA_PROCESSED, "tickets_classification_eng_lemmatized.csv")
).sample(5, random_state=42)

Unnamed: 0,complaint_what_happened,ticket_classification,processed_text
7459,I have an auto loan with Chase Auto Finance. \...,Vehicle loan or lease + Loan,auto loan auto finance instruction loan payoff...
8700,JP Morgan Chase Bank rejected refinance of cur...,Mortgage + Conventional adjustable mortgage (ARM),jp bank refinance predatory jp bank loan mortg...
9084,I have been a member with Chase bank almost a ...,Checking or savings account + Other banking pr...,member chase bank year account organization ar...
7858,"On XX/XX/2020, I contacted chase regarding a r...",Mortgage + Conventional home mortgage,2020 chase cash equity document step emailstoc...
1328,At the end of XXXX XXXX I called customer serv...,Credit card or prepaid card + General-purpose ...,end customer service fee credit card stateme...
