In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import DBSCAN
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import wordnet
import plotly.express as px

In [2]:
# Data Clean-up
df = pd.read_csv('../csv/overview.csv')
df['unrecognised_msgs'] = df['unrecognised_msgs'].str.replace('[', '').str.replace(']', '')
msg_list_v1 = []
for msg in df['unrecognised_msgs']:
    msg_list_v1.append(msg.split(','))
msg_list_v2 = []
for msg in msg_list_v1:
    for x in msg:
        msg_list_v2.append(x.replace('"', '').strip())
msg_df = pd.DataFrame(msg_list_v2, columns=['unrecognised_msgs'])
msg_df = msg_df.dropna()
msg_df.to_csv('unrecognised_msgs.csv', index=False)

In [15]:
# Pre-processing Unrecognised Messages, Stopword removal and Lemmatization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

msg_df['Preprocessed Messages'] = msg_df['unrecognised_msgs'].apply(preprocess_text)

In [4]:
# Extract Trigrams from Pre-processed messages
vectorizer = CountVectorizer(ngram_range = (3,3))
X = vectorizer.fit_transform(msg_df['Preprocessed Messages'])
features = (vectorizer.get_feature_names_out())
print("\n\nFeatures : \n", features)



Features : 
 ['account branch nearby' 'account family member'
 'account option available' 'account statement generated'
 'account type suitable' 'amount brokerage accounts'
 'amount small business' 'app available android' 'apply credit card'
 'apply small business' 'applying credit card' 'approved credit card'
 'assist changing contact' 'assist loan application'
 'assistance foreign currency' 'assistance redeeming reward'
 'associated home equity' 'available android ios'
 'balance requirement new' 'bank holiday schedule' 'banking app password'
 'branch nearby visit' 'business loan applications' 'call claiming bank'
 'card lost stolen' 'card option available' 'card statement clarify'
 'change account pin' 'change online banking' 'change pin online'
 'changing address bank' 'changing contact information'
 'checkbook branch nearby' 'claiming bank legitimate'
 'closing cost associated' 'cost associated home' 'cost stop payment'
 'credit card applications' 'credit card limit' 'credit card

In [16]:
# Applying TFIDF
vectorizer = TfidfVectorizer(ngram_range = (3,3))
X2 = vectorizer.fit_transform(msg_df['Preprocessed Messages'])
scores = (X2.toarray())
print("\n\nScores : \n", scores)



Scores : 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [17]:
# Getting top ranking features
sums = X2.sum(axis = 0)
data1 = []
for col, term in enumerate(features):
    data1.append( (term, sums[0,col] ))
ranking = pd.DataFrame(data1, columns = ['term','rank'])
words = (ranking.sort_values('rank', ascending = False))
print ("\n\nWords head : \n", words.head(5))



Words head : 
                                 term      rank
21              cutoff time same day  2.884327
66  need assistance foreign currency  2.828427
12    change online banking password  2.121320
31         help stop payment request  2.121320
98     request credit limit increase  1.732051


In [10]:
# Extract Trigrams from Pre-processed messages
vectorizer = CountVectorizer(ngram_range = (4,4))
X = vectorizer.fit_transform(msg_df['Preprocessed Messages'])
features = (vectorizer.get_feature_names_out())
print("\n\nFeatures : \n", features)



Features : 
 ['account branch nearby visit' 'account type suitable savings'
 'amount small business loans' 'app available android ios'
 'apply small business loan' 'assist changing contact information'
 'assistance foreign currency exchange'
 'assistance redeeming reward points' 'associated home equity loans'
 'balance requirement new accounts' 'call claiming bank legitimate'
 'card statement clarify charges' 'change online banking password'
 'checkbook branch nearby visit' 'closing cost associated home'
 'cost associated home equity' 'credit card option available'
 'credit card statement clarify' 'credit limit credit cards'
 'credit score personal loan' 'criterion credit limit increases'
 'cutoff time same day' 'daily transfer limit wire'
 'document required account opening' 'documentation needed update address'
 'electronic statement instead paper' 'expedite replacement lost card'
 'experiencing issue online banking' 'fee changing address bank'
 'get approved credit card' 'help cre

In [11]:
# Applying TFIDF
vectorizer = TfidfVectorizer(ngram_range = (4,4))
X2 = vectorizer.fit_transform(msg_df['Preprocessed Messages'])
scores = (X2.toarray())
print("\n\nScores : \n", scores)



Scores : 
 [[0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.5 ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]]


In [13]:
# Getting top ranking features
sums = X2.sum(axis = 0)
data1 = []
for col, term in enumerate(features):
    data1.append( (term, sums[0,col] ))
ranking = pd.DataFrame(data1, columns = ['term','rank'])
words = (ranking.sort_values('rank', ascending = False))
print ("\n\nWords head : \n", words.head(5))



Words head : 
                                     term      rank
108             suspect fraud account do  4.000000
5    assist changing contact information  3.000000
98         request credit limit increase  2.121320
48             like request credit limit  2.121320
12        change online banking password  2.024862


In [45]:
df = pd.read_csv('../csv/overview.csv')
# Data Clean-up
df['unrecognised_msgs'] = df['unrecognised_msgs'].str.replace('[', '').str.replace(']', '')
msg_list_v1 = []
for msg in df['unrecognised_msgs']:
    msg_list_v1.append(msg.split(','))
msg_list_v2 = []
for msg in msg_list_v1:
    for x in msg:
        msg_list_v2.append(x.replace('"', '').strip())
msg_df = pd.DataFrame(msg_list_v2, columns=['unrecognised_msgs'])
msg_df = msg_df.dropna()

# Pre-processing Unrecognised Messages, Stopword removal and Lemmatization
stop_words = set(stopwords.words('english'))
stop_words.add('like')
stop_words.add('do')
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

msg_df['Preprocessed Messages'] = msg_df['unrecognised_msgs'].apply(preprocess_text)

popular_topic_list = []

# Extract Trigrams from Pre-processed messages
count_vectorizer_trigram = CountVectorizer(ngram_range = (3,3))
X1_trigram = count_vectorizer_trigram.fit_transform(msg_df['Preprocessed Messages'])
features_trigram = (count_vectorizer_trigram.get_feature_names_out())

# Applying TFIDF for Trigrams
tf_vectorizer_trigram = TfidfVectorizer(ngram_range = (3,3))
X2_trigram = tf_vectorizer_trigram.fit_transform(msg_df['Preprocessed Messages'])
scores = (X2_trigram.toarray())

# Getting top 5 Trigrams
sums_trigram = X2_trigram.sum(axis = 0)
data1_trigram = []
for col, term in enumerate(features_trigram):
    data1_trigram.append( (term, sums_trigram[0,col] ))
trigram_ranking = pd.DataFrame(data1_trigram, columns = ['term','rank'])
trigram_words = (trigram_ranking.sort_values('rank', ascending = False))
for x in trigram_words['term'].head(5):
    if 'do' not in x:
        popular_topic_list.append(x)

['open new account', 'branch nearby visit', 'suspect fraud account', 'credit limit increase']
