In [214]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import DBSCAN
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import wordnet
import plotly.express as px

In [215]:
# Data Clean-up
df = pd.read_csv('../csv/overview.csv')
df['unrecognised_msgs'] = df['unrecognised_msgs'].str.replace('[', '').str.replace(']', '')
msg_list_v1 = []
for msg in df['unrecognised_msgs']:
    msg_list_v1.append(msg.split(','))
msg_list_v2 = []
for msg in msg_list_v1:
    for x in msg:
        msg_list_v2.append(x.replace('"', '').strip())
msg_df = pd.DataFrame(msg_list_v2, columns=['unrecognised_msgs'])
msg_df = msg_df.dropna()
msg_df.to_csv('unrecognised_msgs.csv', index=False)

In [209]:
# Pre-processing Unrecognised Messages, Stopword removal and Lemmatization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

msg_df['Preprocessed Messages'] = msg_df['unrecognised_msgs'].apply(preprocess_text)

In [213]:
# Extract Trigrams from Pre-processed messages
vectorizer = CountVectorizer(ngram_range = (3,3))
X = vectorizer.fit_transform(msg_df['Preprocessed Messages'])
features = (vectorizer.get_feature_names_out())
print("\n\nFeatures : \n", features)



Features : 
 ['account branch nearby' 'account family member'
 'account option available' 'account statement generated'
 'account type suitable' 'amount brokerage accounts'
 'amount small business' 'app available android' 'apply credit card'
 'apply small business' 'applying credit card' 'approved credit card'
 'assist changing contact' 'assist loan application'
 'assistance foreign currency' 'assistance redeeming reward'
 'associated home equity' 'available android ios'
 'balance requirement new' 'bank holiday schedule' 'banking app password'
 'branch nearby visit' 'business loan applications' 'call claiming bank'
 'card lost stolen' 'card option available' 'card statement clarify'
 'change account pin' 'change online banking' 'change pin online'
 'changing address bank' 'changing contact information'
 'checkbook branch nearby' 'claiming bank legitimate'
 'closing cost associated' 'cost associated home' 'cost stop payment'
 'credit card applications' 'credit card limit' 'credit card

In [211]:
# Applying TFIDF
vectorizer = TfidfVectorizer(ngram_range = (3,3))
X2 = vectorizer.fit_transform(msg_df['Preprocessed Messages'])
scores = (X2.toarray())
print("\n\nScores : \n", scores)



Scores : 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [212]:
# Getting top ranking features
sums = X2.sum(axis = 0)
data1 = []
for col, term in enumerate(features):
    data1.append( (term, sums[0,col] ))
ranking = pd.DataFrame(data1, columns = ['term','rank'])
words = (ranking.sort_values('rank', ascending = False))
print ("\n\nWords head : \n", words.head(10))



Words head : 
                              term      rank
142              open new account  3.000000
21            branch nearby visit  2.884327
66               fraud account do  2.828427
198         suspect fraud account  2.828427
12        assist changing contact  2.121320
31   changing contact information  2.121320
162  question recent transactions  2.000000
174          request credit limit  1.732051
98            like request credit  1.732051
45          credit limit increase  1.732051
