In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
import string
import re
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [2]:
df = pd.read_csv('Transaction_Naration_Data_Set.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df['AC_ID'] = pd.Categorical(df['AC_ID'])

In [3]:
df.describe(include='all')

Unnamed: 0,AC_ID,AMOUNT,DOC_DATE,NARATION,DR_CR
count,22546758.0,22546758.0,22546758,22545735,22546758
unique,2887759.0,,180,128603,2
top,1863495.0,,28-DEC-22,Cash Withdrawal From A/C No.:,Debit
freq,44975.0,,740727,2359719,13871120
mean,,20755.28,,,
std,,82347.5,,,
min,,0.01,,,
25%,,600.0,,,
50%,,1500.0,,,
75%,,7000.0,,,


In [4]:
df.head()

Unnamed: 0,AC_ID,AMOUNT,DOC_DATE,NARATION,DR_CR
0,3644877,342800.0,01-NOV-22,Cash Withdraw,Debit
1,1175256,342800.0,01-NOV-22,Cash Withdrawal From A/C No.:,Credit
2,2298226,1500.0,01-NOV-22,"Cash withdraw from Micro Merchant point, MM A/C",Debit
3,4066427,1500.0,01-NOV-22,Credit Against Merchant Cash Withdraw Service ...,Credit
4,1978564,2550.0,01-NOV-22,"Cash withdraw from Micro Merchant point, MM A/C",Debit


In [5]:
df.dtypes

AC_ID       category
AMOUNT       float64
DOC_DATE      object
NARATION      object
DR_CR         object
dtype: object

In [6]:
narration = df['NARATION']
# remove punctuation
translator = str.maketrans('', '', string.punctuation)
narration = narration.apply(lambda x: str(x).translate(translator))
# narration.drop_duplicates().to_csv('narration_light_cleaned.csv', index=False)

In [7]:
# remove non-ascii characters
narration = narration.apply(lambda x: re.sub('[^\x00-\x7F]+', ' ', str(x).strip()))
narration.drop_duplicates().to_csv('narration_non_ascii_removed.csv', index=False)

In [8]:
unique_narration = df['NARATION']
unique_narration = unique_narration.drop_duplicates()
print(len(unique_narration))
# unique_narration.to_csv('narration.csv', index=False)

128604


In [9]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 
import re
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shuaib\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shuaib\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shuaib\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
def clean_narration(x):
    x = str(x).lower().strip()
    x = re.sub('[^a-zA-Z ]', ' ', x)
    x = " ".join(x.split())
    words = word_tokenize(x)
    words = [wnl.lemmatize(w) for w in words]
    filtered_words = [w for w in words if w not in stop_words]
    return " ".join(filtered_words)

In [11]:
unique_narration = unique_narration.apply(lambda x: clean_narration(x))

In [12]:
unique_narration.to_csv('narration_cleaned.csv', index=False)

In [13]:
# vectorize the cleaned narration
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(unique_narration)
print(X.shape)

(128604, 30372)


In [14]:
num_clusters = 12
from sklearn.cluster import KMeans
km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100, n_init=1, verbose=True)
km.fit(X)

Initialization complete
Iteration 0, inertia 214654.7047582739.
Iteration 1, inertia 116919.86289339911.
Iteration 2, inertia 115683.94508849054.
Iteration 3, inertia 115177.08258791096.
Iteration 4, inertia 115085.37968125037.
Iteration 5, inertia 115042.18192299362.
Iteration 6, inertia 115037.60311105319.
Iteration 7, inertia 115037.4961278199.
Iteration 8, inertia 115037.48350295007.
Iteration 9, inertia 115037.48068617101.
Iteration 10, inertia 115037.48002047653.
Converged at iteration 10: strict convergence.


In [None]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()


In [19]:
for i in range(num_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :7]:
        print(' %s' % terms[ind])

Cluster 0:
 cheque
 ae
 withdrawal
 ad
 banking
 branch
 agent
Cluster 1:
 eftn
 transaction
 bank
 ltd
 islami
 sonali
 bangladesh
Cluster 2:
 deposit
 cash
 inter
 branch
 withdraw
 agent
 md
Cluster 3:
 january
 salary
 month
 disbursement
 title
 rent
 allowance
Cluster 4:
 transfer
 fund
 smart
 app
 regular
 cap
 dp
Cluster 5:
 sme
 jamtoil
 dal
 pu
 male
 purush
 kornushuti
Cluster 6:
 withdraw
 md
 transfer
 fund
 bazar
 loan
 islam
Cluster 7:
 fee
 ctsu
 head
 usd
 office
 fvg
 citibank
Cluster 8:
 salary
 month
 march
 credit
 november
 fvg
 february
Cluster 9:
 dol
 mohila
 polli
 unnayan
 para
 polly
 unnoyon
Cluster 10:
 bill
 cash
 net
 deposit
 month
 bidyut
 wifi
Cluster 11:
 vgd
 cash
 deposit
 payment
 agent
 inter
 fund


In [29]:
from sklearn.decomposition import PCA
import seaborn as sns
pca = PCA(n_components=2).fit(X.toarray())
data2D = pca.transform(X.toarray())
labels = km.labels_
sns.scatterplot(data2D[:,0], data2D[:,1], hue=labels, palette=sns.color_palette("hls", num_clusters))


MemoryError: Unable to allocate 29.1 GiB for an array with shape (128604, 30372) and data type float64