In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the raw expense transactions.
# NOTE(review): with encoding='latin1' the price column header comes out garbled
# (see the df.head() output below) — it looks like a mis-decoded currency symbol,
# so the file is presumably UTF-8; confirm. The later rename cell keys on the
# garbled header, so the two must be changed together.
df = pd.read_csv('expense_dataset.csv', encoding='latin1')

In [3]:
df.head()

Unnamed: 0,Transaction Description,Category,Price (â¹)
0,Taxi Service,Transportation,1892.45
1,Gaming Subscription,Entertainment,3040.31
2,Amazon Purchase,Shopping,3215.28
3,Bike Servicing,Utilities,3124.89
4,Gym Membership,Health & Fitness,276.82


In [4]:
df.rename(columns={'Price (â¹)': "Amount", 'Category': 'target', 'Transaction Description': 'Description'}, inplace=True)
df.head()

Unnamed: 0,Description,target,Amount
0,Taxi Service,Transportation,1892.45
1,Gaming Subscription,Entertainment,3040.31
2,Amazon Purchase,Shopping,3215.28
3,Bike Servicing,Utilities,3124.89
4,Gym Membership,Health & Fitness,276.82


In [5]:
df['target'].nunique()

15

In [6]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [7]:
# Replace the category names in 'target' with integer codes.
# NOTE: this overwrites the column in place, so re-running the cell refits the
# encoder on the integer codes and `encoder.classes_` no longer holds the
# original category names.
df['target'] = encoder.fit_transform(df['target'])

In [8]:
df.head()

Unnamed: 0,Description,target,Amount
0,Taxi Service,12,1892.45
1,Gaming Subscription,1,3040.31
2,Amazon Purchase,10,3215.28
3,Bike Servicing,14,3124.89
4,Gym Membership,5,276.82


In [9]:
df.duplicated().sum()

0

In [10]:
df['target'].value_counts()

target
14    35
1     26
3     25
12    24
4     18
10    16
5     12
0     12
13    11
11     7
6      6
2      3
7      3
8      1
9      1
Name: count, dtype: int64

In [11]:
import nltk

In [12]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\santo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# Stopwords are words that do not contribute to the meaning of a sentence
# but are needed to build it grammatically.
from nltk.corpus import stopwords
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [14]:
# punctuations 
import string 
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
# Stemming: reduce a word to its stem (the demo call below yields 'love').
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
ps.stem('Loving')

'love'

In [16]:
def transform_text(text):
    """Normalise a free-text description into a space-joined string of stems.

    Steps, in order: lowercase the text, tokenize it with
    ``nltk.word_tokenize``, keep only purely alphanumeric tokens, drop English
    stopwords, and Porter-stem whatever is left.

    Args:
        text: The raw description string.

    Returns:
        The surviving stems joined by single spaces (an empty string when no
        token survives the filters).
    """
    # Build the stopword collection once per call and as a set: evaluating
    # stopwords.words('english') for every token rebuilds the list each time
    # and makes each membership test a linear scan.
    stop_words = set(stopwords.words('english'))

    stems = []
    for token in nltk.word_tokenize(text.lower()):
        # isalnum() already rejects punctuation-only tokens, so no separate
        # string.punctuation check is required after it.
        if token.isalnum() and token not in stop_words:
            stems.append(ps.stem(token))
    return " ".join(stems)

In [17]:
transform_text("I loved the youtube lecture in machine learning. How about you ? ")

'love youtub lectur machin learn'

In [18]:
df['transformed_text'] = df['Description'].apply(transform_text)

In [19]:
df.head()

Unnamed: 0,Description,target,Amount,transformed_text
0,Taxi Service,12,1892.45,taxi servic
1,Gaming Subscription,1,3040.31,game subscript
2,Amazon Purchase,10,3215.28,amazon purchas
3,Bike Servicing,14,3124.89,bike servic
4,Gym Membership,5,276.82,gym membership


In [20]:
from wordcloud import WordCloud

In [21]:
wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')

In [22]:
# Word cloud built from all transformed descriptions whose encoded target is 1
# (the 'Entertainment' rows, per the head() outputs earlier in the notebook).
# NOTE(review): the name `spam_wc` looks like a leftover from a different
# notebook, and the generated cloud is never displayed in the cells shown here.
spam_wc = wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep=' '))

In [23]:
df['target'].unique()

array([12,  1, 10, 14,  5,  3,  2,  4, 13,  0, 11,  8,  6,  7,  9])

In [24]:
df['target'].unique()

array([12,  1, 10, 14,  5,  3,  2,  4, 13,  0, 11,  8,  6,  7,  9])

In [25]:
df[df['target']==1]['transformed_text'].tolist()

['game subscript',
 'stream servic',
 'stream servic',
 'movi ticket',
 'stream servic',
 'theater ticket',
 'concert ticket',
 'netflix subscript',
 'netflix subscript',
 'theater ticket',
 'theater ticket',
 'concert ticket',
 'stream servic',
 'game subscript',
 'game subscript',
 'stream servic',
 'netflix subscript',
 'netflix subscript',
 'movi ticket',
 'concert ticket',
 'theater ticket',
 'theater ticket',
 'spotifi premium',
 'movi ticket',
 'spotifi premium',
 'game subscript']

In [26]:
# Flat list of every token in the transformed descriptions labelled with target code 1.
Entertainment = [
    token
    for description in df.loc[df['target'] == 1, 'transformed_text']
    for token in description.split()
]

In [27]:
df[df['target']==1]['transformed_text'].tolist()

['game subscript',
 'stream servic',
 'stream servic',
 'movi ticket',
 'stream servic',
 'theater ticket',
 'concert ticket',
 'netflix subscript',
 'netflix subscript',
 'theater ticket',
 'theater ticket',
 'concert ticket',
 'stream servic',
 'game subscript',
 'game subscript',
 'stream servic',
 'netflix subscript',
 'netflix subscript',
 'movi ticket',
 'concert ticket',
 'theater ticket',
 'theater ticket',
 'spotifi premium',
 'movi ticket',
 'spotifi premium',
 'game subscript']

In [28]:
# Flat list of every token in the transformed descriptions labelled with target code 2.
Finance = [
    token
    for description in df.loc[df['target'] == 2, 'transformed_text']
    for token in description.split()
]

In [29]:
df[df['target']==3]['transformed_text'].tolist()

['starbuck coffe',
 'restaur dinner',
 'onlin groceri',
 'starbuck coffe',
 'fast food order',
 'onlin groceri',
 'mcdonald',
 'fast food order',
 'fast food order',
 'mcdonald',
 'onlin groceri',
 'onlin groceri',
 'restaur dinner',
 'mcdonald',
 'restaur dinner',
 'mcdonald',
 'starbuck coffe',
 'onlin groceri',
 'restaur dinner',
 'mcdonald',
 'restaur dinner',
 'onlin groceri',
 'mcdonald',
 'restaur dinner',
 'restaur dinner']

In [30]:
# Flat list of every token in the transformed descriptions labelled with target code 3.
Food_Drinks = [
    token
    for description in df.loc[df['target'] == 3, 'transformed_text']
    for token in description.split()
]

In [31]:
# Flat list of every token in the transformed descriptions labelled with target code 4.
Health = [
    token
    for description in df.loc[df['target'] == 4, 'transformed_text']
    for token in description.split()
]

In [32]:
# Flat list of every token in the transformed descriptions labelled with target code 5.
Health_Fitness = [
    token
    for description in df.loc[df['target'] == 5, 'transformed_text']
    for token in description.split()
]

In [33]:
# Flat list of every token in the transformed descriptions labelled with target code 6.
Housing = [
    token
    for description in df.loc[df['target'] == 6, 'transformed_text']
    for token in description.split()
]

In [34]:
# Flat list of every token in the transformed descriptions labelled with target code 7.
Insurance = [
    token
    for description in df.loc[df['target'] == 7, 'transformed_text']
    for token in description.split()
]

In [35]:
# Flat list of every token in the transformed descriptions labelled with target code 8.
Lifestyle = [
    token
    for description in df.loc[df['target'] == 8, 'transformed_text']
    for token in description.split()
]

In [36]:
# Flat list of every token in the transformed descriptions labelled with target code 9.
Loans = [
    token
    for description in df.loc[df['target'] == 9, 'transformed_text']
    for token in description.split()
]

In [37]:
# Flat list of every token in the transformed descriptions labelled with target code 10.
Shopping = [
    token
    for description in df.loc[df['target'] == 10, 'transformed_text']
    for token in description.split()
]

In [38]:
df[df['target']==10]['transformed_text'].tolist()

['amazon purchas',
 'home applianc purchas',
 'cloth store',
 'home applianc purchas',
 'pet food purchas',
 'pet food purchas',
 'amazon purchas',
 'pet food purchas',
 'electron store',
 'cloth store',
 'pet food purchas',
 'home applianc purchas',
 'cloth store',
 'amazon purchas',
 'home applianc purchas',
 'home applianc purchas']

In [39]:
# Flat list of every token in the transformed descriptions labelled with target code 11.
Technology = [
    token
    for description in df.loc[df['target'] == 11, 'transformed_text']
    for token in description.split()
]

In [40]:
# Flat list of every token in the transformed descriptions labelled with target code 12.
Transportation = [
    token
    for description in df.loc[df['target'] == 12, 'transformed_text']
    for token in description.split()
]

In [41]:
# Flat list of every token in the transformed descriptions labelled with target code 13.
Travel = [
    token
    for description in df.loc[df['target'] == 13, 'transformed_text']
    for token in description.split()
]

In [42]:
# Flat list of every token in the transformed descriptions labelled with target code 14.
Utilities = [
    token
    for description in df.loc[df['target'] == 14, 'transformed_text']
    for token in description.split()
]

In [43]:
# converting the text into number as vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)

In [44]:
# Feature matrix: token counts from the CountVectorizer, converted to a dense array.
# The TF-IDF alternative stays commented out, so `tfidf` is not fitted in the cells shown here.
x = cv.fit_transform(df['transformed_text']).toarray()
# x = tfidf.fit_transform(df['transformed_text']).toarray()

In [45]:
y = df['target'].values

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
xtrain,xtest,ytrain,ytest =  train_test_split(x,y,test_size=0.2, random_state=2)

In [48]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix, precision_score 

In [49]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [50]:
# Gaussian naive Bayes: fit on the training split, predict the held-out split,
# then print the accuracy followed by the confusion matrix.
ypred1 = gnb.fit(xtrain, ytrain).predict(xtest)
for metric in (accuracy_score, confusion_matrix):
    print(metric(ytest, ypred1))
# precision_score is intentionally not printed here.

1.0
[[4 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 5 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 5 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 6 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 2 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 6 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 4]]


In [51]:
# Multinomial naive Bayes: fit on the training split, predict the held-out split,
# then print the accuracy followed by the confusion matrix.
ypred2 = mnb.fit(xtrain, ytrain).predict(xtest)
for metric in (accuracy_score, confusion_matrix):
    print(metric(ytest, ypred2))
# precision_score is intentionally not printed here.

0.9
[[3 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 5 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 5 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 2]
 [0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 6 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 2 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 6 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 4]]


In [52]:
# Bernoulli naive Bayes: fit on the training split, predict the held-out split,
# then print the accuracy followed by the confusion matrix.
ypred3 = bnb.fit(xtrain, ytrain).predict(xtest)
for metric in (accuracy_score, confusion_matrix):
    print(metric(ytest, ypred3))
# precision_score is intentionally not printed here.

0.65
[[0 0 0 1 0 0 0 0 0 0 0 0 3]
 [0 5 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 5 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 2]
 [0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 4 0 0 0 2]
 [0 1 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 6 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 4]]


In [53]:
# We choose CountVectorizer --> MNB: the feature matrix was built with `cv`
# (the TF-IDF line is commented out), so `cv` is the vectorizer that must be
# shipped alongside the MultinomialNB model.
import pickle

# Use context managers so both files are flushed and closed deterministically.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)

In [56]:
import joblib  # For saving models

# Save the trained model
joblib.dump(mnb, 'expense_categorization_model.pkl')

# Save the fitted vectorizer. This must be `cv` (the CountVectorizer that
# produced the training features), not the model: loading a classifier from
# the vectorizer file would leave inference with no way to vectorize text.
joblib.dump(cv, 'vectorizer_categorization_model.pkl')


['vectorizer_categorization_model.pkl']