Experimenting with river library:
- create an incremental learning model
- data is 616 tuples of (bank_statement_line, vendor)
- split data
- train one at a time: model.learn_one(sentence,label)
- predict one from a bank_statement_line aka sentence
- save & load model every 100 items to simulate real usage
- see real predictions

In [1]:
import pickle
import re
import random
from pprint import pprint
import river
from river.naive_bayes import MultinomialNB
from river.feature_extraction import BagOfWords,TFIDF
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#dir(river)

In [3]:
print("hello world")

hello world


In [4]:
def split_data( d ):  # remove 10 items(test_data) from the last ~200 items (only test last 2 months)
    test = []
    indexes = random.sample( range(  400, len(d)-11   ), 10) # get 10 random indexes between 400 & 600...
    # test only on the last 2 months(data was collected from 01/2021 to 06/2021) Every month have roughly 100 items. Total items ~= 616
    print('data lenght == ',len(d))
    print('random indexes used for test data ==>', indexes)
    for i in indexes:
        r = d.pop(i)
        test.append(r)
    return d, test

In [5]:
def save_model(m):
    with open('saved_model.pkl', 'wb') as f:
        pickle.dump(m, f)
    print('==> model was saved')

def load_model():
    with open('saved_model.pkl', 'rb') as f:
        m = pickle.load(f)
    print('==> model was loaded')    
    return m

In [6]:
df = pd.read_csv('model-data.csv')
d = list( df.to_records(index=False) ) # river accepts a list of tuples
data, test_data = split_data(d)
#print(data[:5])
#print(test_data)

data lenght ==  616
random indexes used for test data ==> [567, 591, 492, 450, 519, 563, 576, 530, 460, 420]


In [7]:
model = river.compose.Pipeline(
    ('vectorizer',BagOfWords(lowercase=True)), #convert text(feature) from string to a dict
    ('nb',MultinomialNB())
)

In [8]:
# pipe_nb # visualize

In [9]:
# pipe_nb.steps 

In [10]:
print('Incremental training + saving & reloading model every 100 items bc thats how is going to be used'); print()
metric = river.metrics.Accuracy()
save_interval = 100
load = False

for i, (sentence,label) in enumerate(data):
    if load == True:
        model = load_model()
        load = False
    pred_before = model.predict_one(sentence)
    metric = metric.update(label, pred_before)
    model = model.learn_one(sentence,label)
    if i == save_interval:
        save_model(model)
        load = True
        print('amount of classes after '+str(save_interval)+' items == ', len(model.predict_proba_one(sentence))  )
        print('model   metric   after  '+str(save_interval)+' items ==> ' + str(metric))
        del model # sanity ck
        save_interval += 100

save_model(model)

Incremental training + saving & reloading model every 100 items bc thats how is going to be used

==> model was saved
amount of classes after 100 items ==  34
model   metric   after  100 items ==> Accuracy: 60.40%
==> model was loaded
==> model was saved
amount of classes after 200 items ==  54
model   metric   after  200 items ==> Accuracy: 63.68%
==> model was loaded
==> model was saved
amount of classes after 300 items ==  63
model   metric   after  300 items ==> Accuracy: 68.77%
==> model was loaded
==> model was saved
amount of classes after 400 items ==  78
model   metric   after  400 items ==> Accuracy: 71.07%
==> model was loaded
==> model was saved
amount of classes after 500 items ==  89
model   metric   after  500 items ==> Accuracy: 71.66%
==> model was loaded
==> model was saved
amount of classes after 600 items ==  102
model   metric   after  600 items ==> Accuracy: 70.38%
==> model was loaded
==> model was saved


In [11]:
# print('Predictions of "model" with test_data'); print()
# classes = model.predict_proba_one(sentence)
# for sentence,label in test_data:
#     p = model.predict_one( sentence )
#     print('label in classes before == ', label in classes)
#     print('Prediction, Label = '+p+', '+label)
#     if p != label:
#         print('WRONG!!! Sentence was: ', sentence)
        
#     model = model.learn_one(sentence,label)
#     classes = model.predict_proba_one(sentence)
#     print('label in classes after == ', label in classes)
#     print('===================')

In [12]:
# print('Predictions of "reloaded model" with test_data'); print()
# model_2 = load_model()
# for sentence,label in test_data:
#     p = model_2.predict_one( sentence )
#     print('Prediction, Label = '+p+', '+label)
#     if p != label:
#         print('Sentence was: ', sentence)
#         #print(pipe_nb.predict_proba_one(sentence))
#         print('=========')

In [13]:
# print('Predictions of "untrained model" with test_data'); print()

# untrained_model = river.compose.Pipeline(
#     ('vectorizer',BagOfWords(lowercase=True)), #convert text(feature) from string to a dict
#     ('nb',MultinomialNB())
# )

# for sentence,label in test_data:
#     p = untrained_model.predict_one( sentence )
#     print('Prediction, Label = ', p, label)
#     if p != label:
#         print('Sentence was: ', sentence)
#         #print(pipe_nb.predict_proba_one(sentence))
#         print('=========')

In [14]:
# not sure about *removing numbers* to improve accuracy
# bc some labels iclude a number from the lime
# data_without_numbers = []
#for d in data:
#    print('original => ', d[0])
#    sentence = re.sub(r"[0-9]+","", d[0])
#    print('corrected => ' sentence)

In [15]:
# problem w TextBlob ==> some corrections make it worse 
#from textblob import TextBlob
#corrected_data = []
#for d in data:
#    print('original => ', d[0])
#    sentence = TextBlob(d[0]).correct()
#    print('corrected => ', sentence)
#    print()