# Import Libraries and Data

In [2]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# import data
# Load the files
with open('./pkl_data/bristol_spending_data.pkl', 'rb') as pickle_file:
    df = pickle.load(pickle_file)    
with open('./pkl_data/bristol_spending_data_daily.pkl', 'rb') as pickle_file:
    df_daily = pickle.load(pickle_file)
with open('./pkl_data/bristol_spending_data_monthly.pkl', 'rb') as pickle_file:
    df_monthly = pickle.load(pickle_file)

# NLP
Lets make a NLP model to see if we can find trends in the supplier names and descriptions

## Tokenizing the supplier names

In [5]:
# set parameters of tokenizer
NUMWORDS = 10000
OOV_TOK = "<OOV>"
tokenizer = Tokenizer(num_words=NUMWORDS, oov_token=OOV_TOK)

### Coverting data to strings in order to tokenize
When I was trying to tokenize the description columns I was getting an error as some of the values were floats instead of strings. We're trying to tokenize this data so we need to convert it first.

Lets check what sort of data we have in these columns.

In [6]:
# define a function to count the data types
def count_datatypes(val):
    if isinstance(val, float):
        return 'float'
    elif isinstance(val, int):
        return 'int'
    elif isinstance(val, str):
        return 'str'
    else:
        return 'other'

# The column we want to look at
column = 'Description 1'

# apply the function to each element of the column.
datatype_counts = df[column].apply(count_datatypes)

# count the occurrences of each data type
float_count = (datatype_counts == 'float').sum().sum()
int_count = (datatype_counts == 'int').sum().sum()
str_count = (datatype_counts == 'str').sum().sum()

print(f'Data types in the {column} column:\n')
print('Floats: ', float_count)
print('Ints: ', int_count)
print('Strings: ', str_count)

# print out the rows where we are getting floats
float_rows = df[df[column].apply(lambda x: isinstance(x, float))]

print('\n\nThe rows containing float values:')
print(float_rows)

Data types in the Description 1 column:

Floats:  8
Ints:  0
Strings:  1022705


The rows containing float values:
                                        Supplier    Amount   Pay Date  \
60235                     Creative Youth Network   1081.99 2022-04-25   
163006                      OPUS Claim Solutions   1331.02 2020-11-11   
166575  Arthur J.Gallagher Insurance Brokers Ltd  17300.00 2020-10-28   
167118  Arthur J.Gallagher Insurance Brokers Ltd  55208.16 2020-10-23   
171961                               Wansbroughs   4422.00 2020-09-29   
171962                               Wansbroughs  -4422.00 2020-09-29   
172129                      OPUS Claim Solutions    474.20 2020-09-28   
172359                      OPUS Claim Solutions    455.84 2020-09-25   

       Description 1                       Description 2 Description 3  
60235            NaN                                 NaN                
163006           NaN  Insurance Fund - holding account 1                
166575  

From this investigaton we can see the floats are related to data without a description (NaN values). So we can convert these to strings so that we can tokenize everything. We'll use the fill NaN function from pandas to do this.

In [7]:
# we want to tokenize the supplier data and descriptions 1-3
# create a list of the data we want to tokenize

column_list = ['Supplier', 'Description 1', 'Description 2', 'Description 3']
tokenized_data = {}

# convert each column to a numpy array, tokenize andd then store
for item in column_list:
    
    # fill NaN values with a string
    df[item] = df[item].fillna('NaN')
    
    # convert each column to a numpy array
    sentences = df[item].to_numpy()

    # tokenize the supplier data
    tokenizer.fit_on_texts(sentences)

    # create sequences using the new word index
    sequences = tokenizer.texts_to_sequences(sentences)
    tokenized_data[item] = pad_sequences(sequences, padding='post')

    # check out the most common words
    # word_index is sorted with the most common words appearing first
    print(f'Word index of the {item} column in descending order of frequency:\n{tokenizer.word_index}\n')

    # we can also see all the word counts using this
    print(f'Word counts in the {item} column:\n{tokenizer.word_counts}\n')

    # check out the max length of the supplier names (remeber: we have already padded the data to fit it to the max length)
    print(f'Max length in the {item} column:\n{len(tokenized_data[item][0])}\n\n')


Word index of the Supplier column in descending order of frequency:
{'<OOV>': 1, 'ltd': 2, 'a': 3, 't': 4, 'care': 5, 'limited': 6, 'bristol': 7, 'services': 8, 'trust': 9, 'the': 10, 'plc': 11, 'housing': 12, 'redacted': 13, 'staffing': 14, 'guidant': 15, 'carlisle': 16, 'uk': 17, 'global': 18, 'randstad': 19, 'homes': 20, 'group': 21, 'family': 22, 'maples': 23, 'brandon': 24, 'home': 25, 'plumbing': 26, 'connolly': 27, 'callaghan': 28, 'house': 29, 'community': 30, 'healthcare': 31, 'and': 32, 'contractors': 33, '21': 34, 'properties': 35, 'hudson': 36, 'west': 37, 'milestones': 38, 'st': 39, 'rental': 40, 'school': 41, 'south': 42, 'alex': 43, 'fry': 44, 'for': 45, 'fostering': 46, 'heating': 47, 'property': 48, 'm': 49, 'solutions': 50, 'holdings': 51, 'step': 52, 'glevum': 53, 'mimosa': 54, 'second': 55, 'of': 56, 'support': 57, 'residential': 58, 'e': 59, 'maintenance': 60, 'centre': 61, 'health': 62, 'nursery': 63, 'somerset': 64, 'north': 65, 's': 66, 'national': 67, 'associat

# Unsupervised learning
We want to try and classify the transactions (e.g. education, social care, housing). We can use unsupervised learning to first see if there are any trends in the data. Otherwise a semi-supervised learning model might be the way to go and I can manually label some of the transactions and use those labels to build a semi-supervised model.

In [8]:
# column to fit model to
column = 'Description 1'
data = tokenized_data[column]

kmeans = KMeans(n_clusters=6, random_state=42)
kmeans.fit(data)



In [9]:
# explore the clusters
print(kmeans.cluster_centers_)
print(kmeans.labels_)

[[ 3.09429012e+01  4.32585651e+01  2.98444729e+01  2.61038698e+01
   2.04370304e+01  1.06888388e+01  4.83178530e+00  3.49197090e+00
   4.14165703e+00  3.41889206e+00  1.70556747e+00  9.06399321e-01]
 [ 8.05377130e+03  3.39668696e+02  1.62396522e+02  5.98860870e+01
   2.40991304e+01  5.05495652e+01  2.34782609e-01 -1.55431223e-14
  -7.99360578e-15 -2.66453526e-15 -1.86517468e-14 -5.55111512e-16]
 [ 4.48958012e+02  4.39585768e+02  2.61361096e+02  1.38167509e+02
   4.26384148e+01  1.51924968e+01  5.09178701e-01  6.47760397e-01
   3.75870586e-01  4.96895930e-01  1.02140518e-14 -3.00870440e-14]
 [ 2.80100915e+03  6.09411170e+02  3.29503611e+02  1.09322870e+03
   6.82941743e+02  2.39075590e+02  1.53221955e+02  9.31805489e+01
   5.04039480e+01  9.62927299e-04 -1.44328993e-14  1.22124533e-15]
 [ 1.26541765e+03  9.09970588e+02  4.33381176e+03  7.27864706e+02
   3.55326471e+02  1.51335294e+02  5.34632353e+01  3.85044118e+01
   2.76176471e+01  2.26470588e-01 -1.04360964e-14 -5.21804822e-15]
 [ 9.

In [10]:
# trying to compute silhouette score takes a long time.

# silhouette_score = silhouette_score(tokenized_data['Supplier'], kmeans.labels_)
# print(f"Silhouette score: {silhouette_score:.3f}")

labels = kmeans.labels_
for i in range(len(set(labels))):
    print(f"Cluster {i}:")
    # Print the transaction descriptions in each cluster
    descriptions = df[column][labels == i]
    print(descriptions)
    print()

Cluster 0:
0                     Services - Fees and Charges
1                     Services - Fees and Charges
2          R&M - Departmental response (external)
3          R&M - Departmental response (external)
4           R&M - Departmental planned (external)
                            ...                  
1022707             PAYMENTS TO VOLUNTARY ASSOCNS
1022708             PAYMENTS TO VOLUNTARY ASSOCNS
1022709                              FEES PAYABLE
1022710          EQUIPMENT, FURNITURE & MATERIALS
1022711        C Y P S ONLY - TOPPING UP PAYMENTS
Name: Description 1, Length: 972525, dtype: object

Cluster 1:
5949                      Balances brought forward
60235                                          NaN
64409      TTP - HRA contributions to general fund
83745      TTP - HRA contributions to general fund
83941                           Members Allowances
                            ...                   
1022406                            S2BS5N117170939
1022446            

# Semi Supervised Learning - BERT

I maually labelled some of the transactions in the data with categories that I obtained from the council's budget. The plan was to then use these labels to train the algorithm to label the unlabelled data.

In [11]:
#for downloading BERT
# pip install sentence_transformers
from sentence_transformers import SentenceTransformer

#for finding most similar text vectors
from sklearn.metrics.pairwise import cosine_similarity

#regular expressoin toolkit
import re

#NLP toolkits
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

#for plotting expense categories later
import matplotlib.pyplot as plt 
plt.style.use('ggplot')
import seaborn as sns
import matplotlib
import matplotlib.ticker as ticker # for formatting major units on x-y axis

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samuelspeller/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [1]:
# import the semi-labelled data

df = pd.read_csv('./spending_data/bristol_spending_data_final_semi_labeled.csv')

# set non numeric values to n/a with errors=coerce
df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')

# convert the date to a datetime object
df['Pay Date']= pd.to_datetime(df['Pay Date'], format='%d/%m/%Y')

# check the df
print(df.info())

# Write to pickle pickle file
with open('./pkl_data/bristol_spending_data_semi_labelled.pkl', 'wb') as pickle_file:
    pickle.dump(df, pickle_file)

NameError: name 'pd' is not defined

## Download pre-trained BERT model.

In [14]:
# This may take some time to download and run 
# depending on the size of the input

bert_input = tokenized_data['Description 2'].tolist()
model = SentenceTransformer('paraphrase-mpnet-base-v2') 
embeddings = model.encode(bert_input, show_progress_bar = True)
embedding_BERT = np.array(embeddings)



KeyboardInterrupt: 

