# Semi Supervised Learning - BERT

I maually labelled some of the transactions in the data with categories that I obtained from the council's budget. The plan was to then use these labels to train the algorithm to label the unlabelled data.

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#for downloading BERT
# pip install sentence_transformers
from sentence_transformers import SentenceTransformer

#for finding most similar text vectors
from sklearn.metrics.pairwise import cosine_similarity

#regular expressoin toolkit
import re

#NLP toolkits
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

#for plotting transaction categories later
plt.style.use('ggplot')
import seaborn as sns
import matplotlib.ticker as ticker # for formatting major units on x-y axis

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samuelspeller/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# # import the semi-labelled data and write to a pkl file

# df = pd.read_excel('./spending_data/bristol_spending_data_final_semi_labeled.xlsx')

# # set non numeric values to n/a with errors=coerce
# df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')

# # convert the date to a datetime object
# df['Pay Date']= pd.to_datetime(df['Pay Date'], format='%d/%m/%Y')

# # check the df
# print(df.info())

# # Write to pickle pickle file
# with open('./pkl_data/bristol_spending_data_semi_labelled.pkl', 'wb') as pickle_file:
#     pickle.dump(df, pickle_file)

In [3]:
# import data
# Load the semi-labelled dataset   
with open('./pkl_data/bristol_spending_data_semi_labelled.pkl', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1022713 entries, 0 to 1022712
Data columns (total 7 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   Supplier       1022713 non-null  object        
 1   Amount         1022712 non-null  float64       
 2   Pay Date       1022713 non-null  datetime64[ns]
 3   Description 1  1022704 non-null  object        
 4   Description 2  1018731 non-null  object        
 5   Description 3  834026 non-null   object        
 6   label          45276 non-null    float64       
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 54.6+ MB


## Split into labelled and unlabelled data

In [4]:
# lets create a dictionary containing the labels with descriptions
labels = {1:'Education, Learning and Skills Improvement',
          2:'Safeguarding vulnerable adults and children',
          3:'Social care and support for adults including the elderly',
          4:'Support for voluntary groups',
          5:'Public Health',
          6:'pupil premium (education)',
          7:'Museums and Culture',
          8:'Property',
          9:'Community Services Parks and open spaces',
          10:'Housing and Landlord Services',
          11:'early years education',
          12:'SEN'
          }

In [5]:
# we want to train on the labelled data so lets pull all the transactions with labels

# the dataframe has some 0 values and some na. Lets convert all the 0 to na
df['label'] = df['label'].replace(0, np.nan)

# filter rows where the label is NaN, e.g. not labelled
labelled_df = df.loc[df['label'].notna()]
unlabelled_df = df.loc[df['label'].isna()]


print(labelled_df.head())
print(unlabelled_df.head())

print(f'\n\nthe number of labelled transactions: {len(labelled_df)}')
print(f'\n\nthe number of unlabelled transactions: {len(unlabelled_df)}')

                                      Supplier   Amount   Pay Date  \
0                                 MUSO LIMITED    600.0 2013-03-28   
1                        ARNOLFINI GALLERY LTD   5000.0 2013-03-25   
2                             SHOW OF STRENGTH   8000.0 2013-03-25   
3                          THE TOBACCO FACTORY  10000.0 2013-03-25   
4  BRISTOL INTERNATIONAL JAZZ & BLUES FESTIVAL   2500.0 2013-03-18   

                Description 1               Description 2  \
0                FEES PAYABLE  "ARTS, EVENTS & FESTIVALS"   
1  PAYMENTS TO OTHER AGENCIES  "ARTS, EVENTS & FESTIVALS"   
2  PAYMENTS TO OTHER AGENCIES  "ARTS, EVENTS & FESTIVALS"   
3  PAYMENTS TO OTHER AGENCIES  "ARTS, EVENTS & FESTIVALS"   
4              GRANTS PAYABLE  "ARTS, EVENTS & FESTIVALS"   

                Description 3  label  
0            ARTS & FESTIVALS    7.0  
1                   ARNOLFINI    7.0  
2            SHOW OF STRENGTH    7.0  
3  TOBACCO FACTORY ARTS TRUST    7.0  
4          NEIGHB

## combine descriptions

lets combine the supplier name and the descriptions so we can tokenize and train on all of them at the same time

In [6]:
# create a new column containing all the text data
# make sure there is a space inbetween the different column text data.
# ignore a value if it is NaN or 'REDACTED' as we don't want the model to train on these.

columns_to_combine = ['Supplier', 'Description 1', 'Description 2', 'Description 3' ]

labelled_df['text_data'] = labelled_df[columns_to_combine].apply(lambda x: ' '.join(y for y in x.dropna().astype(str) if 'REDACTED' not in y), axis=1)

unlabelled_df['text_data'] = unlabelled_df[columns_to_combine].apply(lambda x: ' '.join(y for y in x.dropna().astype(str) if 'REDACTED' not in y), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labelled_df['text_data'] = labelled_df[columns_to_combine].apply(lambda x: ' '.join(y for y in x.dropna().astype(str) if 'REDACTED' not in y), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabelled_df['text_data'] = unlabelled_df[columns_to_combine].apply(lambda x: ' '.join(y for y in x.dropna().astype(str) if 'REDACTED' not in y), axis=1)


In [7]:
labelled_df.head()

Unnamed: 0,Supplier,Amount,Pay Date,Description 1,Description 2,Description 3,label,text_data
0,MUSO LIMITED,600.0,2013-03-28,FEES PAYABLE,"""ARTS, EVENTS & FESTIVALS""",ARTS & FESTIVALS,7.0,"MUSO LIMITED FEES PAYABLE ""ARTS, EVENTS & FEST..."
1,ARNOLFINI GALLERY LTD,5000.0,2013-03-25,PAYMENTS TO OTHER AGENCIES,"""ARTS, EVENTS & FESTIVALS""",ARNOLFINI,7.0,ARNOLFINI GALLERY LTD PAYMENTS TO OTHER AGENCI...
2,SHOW OF STRENGTH,8000.0,2013-03-25,PAYMENTS TO OTHER AGENCIES,"""ARTS, EVENTS & FESTIVALS""",SHOW OF STRENGTH,7.0,"SHOW OF STRENGTH PAYMENTS TO OTHER AGENCIES ""A..."
3,THE TOBACCO FACTORY,10000.0,2013-03-25,PAYMENTS TO OTHER AGENCIES,"""ARTS, EVENTS & FESTIVALS""",TOBACCO FACTORY ARTS TRUST,7.0,THE TOBACCO FACTORY PAYMENTS TO OTHER AGENCIES...
4,BRISTOL INTERNATIONAL JAZZ & BLUES FESTIVAL,2500.0,2013-03-18,GRANTS PAYABLE,"""ARTS, EVENTS & FESTIVALS""",NEIGHBOURHOOD ARTS,7.0,BRISTOL INTERNATIONAL JAZZ & BLUES FESTIVAL GR...


## Create word embeddings for labeled data
Download and then use the pre-trained BERT model to do this.

In [8]:
def clean_text_BERT(text):
    '''
    A function to clean and tokenize text so it is ready to be inputted into BERT
    '''

    # Convert words to lower case.
    text = text.lower()

    # Remove special characters and numbers. This also removes the dates 
    # which are not important in classifying expenses
    text = re.sub(r'[^\w\s]|https?://\S+|www\.\S+|https?:/\S+|[^\x00-\x7F]+|\d+', '', str(text).strip())
  
    # Tokenise 
    text_list = word_tokenize(text)
    result = ' '.join(text_list)
    return result

In [9]:
# clean our text data
text_raw = labelled_df['text_data']
text_BERT = text_raw.apply(lambda x: clean_text_BERT(x))

In [10]:
# an example of the text data before and after cleaning
print(text_raw[2000])
print(text_BERT[2000])


St George Health Centre Services - Supplies and Services to service users 03:NHS Health Checks  
st george health centre services supplies and services to service users nhs health checks


In [12]:
# # This may take some time to download and run.
# # depending on the size of the input.
# # I have saved the output using the cell below so it can be loaded quickly.

# model = SentenceTransformer('paraphrase-mpnet-base-v2') 
# bert_input = text_BERT.tolist()
# embeddings = model.encode(bert_input, show_progress_bar = True)
# embedding_BERT = np.array(embeddings)

Batches:   0%|          | 0/1415 [00:00<?, ?it/s]

In [20]:
# # save these embeddings to a numpy file
# np.save('./pkl_data/labelled_data_embedding.npy', embedding_BERT)


In [21]:
# load the embeddings from the pkl file
embedding_BERT = np.load('./pkl_data/labelled_data_embedding.npy', allow_pickle = True)

embedding_BERT

array([[-0.1147997 ,  0.00622684, -0.06120515, ..., -0.01737143,
         0.16032901, -0.01618893],
       [-0.05237475,  0.2965644 , -0.03495542, ..., -0.00776623,
         0.12089074,  0.03460617],
       [-0.13074368,  0.17108506, -0.05069165, ...,  0.01171169,
         0.10542176, -0.04691478],
       ...,
       [-0.10476775,  0.19025211,  0.00164859, ..., -0.12478687,
        -0.25964585, -0.03215555],
       [-0.10476775,  0.19025211,  0.00164859, ..., -0.12478687,
        -0.25964585, -0.03215555],
       [-0.10476775,  0.19025211,  0.00164859, ..., -0.12478687,
        -0.25964585, -0.03215555]], dtype=float32)

In [15]:
# # have a look at the word embeddings

# df_embedding_bert = pd.DataFrame(embeddings)
# df_embedding_bert.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.1148,0.006227,-0.061205,0.055468,0.119211,-0.133504,0.078139,-0.041612,0.057181,0.107018,...,0.015894,-0.055724,0.046353,-0.081037,-0.058252,-0.214004,-0.102968,-0.017371,0.160329,-0.016189
1,-0.052375,0.296564,-0.034955,0.221045,0.061655,-0.072989,0.238213,-0.012476,0.121805,0.046962,...,0.04282,-0.027373,0.052239,-0.009347,-0.035258,-0.218472,-0.011269,-0.007766,0.120891,0.034606
2,-0.130744,0.171085,-0.050692,0.192037,0.058402,-0.046011,-0.00091,0.078297,0.136332,0.06271,...,0.089419,-0.066612,0.238528,0.071327,-0.037926,0.030003,0.012964,0.011712,0.105422,-0.046915
3,-0.111276,0.301693,0.041094,0.242109,-0.036181,0.022725,0.122539,0.095006,0.153006,0.060218,...,-0.065611,-0.048483,-0.030131,-0.006278,0.049288,0.232537,0.00858,-0.004122,0.042453,0.046353
4,-0.114618,0.306195,-0.072638,0.144114,0.027658,-0.048989,-0.135876,0.098075,0.017136,0.10636,...,0.096182,-0.037302,-0.001019,-0.107279,0.011401,-0.126855,-0.091941,0.026467,0.152749,-0.019689


## Create word embeddings for unlabelled data

In [16]:
# first create a test sample (as it takes a long time to compute this)
# this will create a new dataframe using the first 200 rows 
test_unlabelled_df = unlabelled_df.head(500)

# Load texts
text_test_raw = test_unlabelled_df['text_data']

# Apply data cleaning function as for training data
text_test_BERT = text_test_raw.apply(lambda x: clean_text_BERT(x))


# Apply BERT embedding
bert_input_test = text_test_BERT.tolist()
model = SentenceTransformer('paraphrase-mpnet-base-v2') 
embeddings_test = model.encode(bert_input_test, show_progress_bar = True)
embedding_BERT_test = np.array(embeddings_test)

df_embedding_bert_test = pd.DataFrame(embeddings_test)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

## Pair unseen data with the most similar training data

In [17]:
# Find the most similar word embedding with unseen data in the training data

similarity_new_data = cosine_similarity(embedding_BERT_test, embedding_BERT)
similarity_df = pd.DataFrame(similarity_new_data)

# Returns index for most similar embedding
# See first column of the output dataframe below
index_similarity = similarity_df.idxmax(axis = 1)

print(index_similarity)

0       8164
1       8164
2       8519
3      30465
4       8320
       ...  
495    27141
496    44958
497    27144
498     4940
499     4940
Length: 500, dtype: int64


In [18]:
# Return dataframe for most similar embedding/transactions in training dataframe
data_inspect = labelled_df.iloc[index_similarity, :].reset_index(drop = True)

unseen_verbatim = text_test_raw.reset_index(drop = True)
matched_verbatim = data_inspect['text_data']
label = data_inspect['label']

d_output = {
            'unseen_transaction': unseen_verbatim,
            'matched_transaction': matched_verbatim, 
            'matched_class': label
            
            }

d_output_df = pd.DataFrame.from_dict(d_output)
d_output_df.tail()



Unnamed: 0,unseen_transaction,matched_transaction,matched_class
495,New Delight Enterprises Ltd Services - Fees an...,Drawn In Bristol Services - Fees and Charges M...,13.0
496,Kennet Equipment Leasing Ltd Equipment - Purch...,"KINGKRAFT LTD EQUIPMENT, FURNITURE & MATERIALS...",12.0
497,Cooltech Environmental Engineering Ltd Service...,Bristol Blue Glass (Sw) Ltd Services - Fees an...,13.0
498,Pattersons (Bristol) Limited Catering 100 Temp...,Pegasus Catering Ltd Catering 15:Smokefree Bri...,5.0
499,Pattersons (Bristol) Limited Catering 100 Temp...,Pegasus Catering Ltd Catering 15:Smokefree Bri...,5.0


# Next Steps

* save the word embeddings so we don't need to compute them every time.
* train a random set of unlabelled data.
* train all the data