In [23]:
import requests
import boto3

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from tqdm import tqdm

import pandas as pd
import numpy as np

import string
import time

from dotenv import dotenv_values

In [3]:
env_vars = dotenv_values('.env')

In [32]:
MAX_LEN=512
BATCH_SIZE=2

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/siyer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
df = pd.read_csv("sample.csv")
df.head()

Unnamed: 0,Original Vendor,Deduped Vendor,Invoice Description,GL Description,Amount,Date,Vendor Mapping,GL Mapping,Final Category,Final Mapping,...,Description,GL-Acct,GL Account Description,Vendor,Vendor Code,Invoice Line Amt,PO Number,PO Line,Exclusion,Notes
0,Aptean,Aptean,FA - CIP,FA - CIP,618,1-May-22,IT Software,,Technology,IT Software,...,FA - CIP,1560-00,FA - CIP,Aptean,Aptean,$617.50,,,,
1,Aptean,Aptean,Pre Payments,Pre Payments,38496,1-May-22,IT Software,,Technology,IT Software,...,Pre Payments,1120-00,Pre Payments,Aptean,Aptean,"$38,496.07",,,,
2,Ashleys Pallets,Ashleys Pallets,Supplies - WH,Supplies - WH,7350,1-May-22,Crates & Pallets,,Packing & Shipping Supplies,Crates & Pallets,...,Supplies - WH,5740-00,Supplies - WH,Ashleys Pallets,Ashleys,"$7,350.00",,,,
3,Ashleys Pallets,Ashleys Pallets,Supplies - Warehouse,Supplies - Warehouse,12900,1-May-22,Crates & Pallets,,Packing & Shipping Supplies,Crates & Pallets,...,Supplies - Warehouse,5740-00,Supplies - Warehouse,Ashleys Pallets,Ashleys,"$12,900.00",,,,
4,Ashleys Pallets,Ashleys Pallets,Supplies - WH,Supplies - WH,6450,1-May-22,Crates & Pallets,,Packing & Shipping Supplies,Crates & Pallets,...,Supplies - WH,5740-00,Supplies - WH,Ashleys Pallets,Ashleys,"$6,450.00",,,,


In [7]:
df = df[~df['Final Mapping'].str.isupper()]
label = pd.get_dummies(df['Final Mapping']).apply(lambda row: np.array(row.astype(int)), axis=1)
df = df.rename(columns={'Original Vendor': 'vendor', 
                        'GL Account Description': 'description',
                        'Vendor Mapping': 'mapping'})
df = df[["vendor", "description", "mapping"]]
df = df.assign(label=label)
df.head()

Unnamed: 0,vendor,description,mapping,label
0,Aptean,FA - CIP,IT Software,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Aptean,Pre Payments,IT Software,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Ashleys Pallets,Supplies - WH,Crates & Pallets,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
3,Ashleys Pallets,Supplies - Warehouse,Crates & Pallets,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
4,Ashleys Pallets,Supplies - WH,Crates & Pallets,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [8]:
api_key = env_vars['CUSTOM_SEARCH_API_KEY']
search_engine_id = env_vars['SEARCH_ENGINE_ID']

def get_search(term):
    url = f'https://www.googleapis.com/customsearch/v1?key={api_key}&cx={search_engine_id}&q={term}'
    response = requests.get(url)
    data = response.json()

    res = ""
    if 'items' in data:
        for item in data['items']:
            if 'title' in item:
                res += item['title'] + " "
            if 'snippet' in item: 
                res += item['snippet']+ " "
    return res

get_search("google")

"Google Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking\xa0... Google Cloud: Cloud Computing Services Meet your business challenges head on with cloud computing services from Google, including data management, hybrid & multi-cloud, and AI & ML. Personal Cloud Storage & File Sharing Platform - Google Learn about Google Drive's file sharing platform that provides a personal, secure cloud storage option to share content with other users. Google Analytics Google Analytics lets you measure your advertising ROI as well as track your Flash, video, and social networking sites and applications. Google Maps Find local businesses, view maps and get driving directions in Google Maps. Google Ads - Get Customers and Sell More with Online Advertising Let Google's AI find your best performing ad formats across Youtube, Discover, Search, and more to maximize conversions. A line graph tracks conver

In [9]:
session = boto3.Session(
    aws_access_key_id=env_vars['AWS_API_KEY'],
    aws_secret_access_key=env_vars['AWS_API_SECRET'],
    region_name=env_vars['AWS_REGION']
)

dynamodb_client = session.client('dynamodb')

In [10]:
def get_table(term):
    try:
        key = {
            'name': {'S': term}
        }
        response = dynamodb_client.get_item(
            TableName=env_vars['AWS_TABLE_NAME'],
            Key=key
        )
        item = response.get('Item')
        if item:
            return item["content"]["S"]
        else:
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
print(get_table("google"))

Google Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking ... Google Cloud: Cloud Computing Services Meet your business challenges head on with cloud computing services from Google, including data management, hybrid & multi-cloud, and AI & ML. Personal Cloud Storage & File Sharing Platform - Google Learn about Google Drive's file sharing platform that provides a personal, secure cloud storage option to share content with other users. Google Maps Find local businesses, view maps and get driving directions in Google Maps. Google Analytics Google Analytics lets you measure your advertising ROI as well as track your Flash, video, and social networking sites and applications. Google Ads - Get More Customers & Generate Leads with Online Ads Discover how Google can help grow your business. Drive sales, generate leads & increase brand awareness with online ads. Analytics Tools & Solutions fo

In [11]:
def put_table(term, content):
    try:
        item = {
            'name': {'S': term},
            'content': {'S': content},
        }
        response = dynamodb_client.put_item(
            TableName=env_vars['AWS_TABLE_NAME'],
            Item=item
        )
    except Exception as e:
        print(f"An error occurred: {e}")
put_table("google", get_search("google"))

In [12]:
def get_object(vendor):
    table_res = get_table(vendor)
    if table_res:
        return table_res
    search_res = get_search(vendor)
    put_table(vendor, search_res)
    return search_res

In [13]:
tqdm.pandas()
df['search'] = df['vendor'].progress_apply(lambda x: get_object(x))
df.head()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13372/13372 [22:23<00:00,  9.96it/s]


Unnamed: 0,vendor,description,mapping,label,search
0,Aptean,FA - CIP,IT Software,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Aptean: Enterprise Software to Help Your Busin...
1,Aptean,Pre Payments,IT Software,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Aptean: Enterprise Software to Help Your Busin...
2,Ashleys Pallets,Supplies - WH,Crates & Pallets,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",Ashley Johnson's flipping pallet center - Home...
3,Ashleys Pallets,Supplies - Warehouse,Crates & Pallets,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",Ashley Johnson's flipping pallet center - Home...
4,Ashleys Pallets,Supplies - WH,Crates & Pallets,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",Ashley Johnson's flipping pallet center - Home...


In [36]:
class VendorDF(Dataset):
    def __init__(self, df, max_len):
        self.df = df
        self.vendors = df.vendor
        self.descriptions = df.description
        self.mappings = df.mapping
        self.searches = df.search
        self.targets = df.label
        
        self.max_len = max_len
        
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english'))
        
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    def __len__(self):
        return len(self.targets)
    
    def clean_text(self, text):
        tokens = text.split()
        tokens = [token for token in tokens if token.lower() not in self.stopwords]

        lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        stemmed_tokens = [self.stemmer.stem(token) for token in lemmatized_tokens]

        processed_text = "".join([token for token in stemmed_tokens if token not in string.punctuation])

        return processed_text

    def encode_text(self, text):
        tokenized_text = self.tokenizer.encode_plus(text,
                                                    add_special_tokens=True,
                                                    padding='max_length',
                                                    truncation=True,
                                                    max_length=self.max_len,
                                                    return_tensors='pt')
        return tokenized_text
    
    def __getitem__(self, index):
        vendor = self.vendors[index]
        description = self.descriptions[index]
        mapping = self.mappings[index]
        search = self.searches[index]
        target = self.targets[index]
        
        vendor_tokens = self.encode_text(self.clean_text(vendor))
        description_tokens = self.encode_text(self.clean_text(description))
        mapping_tokens = self.encode_text(self.clean_text(mapping))
        search_tokens = self.encode_text(self.clean_text(search))
        
        vendor_input_ids = vendor_tokens['input_ids'].squeeze(0)
        vendor_attention_mask = vendor_tokens['attention_mask'].squeeze(0)

        description_input_ids = description_tokens['input_ids'].squeeze(0)
        description_attention_mask = description_tokens['attention_mask'].squeeze(0)

        mapping_input_ids = mapping_tokens['input_ids'].squeeze(0)
        mapping_attention_mask = mapping_tokens['attention_mask'].squeeze(0)
        
        search_input_ids = search_tokens['input_ids'].squeeze(0)
        search_attention_mask = search_tokens['attention_mask'].squeeze(0)
        input_ids = [vendor_input_ids, description_input_ids,
                                 mapping_input_ids, search_input_ids]
        attention_masks = [vendor_attention_mask, description_attention_mask,
                                       mapping_attention_mask, search_attention_mask]
        
        return input_ids, attention_masks, torch.FloatTensor(target)

In [37]:
class BERTClassifier(nn.Module):
    def __init__(self, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(3072, num_classes)
        
    def forward(self, input_ids, attention_masks):
        
        outputs1 = self.bert(input_ids=input_ids[0], attention_mask=attention_masks[0])
        outputs2 = self.bert(input_ids=input_ids[1], attention_mask=attention_masks[1])
        outputs3 = self.bert(input_ids=input_ids[2], attention_mask=attention_masks[2])
        outputs4 = self.bert(input_ids=input_ids[3], attention_mask=attention_masks[3])

        pooled_output1 = outputs1.pooler_output
        pooled_output2 = outputs2.pooler_output
        pooled_output3 = outputs3.pooler_output
        pooled_output4 = outputs4.pooler_output

        pooled_output = torch.cat((pooled_output1, 
                                   pooled_output2, 
                                   pooled_output3, 
                                   pooled_output4), dim=1)

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [39]:
def train_model(model, dataloader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        running_loss = 0.0
        pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}", leave=False)
        for batch in pbar:            
            input_ids, attention_masks, labels = batch
            outputs = model(input_ids, attention_masks)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            pbar.set_postfix({'Loss': loss.item()})
            
        epoch_loss = running_loss / len(dataloader)
        print(f"Epoch {epoch+1} Loss: {epoch_loss:.4f}")
    return model

In [38]:
train_df, test_df = train_test_split(df, test_size=0.2)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
train_df.head()

Unnamed: 0,vendor,description,mapping,label,search
0,Chano And Sons Inc,Payroll - Prod,Non IT Temp Labor,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Chano and Sons Inc. - Home | Facebook CSI is t...
1,"Salt City Print & Packaging, Llc",Marketing - Advertising,Flexible Packaging,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Salt City Print Services STATIONED IN THE CITY...
2,Cibc Visa,Travel Expense - Sales,Bank/Other Finance Charges,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Credit Cards | CIBC Find the CIBC credit card ...
3,Premier Employee Solutions,Payroll - Prod,Non IT Temp Labor,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Premier Employee solutions Premier Employee so...
4,"Salt City Print & Packaging, Llc",Inventory - Raw Goods,Flexible Packaging,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Salt City Print Services STATIONED IN THE CITY...


In [40]:
dataset = VendorDF(train_df, MAX_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

num_classes = len(train_df.loc[0, 'label'])
model = BERTClassifier(num_classes)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [41]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [42]:
model = train_model(model, dataloader, criterion, optimizer, num_epochs=10)

                                                                                                                                                                                                                                                                                                                                

KeyboardInterrupt: 

In [None]:
# Inference on New Data
new_texts1 = ['New piece of text 1.', 'New piece of text 2.']
new_texts2 = ['Another new piece of text 1.', 'Another new piece of text 2.']
new_texts3 = ['Yet another new piece of text 1.', 'Yet another new piece of text 2.']
new_labels = [0, 1]

new_dataset = CustomDataset(new_texts1, new_texts2, new_texts3, new_labels)
new_dataloader = DataLoader(new_dataset, batch_size=batch_size)

model.eval()

with torch.no_grad():
    for batch in new_dataloader:
        input_ids1, attention_mask1, input_ids2, attention_mask2, input_ids3, attention_mask3, labels = batch
        input_ids1 = input_ids1.to(device)
        attention_mask1 = attention_mask1.to(device)
        input_ids2 = input_ids2.to(device)
        attention_mask2 = attention_mask2.to(device)
        input_ids3 = input_ids3.to(device)
        attention_mask3 = attention_mask3.to(device)

        logits = model(input_ids1, attention_mask1, input_ids2, attention_mask2, input_ids3, attention_mask3)
        predicted_labels = torch.argmax(logits, dim=1)

        print("Predicted Labels:", predicted_labels)