In [None]:
import requests
import boto3

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from tqdm import tqdm

import pandas as pd
import numpy as np

import string
import time

from dotenv import dotenv_values

In [None]:
# Read key/value settings from the local `.env` file; later cells look up the
# search API credentials, AWS credentials and the DynamoDB table name here.
env_vars = dotenv_values('.env')

In [None]:
# Token length every encoded text field is padded/truncated to.
MAX_LEN = 512
# Number of samples per batch handed to the DataLoader.
BATCH_SIZE = 2

In [None]:
# Fetch the NLTK resources used during text cleaning: WordNet backs
# WordNetLemmatizer, and the stopword corpus backs stopwords.words('english').
# Without the stopword corpus, building the dataset raises LookupError on a
# machine that has never downloaded it.
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
# Load the raw records from the CSV in the working directory and preview them.
df = pd.read_csv("sample.csv")
df.head()

In [None]:
# Keep only the rows whose 'Final Mapping' value is not written entirely in
# upper case.
# NOTE(review): a missing 'Final Mapping' yields NaN from .str.isupper() -
# confirm the column has no gaps.
not_all_caps = ~df['Final Mapping'].str.isupper()
df = df[not_all_caps]

# One-hot encode the remaining 'Final Mapping' values; each row's label is the
# integer indicator vector for its class.
one_hot = pd.get_dummies(df['Final Mapping'])
label = one_hot.apply(lambda row: np.array(row.astype(int)), axis=1)

# Shorten the column names and keep only the text features plus the label.
column_names = {
    'Original Vendor': 'vendor',
    'GL Account Description': 'description',
    'Vendor Mapping': 'mapping',
}
df = df.rename(columns=column_names)
df = df[["vendor", "description", "mapping"]]
df = df.assign(label=label)
df.head()

In [None]:
# Credentials for the Google Custom Search API, taken from the .env settings.
api_key = env_vars['CUSTOM_SEARCH_API_KEY']
search_engine_id = env_vars['SEARCH_ENGINE_ID']

def get_search(term):
    """Query the Google Custom Search API and flatten the hits into one string.

    Args:
        term: Text to search for (e.g. a vendor name).

    Returns:
        The title and snippet of every returned item, each followed by a
        single space, concatenated in result order. An empty string when the
        response carries no ``items``.

    Raises:
        requests.exceptions.RequestException: On connection failure, or when
            the API does not answer within the timeout.
    """
    # Pass the query through `params` so requests URL-encodes it; names that
    # contain spaces, '&' or '#' would otherwise corrupt the query string.
    # The timeout keeps one stalled request from hanging the whole run.
    response = requests.get(
        'https://www.googleapis.com/customsearch/v1',
        params={'key': api_key, 'cx': search_engine_id, 'q': term},
        timeout=30,
    )
    data = response.json()

    pieces = []
    if 'items' in data:
        for item in data['items']:
            for field in ('title', 'snippet'):
                if field in item:
                    pieces.append(item[field] + " ")
    return "".join(pieces)

# Smoke-test the search helper with a known term.
get_search("google")

In [None]:
# AWS session built from the access key, secret and region in the .env settings.
session = boto3.Session(
    aws_access_key_id=env_vars['AWS_API_KEY'],
    aws_secret_access_key=env_vars['AWS_API_SECRET'],
    region_name=env_vars['AWS_REGION']
)

# DynamoDB client shared by the table read/write helpers defined further down.
dynamodb_client = session.client('dynamodb')

In [None]:
def get_table(term):
    """Look up the stored content for ``term`` in the DynamoDB table.

    The table name is read from ``env_vars['AWS_TABLE_NAME']`` and the item is
    addressed by its ``name`` string attribute.

    Args:
        term: Value of the ``name`` key to fetch.

    Returns:
        The string held in the item's ``content`` attribute, or ``None`` when
        no item exists. Any exception is printed and also yields ``None``.
    """
    try:
        lookup = dynamodb_client.get_item(
            TableName=env_vars['AWS_TABLE_NAME'],
            Key={'name': {'S': term}},
        )
        record = lookup.get('Item')
        return record["content"]["S"] if record else None
    except Exception as e:
        # Best effort: report the failure and behave like a miss.
        print(f"An error occurred: {e}")
        return None
print(get_table("google"))

In [None]:
def put_table(term, content):
    """Store ``content`` under ``term`` in the DynamoDB table.

    Writes an item with string attributes ``name`` (the term) and ``content``
    to the table named by ``env_vars['AWS_TABLE_NAME']``. Any exception is
    printed rather than raised; the function returns ``None`` either way.

    Args:
        term: Value for the item's ``name`` attribute.
        content: Value for the item's ``content`` attribute.
    """
    record = {
        'name': {'S': term},
        'content': {'S': content},
    }
    try:
        dynamodb_client.put_item(TableName=env_vars['AWS_TABLE_NAME'], Item=record)
    except Exception as e:
        # Best effort: report the failure and carry on.
        print(f"An error occurred: {e}")
# Store the search result for a known term in the table.
put_table("google", get_search("google"))

In [None]:
def get_object(vendor):
    """Return search text for ``vendor``, using the DynamoDB table as a cache.

    The table is consulted first; on a miss the web search is run and its
    result is stored for next time.

    Args:
        vendor: Vendor name used both as the table key and the search term.

    Returns:
        The cached content when the table has a non-empty entry, otherwise
        whatever ``get_search`` returns (possibly an empty string).
    """
    cached = get_table(vendor)
    if cached:
        return cached

    search_res = get_search(vendor)
    # Only persist non-empty results. `get_table` also returns None when the
    # read itself failed, so writing an empty search result here could
    # overwrite good cached content; an empty entry is treated as a miss on
    # the next lookup anyway.
    if search_res:
        put_table(vendor, search_res)
    return search_res

In [None]:
# Register `progress_apply` on pandas objects so the lookups show a progress bar.
tqdm.pandas()
# Resolve the search text for every vendor, one row at a time.
df['search'] = df['vendor'].progress_apply(get_object)
df.head()

In [None]:
class VendorDF(Dataset):
    """Torch dataset that turns vendor rows into tokenized tensors.

    Each row contributes four text fields (vendor, description, mapping,
    search). Every field is cleaned, then encoded separately with the
    ``bert-base-uncased`` tokenizer and padded/truncated to ``max_len``.

    Args:
        df: Frame with ``vendor``, ``description``, ``mapping``, ``search``
            and ``label`` columns.
        max_len: Token length every encoded field is padded/truncated to.
    """

    def __init__(self, df, max_len):
        self.df = df
        self.vendors = df.vendor
        self.descriptions = df.description
        self.mappings = df.mapping
        self.searches = df.search
        self.targets = df.label

        self.max_len = max_len

        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english'))

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.targets)

    def clean_text(self, text):
        """Drop stopwords and punctuation tokens, then lemmatize and stem.

        Args:
            text: Raw field value. Missing values (``None``/NaN) are treated
                as empty text; other non-string values are stringified.

        Returns:
            The surviving tokens joined by single spaces.
        """
        if not isinstance(text, str):
            # Missing cells arrive as None/NaN and have no .split().
            text = "" if pd.isna(text) else str(text)

        kept = [token for token in text.split() if token.lower() not in self.stopwords]
        stemmed_tokens = [self.stemmer.stem(self.lemmatizer.lemmatize(token)) for token in kept]

        # Join with a space: gluing tokens together with "" would fuse the
        # whole text into one unbroken string and erase the word boundaries.
        return " ".join(token for token in stemmed_tokens if token not in string.punctuation)

    def encode_text(self, text):
        """Tokenize ``text`` to ``max_len`` tokens and return the encoding.

        Returns:
            The tokenizer output holding ``input_ids`` and ``attention_mask``
            as PyTorch tensors with a leading batch dimension of one.
        """
        return self.tokenizer.encode_plus(text,
                                          add_special_tokens=True,
                                          padding='max_length',
                                          truncation=True,
                                          max_length=self.max_len,
                                          return_tensors='pt')

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            A tuple ``(input_ids, attention_masks, target)``: two lists with
            one tensor per field, in the order vendor, description, mapping,
            search, followed by the row's label as a float tensor.
        """
        input_ids = []
        attention_masks = []
        for column in (self.vendors, self.descriptions, self.mappings, self.searches):
            # Positional lookup, so the dataset also works when the frame's
            # index is not 0..n-1 (e.g. after filtering or splitting).
            encoded = self.encode_text(self.clean_text(column.iloc[index]))
            # Drop the batch dimension added by return_tensors='pt'.
            input_ids.append(encoded['input_ids'].squeeze(0))
            attention_masks.append(encoded['attention_mask'].squeeze(0))

        target = torch.as_tensor(self.targets.iloc[index], dtype=torch.float32)
        return input_ids, attention_masks, target

In [None]:
class BERTClassifier(nn.Module):
    """Classifier over several text fields encoded by one shared BERT model.

    Every field is run through the same ``bert-base-uncased`` encoder; the
    pooled outputs are concatenated, passed through dropout and mapped to
    ``num_classes`` logits by a single linear layer.

    Args:
        num_classes: Number of output logits.
        num_inputs: Number of text fields per sample (default 4, matching the
            vendor/description/mapping/search fields).
    """

    def __init__(self, num_classes, num_inputs=4):
        super().__init__()
        self.num_inputs = num_inputs
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder's configuration instead of a literal:
        # 4 fields * 768 hidden units = 3072 for bert-base-uncased.
        self.classifier = nn.Linear(num_inputs * self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_masks):
        """Compute logits for a batch.

        Args:
            input_ids: Sequence holding one token-id tensor per field.
            attention_masks: Sequence holding the matching attention masks.

        Returns:
            The logits produced by the linear head (no activation applied).
        """
        pooled = [
            self.bert(input_ids=input_ids[i], attention_mask=attention_masks[i]).pooler_output
            for i in range(self.num_inputs)
        ]
        # Concatenate the per-field pooled vectors along the feature axis.
        pooled_output = self.dropout(torch.cat(pooled, dim=1))
        return self.classifier(pooled_output)

In [None]:
def train_model(model, dataloader, criterion, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_masks)``.
        dataloader: Iterable of ``(input_ids, attention_masks, labels)``
            batches, where the first two are sequences of tensors.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of passes over ``dataloader``.

    Returns:
        The same ``model`` instance, after training.
    """
    # Batches must live on the same device as the weights; otherwise the
    # forward pass fails as soon as the model has been moved to the GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Make sure dropout and similar layers are in training mode.
    model.train()
    num_batches = len(dataloader)

    for epoch in range(num_epochs):
        running_loss = 0.0
        pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}", leave=False)
        for input_ids, attention_masks, labels in pbar:
            input_ids = [ids.to(device) for ids in input_ids]
            attention_masks = [mask.to(device) for mask in attention_masks]
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_masks)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            pbar.set_postfix({'Loss': batch_loss})

        # Guard against an empty dataloader instead of dividing by zero.
        epoch_loss = running_loss / num_batches if num_batches else 0.0
        print(f"Epoch {epoch+1} Loss: {epoch_loss:.4f}")
    return model

In [None]:
# Hold out 20% of the rows. The fixed random_state keeps the split, and so
# everything trained on it, reproducible across kernel restarts.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# Renumber both frames from 0 so rows can be addressed by integers 0..n-1.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
train_df.head()

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Wrap the training frame in the dataset and serve it in shuffled batches.
dataset = VendorDF(train_df, MAX_LEN)
dataloader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

# The label vector has one entry per class, so its length is the class count.
num_classes = len(train_df.loc[0, 'label'])
model = BERTClassifier(num_classes)

model.to(device)

In [None]:
# Loss applied to the model's raw logits against the per-row label vectors.
criterion = nn.BCEWithLogitsLoss()
# Adam over every model parameter (the BERT encoder included), lr = 2e-5.
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [None]:
# Run the training loop for 10 epochs; the returned model replaces `model`.
model = train_model(model, dataloader, criterion, optimizer, num_epochs=10)