In [63]:
from pymongo import MongoClient
from dotenv import dotenv_values
import requests
from googlesearch import search
from transformers import DistilBertTokenizer, DistilBertModel
import pandas as pd
import torch
import time
from tqdm import tqdm
import weaviate


env_vars = dotenv_values('.env')

In [2]:
df = pd.read_csv("sample.csv")
df

Unnamed: 0,Original Vendor,Deduped Vendor,Invoice Description,GL Description,Amount,Date,Vendor Mapping,GL Mapping,Final Category,Final Mapping,...,Description,GL-Acct,GL Account Description,Vendor,Vendor Code,Invoice Line Amt,PO Number,PO Line,Exclusion,Notes
0,Aptean,Aptean,FA - CIP,FA - CIP,618,1-May-22,IT Software,,Technology,IT Software,...,FA - CIP,1560-00,FA - CIP,Aptean,Aptean,$617.50,,,,
1,Aptean,Aptean,Pre Payments,Pre Payments,38496,1-May-22,IT Software,,Technology,IT Software,...,Pre Payments,1120-00,Pre Payments,Aptean,Aptean,"$38,496.07",,,,
2,Ashleys Pallets,Ashleys Pallets,Supplies - WH,Supplies - WH,7350,1-May-22,Crates & Pallets,,Packing & Shipping Supplies,Crates & Pallets,...,Supplies - WH,5740-00,Supplies - WH,Ashleys Pallets,Ashleys,"$7,350.00",,,,
3,Ashleys Pallets,Ashleys Pallets,Supplies - Warehouse,Supplies - Warehouse,12900,1-May-22,Crates & Pallets,,Packing & Shipping Supplies,Crates & Pallets,...,Supplies - Warehouse,5740-00,Supplies - Warehouse,Ashleys Pallets,Ashleys,"$12,900.00",,,,
4,Ashleys Pallets,Ashleys Pallets,Supplies - WH,Supplies - WH,6450,1-May-22,Crates & Pallets,,Packing & Shipping Supplies,Crates & Pallets,...,Supplies - WH,5740-00,Supplies - WH,Ashleys Pallets,Ashleys,"$6,450.00",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16490,The Privatebank,The Privatebank,Interest Expense,Interest Expense,9123,30-Apr-23,Bank/Other Finance Charges,,Finance & Risk,Bank/Other Finance Charges,...,Interest Expense,8030-90,Interest Expense,The Privatebank,Privatebank,"$9,122.72",,,,
16491,The Privatebank,The Privatebank,Interest Expense,Interest Expense,23007,30-Apr-23,Bank/Other Finance Charges,,Finance & Risk,Bank/Other Finance Charges,...,Interest Expense,8030-90,Interest Expense,The Privatebank,Privatebank,"$23,006.67",,,,
16492,The Privatebank,The Privatebank,Interest Expense,Interest Expense,35686,30-Apr-23,Bank/Other Finance Charges,,Finance & Risk,Bank/Other Finance Charges,...,Interest Expense,8030-90,Interest Expense,The Privatebank,Privatebank,"$35,685.68",,,,
16493,The Privatebank,The Privatebank,Interest Expense,Interest Expense,57802,30-Apr-23,Bank/Other Finance Charges,,Finance & Risk,Bank/Other Finance Charges,...,Interest Expense,8030-90,Interest Expense,The Privatebank,Privatebank,"$57,801.56",,,,


In [64]:
weaviate_client = weaviate.Client(
    url = env_vars['WEAVIATE_API_URL'],
    auth_client_secret=weaviate.AuthApiKey(api_key=env_vars['WEAVIATE_API_KEY']),
    additional_headers={
        "X-HuggingFace-Api-Key": env_vars["HUGGINGFACE_API_KEY"]
    }
)
weaviate_client

<weaviate.client.Client at 0x7fcc45989c90>

In [81]:
class_obj = {
    'class': env_vars["WEAVIATE_CLASS"],
    'properties': [
        {
            'name': 'name',
            'dataType': ['text'],
            "tokenization": "lowercase",
            "moduleConfig": {
                "text2vec-huggingface": {
                    "skip": False,
                    "vectorizePropertyName": False
                }
            },
        },
        {
            'name': 'text',
            'dataType': ['text'],
            "tokenization": "lowercase",
            "moduleConfig": {
                "text2vec-huggingface": {
                    "skip": False,
                    "vectorizePropertyName": False
                }
            },

        },
    ],
    "vectorizer": "text2vec-huggingface",
    "moduleConfig": {
        "text2vec-huggingface": {
            "model": "bert-base-uncased",
            "options": {
                "waitForModel": True,
                "useGPU": False,
                "useCache": True
              }
        }
    },
}

weaviate_client.schema.create_class(class_obj)

In [80]:
weaviate_client.schema.delete_class(env_vars["WEAVIATE_CLASS"])

In [4]:
api_key = env_vars['CUSTOM_SEARCH_API_KEY']
search_engine_id = env_vars['SEARCH_ENGINE_ID']

def get_search(term):
    url = f'https://www.googleapis.com/customsearch/v1?key={api_key}&cx={search_engine_id}&q={term}'
    response = requests.get(url)
    data = response.json()

    res = ""
    if 'items' in data:
        for item in data['items']:
            if 'title' in item:
                res += item['title'] + " "
            if 'snippet' in item: 
                res += item['snippet']+ " "
    return res

get_search("google")

"Google Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking\xa0... Google Cloud: Cloud Computing Services Meet your business challenges head on with cloud computing services from Google, including data management, hybrid & multi-cloud, and AI & ML. Personal Cloud Storage & File Sharing Platform - Google Learn about Google Drive's file sharing platform that provides a personal, secure cloud storage option to share content with other users. Google Analytics Google Analytics lets you measure your advertising ROI as well as track your Flash, video, and social networking sites and applications. Google Maps Find local businesses, view maps and get driving directions in Google Maps. Google Ads - Get More Customers & Generate Leads with Online Ads Discover how Google can help grow your business. Drive sales, generate leads & increase brand awareness with online ads. Google Trends See how Goog

In [38]:
def get_object(vendor):
    text = get_search(vendor)
    return {
        "name": vendor,
        "text": text,
        #"vector": get_embeddings(text).float().numpy().flatten(),
    }
get_object("google")

{'name': 'google',
 'text': "Google Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking\xa0... Google Cloud: Cloud Computing Services Meet your business challenges head on with cloud computing services from Google, including data management, hybrid & multi-cloud, and AI & ML. Personal Cloud Storage & File Sharing Platform - Google Learn about Google Drive's file sharing platform that provides a personal, secure cloud storage option to share content with other users. Google Analytics Google Analytics lets you measure your advertising ROI as well as track your Flash, video, and social networking sites and applications. Google Maps Find local businesses, view maps and get driving directions in Google Maps. Google Ads - Get More Customers & Generate Leads with Online Ads Discover how Google can help grow your business. Drive sales, generate leads & increase brand awareness with online ads

In [94]:
where_filter = {
  "path": ["name"],
  "operator": "Equal",
  "valueText": 'Aptean'
}

query_result = (
    weaviate_client.query
    .get(env_vars["WEAVIATE_CLASS"], "name")
    .with_additional(["id vector"])
    .with_where(where_filter)
    .do()
)
query_result

{'data': {'Get': {'Vendor': [{'_additional': {'id': 'a0c68599-3a8b-4566-a759-bf5f2b26aa4a',
      'vector': [-0.11858152,
       0.13716806,
       0.5441321,
       -0.01696414,
       0.46025488,
       -0.15837538,
       0.14169039,
       0.46900746,
       0.077566154,
       -0.4449274,
       -0.01563998,
       -0.3577185,
       -0.12401764,
       0.34412396,
       0.07933413,
       0.40552786,
       0.3139181,
       0.09527322,
       -0.20206729,
       0.3767229,
       0.010667743,
       -0.24180947,
       0.28443405,
       0.615965,
       0.42417842,
       0.06734296,
       -0.16332923,
       -0.31421882,
       -0.34174392,
       -0.023501763,
       0.5870529,
       0.03464629,
       -0.20748906,
       -0.48511854,
       -0.05716093,
       0.071434684,
       -0.24753356,
       -0.08398187,
       -0.29798567,
       0.27194333,
       -0.63352364,
       -0.4374241,
       0.01592728,
       0.009585061,
       -0.31448907,
       -0.3018864,
      

In [82]:
res = get_object('Aptean')
properties = {
    "name": res["name"],
    "text": res["text"]
}

weaviate_client.data_object.create(
    data_object=properties,
    class_name=env_vars["WEAVIATE_CLASS"],
)

'a0c68599-3a8b-4566-a759-bf5f2b26aa4a'

In [86]:
total_rows = len(df)

for i, v in  tqdm(df['Vendor'].items(), total=total_rows):

    where_filter = {
      "path": ["name"],
      "operator": "Equal",
      "valueText": v
    }

    query_result = (
      weaviate_client.query
      .get(env_vars["WEAVIATE_CLASS"], "name")
      .with_where(where_filter)
      .do()
    )

    if len(query_result['data']['Get']['Vendor']) == 0:
        res = get_object(v)
        properties = {
            "name": res["name"],
            "text": res["text"]
        }
        finished = False
        while not finished:
            try:
                weaviate_client.data_object.create(
                    data_object=properties,
                    class_name=env_vars["WEAVIATE_CLASS"],
                )
                finished = True
            except:
                finished = False
                time.sleep(3)


  0%|▍                                                                                                                                                                       | 47/16495 [04:30<26:16:24,  5.75s/it]


KeyboardInterrupt: 