In [1]:
import os
import requests
from dotenv import load_dotenv

load_dotenv()
TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")

def transaction_memo_to_tags(memo: str) -> str:
	url = "https://api.together.xyz/v1/chat/completions"

	payload = {
		"model": "lmsys/vicuna-13b-v1.5",
		"stop": ["</s>"],
		"temperature": 0.0,
		"messages": [
			{
				"role": "system",
				"content": "You are a helpful transactions tag labeler who is really good at outputting a list of tags for a transaction given its transaction memo. You will receive information from the user, find the memo, and output the tags in list, for example: `\"[\"tag1\", \"tag2\"]\"`. Do not output anything else"
			},
			{
				"role": "system",
				"content": "For example: input: `AMAZON.COM*H836V2M62`; output: `['🏆Prizes']`"
			},
			{
				"role": "system",
				"content": "More example: input: `💸 Travel Stipend for Ivoine Strachan (Int.)`; output: `['✈️Travel Expense', '🎁 Travel Stipends']`"
			},
			{
				"role": "system",
				"content": "You should only output the final structured json list of tags, for example: `['🏆Prizes']`\n",
			},
			{
				"role": "user",
				"content": f"\n---\n Please give me the tags for the following transaction: \n\n`{memo}`\n---\n "
    +"Simply Give me the tags for the transaction, I want to see something like `['️💙TagName']` Don't say anything else to me"
			}
		]
	}
	headers = {
		"accept": "application/json",
		"content-type": "application/json",
		"Authorization": f"Bearer {TOGETHER_API_KEY}"
	}

	response = requests.post(url, json=payload, headers=headers)
	response_json = response.json().get('choices', [{}])[0].get('message', {}).get('content', '')

	return response_json


In [106]:
transaction_memo_to_tags(
    "ROSSMANN REPAIR GROUP INC, CARD: SAM PODER, MEMBER: SAM PODER")

"`['🏭Retail', '💳Credit Card', '👤Member']`"

In [91]:
transaction_memo_to_tags(
    "Amazon.com: Student Hardware Grant Purchase")

"`['🏫Education', '💻Hardware', '🎓Student']`"

In [None]:
transaction_memo_to_tags(
	"ALIEXPRESS.COM, CARD: ROSHAN PALAKKAL, MEMBER: ROSHAN PALAKKAL"
)

"`['🛍️Shopping', '💳Credit Card', '📱Mobile Payment']`"

In [130]:
import pandas as pd

df = pd.read_csv('transactions3.csv').head(10)

# fillna tags = []
df['tags'] = df['tags'].fillna('[]')

df['tags'] = df.apply(lambda row: transaction_memo_to_tags(
    row['memo']) if row['tags'] == '[]' else row['tags'], axis=1)

df

Unnamed: 0,amount_cents,memo,date,type,tags,org_category,org_id,check_memo,donation_memo,invoice_memo,transfer_memo
0,-65773,TRANSFER TO STATE HIGH HACK CLUB,2022-09-21,transfer,"`['🏦Transfer', '🏫State High Hack Club']`",high_school_hackathon,org_Yvguja,,,,
1,-2832,NAME-CHEAP.COM* 8SG11P,2021-12-04,card_charge,"`['🛍️Shopping', '💻Internet Services']`",high_school_hackathon,org_Yvguja,,,,
2,33286,TRANSFER FROM HACK CLUB HQ,2021-03-15,transfer,`['💼Transfers']`,high_school_hackathon,org_Yvguja,,,,
3,-1316,NAME-CHEAP.COM,2021-03-15,card_charge,"`['💻🛍️Online Shopping', '💳💰E-commerce']`",high_school_hackathon,org_Yvguja,,,,
4,-33286,HACK CLUB BANK FEE (MISTAKE BY BANK),2021-03-08,bank_account_transaction,"`['💸Bank Fees', '🤖Automated']`",high_school_hackathon,org_Yvguja,,,,
5,-1316,NAME-CHEAP.COM,2020-12-11,card_charge,"`['💻🛍️Online Shopping', '💳💰E-commerce']`",high_school_hackathon,org_Yvguja,,,,
6,275,EMBURSE MIGRATION 💸 FROM EMBURSE CLEARINGHOUSE,2020-09-23,transfer,"`['💸Migration', '💳Clearinghouse']`",high_school_hackathon,org_Yvguja,,,,
7,-1498,"SAMSCLUB #6533, CARD: JOY LIU, MEMBER: JOY LIU",2019-09-21,bank_account_transaction,"`['🏦Shopping Clubs', '💳Credit/Debit Cards', '👤...",high_school_hackathon,org_Yvguja,,,,
8,-3603,"SAMS CLUB #6533, CARD: JOY LIU, MEMBER: JOY LIU",2019-09-19,bank_account_transaction,"`['🏦Shopping Clubs', '💳Credit/Debit Cards', '👩...",high_school_hackathon,org_Yvguja,,,,
9,-536,"WM SUPERCENTER #2230, CARD: JOY LIU, MEMBER: J...",2019-09-19,bank_account_transaction,"`['🏬Grocery Stores', '💳Credit Cards', '📝Member...",high_school_hackathon,org_Yvguja,,,,


In [2]:
import pandas as pd
from tqdm import tqdm
import concurrent.futures

df = pd.read_csv('transactions3.csv')
df = df.head(4000)

df['tags'] = df['tags'].fillna('[]')


def process_tags(row):
    if row['tags'] == '[]':
        return transaction_memo_to_tags(row['memo'])
    else:
        return row['tags']


def process_chunk(chunk, chunk_index):
    chunk['tags'] = chunk.apply(process_tags, axis=1)
    return chunk


num_chunks = 30
chunk_size = len(df) // num_chunks
chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

with concurrent.futures.ThreadPoolExecutor(max_workers=num_chunks) as executor:
    future_to_chunk = {executor.submit(process_chunk, chunk, index): (
        chunk, index) for index, chunk in enumerate(chunks)}

    progress_bar = tqdm(concurrent.futures.as_completed(
        future_to_chunk), total=len(chunks), desc='Processing chunks')

    for future in progress_bar:
        chunk, index = future_to_chunk[future]
        try:
            processed_chunk = future.result()
            file_name = f'transactions_llm_chunk_{index}.csv'
            processed_chunk.to_csv(file_name, index=False)
        except Exception as exc:
            print(f'Chunk {index} generated an exception: {exc}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['tags'] = chunk.apply(process_tags, axis=1)
Processing chunks: 100%|██████████| 31/31 [00:54<00:00,  1.76s/it]
