In [2]:
import pandas as pd
import random
from datetime import datetime, timedelta
from pymongo import MongoClient
from azure.core.exceptions import AzureError
from azure.core.credentials import AzureKeyCredential
import json
import urllib 
from openai import AzureOpenAI
from dotenv import dotenv_values
import os
config = dotenv_values()

In [3]:
from dotenv import load_dotenv

load_dotenv("variable.env", override=True)

True

In [4]:
from urllib.parse import quote_plus

MONGO_CONNECTION_STRING= os.getenv("MONGO_CONNECTION_STRING", "<YOUR-COSMOS-DB-CONNECTION-STRING>")
AOAI_KEY = os.getenv("AOAI_KEY")
AOAI_ENDPOINT =  os.getenv("AOAI_ENDPOINT")
API_VERSION =  os.getenv("API_VERSION")
AOAI_EMBEDDING_DEPLOYMENT =  os.getenv("AOAI_EMBEDDING_DEPLOYMENT")
AOAI_EMBEDDING_DEPLOYMENT_MODEL =  os.getenv("AOAI_EMBEDDING_DEPLOYMENT_MODEL")


client = AzureOpenAI(
  azure_endpoint= AOAI_ENDPOINT,
  api_key=AOAI_KEY,  
  api_version=API_VERSION
)

In [7]:
def generate_embedding(text):
    response = client.embeddings.create(
        model=AOAI_EMBEDDING_DEPLOYMENT_MODEL,
        input=text
    )
    return response.data[0].embedding

In [5]:
mongo_conn = MONGO_CONNECTION_STRING
mongo_client = MongoClient(mongo_conn)

db = mongo_client['account']

# Create collection if it doesn't exist
COLLECTION_NAME = "transactions"

collection = db[COLLECTION_NAME]

if COLLECTION_NAME not in db.list_collection_names():
    db.create_collection(COLLECTION_NAME)
    print("Created collection '{}'.\n".format(COLLECTION_NAME))
else:
    print("Using collection: '{}'.\n".format(COLLECTION_NAME))

  mongo_client = MongoClient(mongo_conn)


Using collection: 'transactions'.



In [8]:
# Load JSON data from file
with open("test.json", 'r') as file:
    data = json.load(file)

# Remove _id field from each item
data = [{k: v for k, v in item.items() if k != '_id'} for item in data]

df = pd.DataFrame(data, columns= ["TransactionID",
        "Amount",
        "Timestamp",
        "Location",
        "Merchant",
        "Fraud"])
# Print the cleaned data
print(df.head())


# List to store embeddings along with transaction details
embeddings = []

# Loop through each record in the JSON data and generate embeddings
for index, row in df.iterrows():
    # Combine relevant fields into a single text input for embedding generation
    text = f"TransactionID: {row['TransactionID']}, Amount: {row['Amount']}, Timestamp: {row['Timestamp']}, Location: {row['Location']}, Merchant: {row['Merchant']}"
    
    # Generate embedding based on the text
    embedding = generate_embedding(text)
    
    # Append the result (transaction + embedding) to the embeddings list
    embeddings.append({
        "TransactionID": row["TransactionID"],
        "Amount": row["Amount"],
        "Timestamp": str(row["Timestamp"]),  # Convert Timestamp to string for consistency
        "Location": row["Location"],
        "Merchant": row["Merchant"],
        "Fraud": row["Fraud"],
        "Embedding": embedding
    })
    
with open("transactions_with_embeddings.json", "w") as f:
    json.dump(embeddings, f, indent=4)

  TransactionID  Amount            Timestamp  Location   Merchant  Fraud
0         T7037  406.12  2024-09-15 13:50:38    Boston       Lyft  False
1         T2243  165.86  2024-09-15 14:23:38  New York  Starbucks  False
2         T7356  360.73  2024-09-15 14:04:38  New York     Amazon  False
3         T3235  282.75  2024-09-15 14:28:38    Boston     Amazon  False
4         T5109  108.79  2024-09-15 14:02:38    Boston       Lyft  False


In [81]:
db.command({
  'createIndexes': 'transactions',
  'indexes': [
    {
      'name': 'transactionsIndex',
      'key': {
        "Embedding": "cosmosSearch"
      },
      'cosmosSearchOptions': {
        'kind': 'vector-hnsw',
        'numLists': 1,
        'similarity': 'COS',
        'dimensions': 1536
      }
    }
  ]
})

{'raw': {'defaultShard': {'numIndexesBefore': 1,
   'numIndexesAfter': 2,
   'createdCollectionAutomatically': False,
   'ok': 1}},
 'ok': 1}

In [78]:
db.command({'dropIndexes': 'transactions', 'index': 'transactionsIndex'})

{'nIndexesWas': 2, 'ok': 1.0}

In [9]:
collection.delete_many({})

DeleteResult({'n': 100, 'ok': 1.0}, acknowledged=True)

In [10]:
with open('transactions_with_embeddings.json') as file:
    file_data = json.load(file)
collection.insert_many(file_data)

InsertManyResult([ObjectId('66e7746f43ef52d37e998307'), ObjectId('66e7746f43ef52d37e998308'), ObjectId('66e7746f43ef52d37e998309'), ObjectId('66e7746f43ef52d37e99830a'), ObjectId('66e7746f43ef52d37e99830b'), ObjectId('66e7746f43ef52d37e99830c'), ObjectId('66e7746f43ef52d37e99830d'), ObjectId('66e7746f43ef52d37e99830e'), ObjectId('66e7746f43ef52d37e99830f'), ObjectId('66e7746f43ef52d37e998310'), ObjectId('66e7746f43ef52d37e998311'), ObjectId('66e7746f43ef52d37e998312'), ObjectId('66e7746f43ef52d37e998313'), ObjectId('66e7746f43ef52d37e998314'), ObjectId('66e7746f43ef52d37e998315'), ObjectId('66e7746f43ef52d37e998316'), ObjectId('66e7746f43ef52d37e998317'), ObjectId('66e7746f43ef52d37e998318'), ObjectId('66e7746f43ef52d37e998319'), ObjectId('66e7746f43ef52d37e99831a'), ObjectId('66e7746f43ef52d37e99831b'), ObjectId('66e7746f43ef52d37e99831c'), ObjectId('66e7746f43ef52d37e99831d'), ObjectId('66e7746f43ef52d37e99831e'), ObjectId('66e7746f43ef52d37e99831f'), ObjectId('66e7746f43ef52d37e9983

In [16]:
def vector_search(new_transaction, num_results=5):
    # Generate the embedding for the new transaction
    query_embedding = generate_embedding(new_transaction)

    # Define the search pipeline with vector search using the `Embedding` field
    pipeline = [
        {
            '$search': {
                "cosmosSearch": {
                    "vector": query_embedding,  # The query vector (embedding of the new transaction)
                    "path": "Embedding",  # The field that contains embeddings in the collection
                    "k": num_results,  # Number of results to return
                    "efSearch": 4,
                #    "filter": {"Fraud": {"neq": "False"}}
                },
                "returnStoredSource": True
            }
        },
        {
            '$project': {
                'similarityScore': { '$meta': 'searchScore' },  # Project the similarity score
                'TransactionID': 1,  # Project relevant fields
                'Amount': 1,
                'Timestamp': 1,
                'Location': 1,
                'Merchant': 1,
                'Fraud': 1
            }
        }
    ]

    # Execute the aggregation pipeline in Cosmos DB
    results = collection.aggregate(pipeline)
    
    
    return results



In [17]:
# Example new transaction (replace with actual transaction data)
new_transaction = {
    "TransactionID": "T5978",
    "Amount": 1005.00,
    "Timestamp": "2024-09-15 14:30:00",
    "Location": "Texas",
    "Merchant": "XYZ"
}

new_transaction_text=f"TransactionID: {new_transaction['TransactionID']}, Timestamp: {new_transaction['Timestamp']}, Location: {new_transaction['Location']}, Merchant{new_transaction['Merchant']}"

# Perform the vector search and get the top 5 most similar fraudulent transactions
results = vector_search(new_transaction_text, num_results=5)
# Output results
for r in results:
    print(r) 
print(f"TransactionID: {new_transaction['TransactionID']}, Location: {new_transaction['Location']}, Merchant: {new_transaction['Merchant']}")


{'_id': ObjectId('66e7746f43ef52d37e998342'), 'TransactionID': 'T1805', 'Amount': 27.51, 'Timestamp': '2024-09-15 14:27:38', 'Location': 'Texas', 'Merchant': 'Random Store', 'Fraud': True, 'similarityScore': 0.846534210856657}
{'_id': ObjectId('66e7746f43ef52d37e99835b'), 'TransactionID': 'T6029', 'Amount': 22.22, 'Timestamp': '2024-09-15 14:20:38', 'Location': 'Texas', 'Merchant': 'Unrecognized Merchant', 'Fraud': True, 'similarityScore': 0.8330469131469727}
{'_id': ObjectId('66e7746f43ef52d37e998366'), 'TransactionID': 'T1263', 'Amount': 10.66, 'Timestamp': '2024-09-15 14:21:38', 'Location': 'Texas', 'Merchant': 'Unrecognized Merchant', 'Fraud': True, 'similarityScore': 0.8305064687283779}
{'_id': ObjectId('66e7746f43ef52d37e998317'), 'TransactionID': 'T6345', 'Amount': 60.86, 'Timestamp': '2024-09-15 13:58:38', 'Location': 'Texas', 'Merchant': 'Unrecognized Merchant', 'Fraud': True, 'similarityScore': 0.8300263386949114}
TransactionID: T5978, Location: Texas, Merchant: XYZ


In [None]:
# TODO currently using location as a primary for vector search