In [1]:
import time
import random
from tqdm import tqdm
import pandas as pd
from datetime import datetime
from geopy.geocoders import Photon, Nominatim
from os.path import join as pjoin

In [2]:
root = '/home/quamer23nasim38/reverse-recommendation-for-anomaly-detection/'
data_path = 'data/fraudTrain.csv'

max_user = 5
convert_coordinates_to_address = False

In [3]:
def dob_to_age(dob_str):
    """
    Converts date of birth (dob) in 'YYYY-MM-DD' format to age in years.
    
    Args:
        dob_str (str): Date of birth in 'YYYY-MM-DD' format.
    
    Returns:
        int: Age in years.
    """
    # Convert the dob_str into a datetime object
    dob = datetime.strptime(dob_str, '%Y-%m-%d')
    
    # Get the current date
    today = datetime.today()
    
    # Calculate age
    age = today.year - dob.year
    
    # Adjust age if birthday hasn't occurred yet this year
    if (today.month, today.day) < (dob.month, dob.day):
        age -= 1
    
    return age

def lat_long_to_address(lat, long):
    """
    Converts latitude and longitude to an address using Geopy.
    
    Args:
        lat (float): Latitude value.
        long (float): Longitude value.
    
    Returns:
        str: Corresponding address.
    """
    photon, nominatim = False, False
    try:
        GEOLOCATOR = Photon(user_agent="measurements")
        location = GEOLOCATOR.reverse((lat, long), language='en')
        photon = True
    except:
        try:
            GEOLOCATOR = Nominatim(user_agent="measurements")
            location = GEOLOCATOR.reverse((lat, long), language='en')
            nominatim = True
        except:
            return None
    if location:
        if photon:
            properties = location.raw['properties']
        elif nominatim:
            properties = location.raw['address']
        if properties.get('extent'):
            del properties['extent']

        if properties.get('osm_id'):
            del properties['osm_id']
        return properties
    else:
        return None
    
def convert_transaction_data_to_str(transaction_information, merchant_information, payment_address, merchant_address):
    template = f'''
{transaction_information['amt']}
-----------------------
{merchant_information['merchant']}; {merchant_information['category']}
-----------------------
{payment_address}
-----------------------
{merchant_address}
'''
    return template

def get_user_basic_info(transaction_detail):
    customer_information = transaction_detail[['name', 'gender', 'job', 'age']]
    registered_address = transaction_detail[['street', 'city', 'state', 'zip']]
    return customer_information, registered_address

def get_transactional_data(transaction_detail, convert_coordinates_to_address=True):
    transaction_information = transaction_detail[['trans_date_trans_time', 'amt']]
    merchant_information = transaction_detail[['merchant', 'category']]

    payment_lat, payment_long = transaction_detail[['lat', 'long']].values
    if convert_coordinates_to_address:
        payment_address = lat_long_to_address(payment_lat, payment_long)
    else:
        payment_address = None
    if payment_address:
        payment_address.update({'lat': payment_lat, 'long': payment_long})
    else:
        payment_address = {'lat': payment_lat, 'long': payment_long}
    payment_address = '; '.join([str(v) for _, v in payment_address.items()])

    merchant_lat, merchant_long = transaction_detail[['merch_lat', 'merch_long']].values
    if convert_coordinates_to_address:
        merchant_address = lat_long_to_address(merchant_lat, merchant_long)
    else:
        merchant_address = None
    if merchant_address:
        merchant_address.update({'lat': merchant_lat, 'long': merchant_long})
    else:
        merchant_address = {'lat': merchant_lat, 'long': merchant_long}
    merchant_address = '; '.join([str(v) for _, v in merchant_address.items()])
    
    return transaction_information, merchant_information, payment_address, merchant_address

In [4]:
df = pd.read_csv(pjoin(root, data_path))

In [5]:
random_cc_num = random.sample(list(df['cc_num'].unique()), max_user)
filtered_data = df[df['cc_num'].isin(random_cc_num)]
data = filtered_data.copy()

In [6]:
data['age'] = data['dob'].map(dob_to_age)
data.gender = data.gender.replace({
    'F': 'Female',
    'M': 'Male'
})
data['name'] = data['first'] + ' ' + data['last']
data['merchant'] = data.merchant.str.replace('fraud_', '')

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the pre-trained model (e.g., Sentence-BERT)
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en")
model = AutoModel.from_pretrained("BAAI/bge-small-en")
model.eval()

# Function to create embeddings from a transaction description
def embed_transaction(description):
    inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        embeddings = model(**inputs)
        embeddings = embeddings[0][:, 0]
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return embeddings

In [8]:
from qdrant_client import QdrantClient, models

# Initialize in-memory Qdrant client
client = QdrantClient(":memory:")

# Create a collection in Qdrant for storing transaction embeddings
client.create_collection(
    collection_name="transactions",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE)
)

def insert_transaction(transaction_embedding, payload, idx):

    client.upsert(
        collection_name="transactions",
        points=[
            models.PointStruct(
                id=idx,
                payload={
                    "transaction_data": payload,
                },
                vector=transaction_embedding,
            ),
        ],
    )

In [9]:
for user in random_cc_num:
    user_data = data[data['cc_num'] == user]
    user_data = user_data[user_data['is_fraud'] == 0]
    if user_data.shape[0]>0:
        customer_information, registered_address = get_user_basic_info(user_data.iloc[0])
        print(user_data.shape[0])
        break

508


In [10]:
for idx, (_, transaction) in tqdm(enumerate(user_data.iterrows()), total=len(user_data)):
    transaction_information, merchant_information, payment_address, merchant_address = get_transactional_data(transaction, convert_coordinates_to_address=True)
    transaction_description = convert_transaction_data_to_str(transaction_information, merchant_information, payment_address, merchant_address)
    embedding = embed_transaction(transaction_description)
    embedding = embedding[0].tolist()
    insert_transaction(embedding, transaction_description, idx)
    time.sleep(1)
    if idx == 100:
        break

 20%|█▉        | 100/508 [05:02<20:35,  3.03s/it]


In [11]:
new_transaction_info = '''
420000.54
-----------------------
Rajesh, Kumar; savings_account
-----------------------
Chandini Chowk; Delhi; India; 20.0583; 16.008
-----------------------
Vietnaam; 20.152538; 16.227746
'''

In [12]:
new_embedding = embed_transaction(new_transaction_info)

results = client.query_points(
    collection_name="transactions",
    query=new_embedding[0].tolist(),
    limit=10,
).points

In [30]:
import numpy as np

In [31]:
def detect_anomalies(query_response, threshold=0.95):
    similarity_scores = []
    for result in query_response:
        similarity_scores.append(result.score)

    if np.mean(similarity_scores) < threshold:
        return True
    else:
        return False

In [33]:
detect_anomalies(results)

True

In [None]:
context = []
for res in results:
    context.append(res.payload['transaction_data'])
context = "\n=============================NEW EXAMPLE===================================\n".join(context)

In [15]:
system_prompt = '''
You're an intelligent AI assistant that helps in detecting fraudulent transactions. 

You're provided with the three key information:
    1. CUSTOMER INFORMATION: This has all the basic information about the customer which should give some idea about customer behaviour. The template is provided below.
    2. CONTEXT: This has several  examples of a normal and non-fraudulent transactional information for the user. The template for each transaction is provided below.
    3. NEW TRANSACTIONAL INFORMATION: This is the new transactional information that you need to classify as fraudulent or not. The template is same as normal transactional information 

Template for CUSTOMER INFORMATION and TRANSACTIONAL INFORMATION are provided below:
    1. CUSTOMER INFORMATION TEMPLATE
        {NAME}; {GENDER}; {AGE}; {JOB}
        -----------------------
        {REGISTERED ADDRESS}

    2. TRANSACTIONAL INFORMATION TEMPLATE: 
        {AMOUNT}
        -----------------------
        {MERCHANT NAME}; {CATEGORY}
        -----------------------
        {PAYMENT ADDRESS}
        -----------------------
        {MERCHANT ADDRESS} 

Your task is to uderstand USER's personal information, registered address, and examples of normal transactional information based on template provided and classify the new transactional information as fraudulent or not based on the context provided and also provide the reason for your classification.

You're only allowed to provide response in a json format with the following keys:
    1. classification: This should be either of the following:
        a. Fraudulent
        b. Non-Fraudulent
    2. reason: This should be a string explaining the reason for your classification.

Example of the response:
{
    "classification": "Fraudulent",
    "reason": "The transaction amount is significantly higher than the average transaction amount."
}
    
You can not provide any other response apart from the above mentioned json format with the keys mentioned above. In the classification key, you can only provide either "Fraudulent" or "Non-Fraudulent" as the value.
'''

prompt_template = f'''
1. CUSTOMER INFORMATION:
    {customer_information['name']}; {customer_information['gender']}; {customer_information['age']}; {customer_information['job']}
    -----------------------
    {registered_address['street']}; {registered_address['city']}; {registered_address['state']}; {registered_address['zip']}

2. CONTEXT:
    {context}

3. NEW TRANSACTIONAL INFORMATION:
    {new_transaction_info}

RESPONSE:
'''

In [16]:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
import transformers
import torch

In [17]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": torch.bfloat16}
)

In [19]:
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": prompt_template},
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'role': 'assistant', 'content': '{\n    "classification": "Fraudulent",\n    "reason": "The transaction amount is significantly higher than the average transaction amount and the transaction is from a different country which is not present in the context."\n}'}


In [None]:
print(outputs[0]["generated_text"][-1])

In [28]:
eval(outputs[0]["generated_text"][-1]['content'])

{'classification': 'Fraudulent',
 'reason': 'The transaction amount is significantly higher than the average transaction amount and the transaction is from a different country which is not present in the context.'}