In [9]:
# https://pypi.org/project/pgeocode/
# https://github.com/thampiman/reverse-geocoder
# https://www.geonames.org/export/


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 KB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.4.2 threadpoolctl-3.5.0


In [28]:
import pandas as pd
import json
from sklearn.cluster import KMeans
import reverse_geocoder as rg


In [35]:
transactions = []
transactions_grouped = {}
date_to_index = {}
N_GROUPS = 60
with open('files/transactions.json') as f:
    data = json.load(f)

    for transaction in data:
        keys = transaction.keys()

        norm_transaction = {}
        should_skip = False
        for key in keys:
            if key == "('order_date', '')":
                norm_transaction['order_date'] = transaction[key]
            elif key == "('services__ipv4__geo', '')":
                norm_transaction['services__ipv4__geo'] = transaction[key]
                if not ',' in norm_transaction['services__ipv4__geo']:
                    should_skip = True
                    break
            elif key == "('count', 'request__transaction__order_total')":
                norm_transaction['order_total_count'] = transaction[key]
            elif key == "('sum', 'request__transaction__order_total')":
                norm_transaction['order_total_sum'] = transaction[key]
            elif key == "('mean', 'request__transaction__order_total')":
                norm_transaction['order_total_mean'] = transaction[key]
            elif key == "('count', 'risk_score')":
                norm_transaction['risk_score_count'] = transaction[key]
            elif key == "('sum', 'risk_score')":
                norm_transaction['risk_score_sum'] = transaction[key]
            elif key == "('mean', 'risk_score')":
                norm_transaction['risk_score_mean'] = transaction[key]
            else:
                norm_transaction['UNK_' + key] = transaction[key]
        
        if should_skip:
            continue
        if norm_transaction['order_date'] not in date_to_index:
            date_to_index[norm_transaction['order_date']] = len(date_to_index)

        norm_transaction['key_id'] = date_to_index[norm_transaction['order_date']]

        transactions.append(norm_transaction)

for transaction in transactions:
    if transaction['key_id'] not in transactions_grouped:
        transactions_grouped[transaction['key_id']] = []

    transactions_grouped[transaction['key_id']].append(transaction)

# Keep only the first N keys of the transactions
transactions_grouped = {k: transactions_grouped[k] for k in list(transactions_grouped.keys())[:N_GROUPS]}

merged_groups = {}

CLUSTERS = 12

for key in transactions_grouped:
    group = transactions_grouped[key]

    merged_group = []
    X = []
    for t in group:
        lat, lon = t['services__ipv4__geo'].split(',')
        lat = float(lat)
        lon = float(lon)
        X.append([lat, lon])
    kmeans = KMeans(n_clusters=CLUSTERS, random_state=12, n_init="auto").fit(X)

    center_to_neghbor = {}
    
    # find the nearest neighbor of each kmeans.cluster_centers_
    for ic, c in enumerate(kmeans.cluster_centers_): 
        min_dist = 999999999
        min_index = -1
        for i, x in enumerate(X):
            dist = (c[0] - x[0])**2 + (c[1] - x[1])**2
            if dist < min_dist:
                min_dist = dist
                min_index = i
        center_to_neghbor[ic] = min_index

    for i in range(CLUSTERS):
        cluster = []
        for j in range(len(kmeans.labels_)):
            if kmeans.labels_[j] == i:
                cluster.append(group[j])

        t = {}
        t['lat'] = X[center_to_neghbor[i]][0]
        t['lon'] = X[center_to_neghbor[i]][1]
        t['order_date'] = group[0]['order_date']
        t['order_total_count'] = round(sum([x['order_total_count'] for x in cluster]), 2)
        t['order_total_sum'] = round(sum([x['order_total_sum'] for x in cluster]), 2)
        t['risk_score_count'] = round(sum([x['risk_score_count'] for x in cluster]), 2)
        t['risk_score_sum'] = round(sum([x['risk_score_sum'] for x in cluster]), 2)
        t['risk_score_mean'] = round(sum([x['risk_score_mean'] for x in cluster]) / len(cluster), 2)

        merged_group.append(t)

    merged_groups[key] = merged_group

transactions_grouped = merged_groups

cache = {}
grouped_transactions_norm = {}

for k in transactions_grouped:
    print(f'Processing {len(transactions_grouped[k])} transactions of key {k}')
    i = 0
    for transaction in transactions_grouped[k]:
        i += 1
        coordinates = transaction['lat'], transaction['lon']

        if coordinates in cache:
            result = cache[coordinates]
        else:
            result = rg.search(coordinates)[0]
            cache[coordinates] = result

        transaction['city'] = result['admin1']

        if k not in grouped_transactions_norm:
            grouped_transactions_norm[k] = []
        
        grouped_transactions_norm[k].append(transaction)


transactions_grouped = grouped_transactions_norm

# Write to json
with open('files/transactions_norm.json', 'w') as f:
    json.dump(transactions_grouped, f)

Processing 12 transactions of key 0
Processing 12 transactions of key 1
Processing 12 transactions of key 2
Processing 12 transactions of key 3
Processing 12 transactions of key 4
Processing 12 transactions of key 5
Processing 12 transactions of key 6
Processing 12 transactions of key 7
Processing 12 transactions of key 8
Processing 12 transactions of key 9
Processing 12 transactions of key 10
Processing 12 transactions of key 11
Processing 12 transactions of key 12
Processing 12 transactions of key 13
Processing 12 transactions of key 14
Processing 12 transactions of key 15
Processing 12 transactions of key 16
Processing 12 transactions of key 17
Processing 12 transactions of key 18
Processing 12 transactions of key 19
Processing 12 transactions of key 20
Processing 12 transactions of key 21
Processing 12 transactions of key 22
Processing 12 transactions of key 23
Processing 12 transactions of key 24
Processing 12 transactions of key 25
Processing 12 transactions of key 26
Processing 

In [30]:
for group in transactions_grouped:
    the_transactions = transactions_grouped[group]

    min_count = 10000
    max_count = 0
    min_sum = 100000
    max_sum = 0
    min_risk = 10
    max_risk = 0

    for transaction in the_transactions:
        if transaction['order_total_count'] < min_count:
            min_count = transaction['order_total_count']
        if transaction['order_total_count'] > max_count:
            max_count = transaction['order_total_count']

        if transaction['order_total_sum'] < min_sum:
            min_sum = transaction['order_total_sum']
        if transaction['order_total_sum'] > max_sum:
            max_sum = transaction['order_total_sum']

        if transaction['risk_score_mean'] < min_risk:
            min_risk = transaction['risk_score_mean']
        if transaction['risk_score_mean'] > max_risk:
            max_risk = transaction['risk_score_mean']
    print(f"Count: {max_count}, Sum: {max_sum}, Risk: {min_risk} - {max_risk} ({len(the_transactions)})")


Count: 9100, Sum: 155327.64, Risk: 2.11 - 3.83 (10)
Count: 7210, Sum: 116863.24, Risk: 1.81 - 3.78 (10)
Count: 7675, Sum: 108626.92, Risk: 1.65 - 3.68 (10)
Count: 8793, Sum: 97853.17, Risk: 1.68 - 3.44 (10)
Count: 7598, Sum: 154561.06, Risk: 1.84 - 3.63 (10)
Count: 9274, Sum: 159470.83, Risk: 1.73 - 3.7 (10)
Count: 9542, Sum: 158665.57, Risk: 1.76 - 3.64 (10)
Count: 7552, Sum: 119952.63, Risk: 2.12 - 3.36 (10)
Count: 8452, Sum: 122066.43, Risk: 1.68 - 3.65 (10)
Count: 9803, Sum: 146875.65, Risk: 1.7 - 3.82 (10)


In [18]:
transactions_grouped[0][0]

{'lat': 25.19166666666667,
 'lon': 75.97,
 'order_date': '2023-05-01',
 'order_total_count': 10,
 'order_total_sum': 86.8,
 'risk_score_count': 10,
 'risk_score_sum': 29,
 'risk_score_mean': 2.8333333333333335}

In [22]:
transactions_grouped[1][-1]

{'order_date': '2023-05-02',
 'services__ipv4__geo': 'nan',
 'order_total_count': 50424,
 'risk_score_count': 50424,
 'order_total_sum': 124499.55,
 'risk_score_sum': 153473,
 'order_total_mean': 2.4690534269,
 'risk_score_mean': 3.0436498493,
 'key_id': 1}

In [21]:
import reverse_geocoder as rg

cache = {}
grouped_transactions_norm = {}

print(f'Processing {len(transactions_grouped[1])} transactions')
i = 0
for transaction in transactions_grouped[1]:
    i += 1
    print(f'Processing transaction {i}')
    coordinates = transaction['services__ipv4__geo'].split(',')

    if len(coordinates) != 2:
        continue
    coordinates = (float(coordinates[0]), float(coordinates[1]))

    if coordinates in cache:
        result = cache[coordinates]
    else:
      result = rg.search(coordinates)[0]
      cache[coordinates] = result

    transaction['city'] = result['admin1']

    if transaction['city'] not in grouped_transactions_norm:
        grouped_transactions_norm[transaction['city']] = []
    
    grouped_transactions_norm[transaction['city']].append(transaction)

grouped_transactions_norm
  

Processing 3053 transactions
Processing transaction 1
Processing transaction 2
Processing transaction 3
Processing transaction 4
Processing transaction 5
Processing transaction 6
Processing transaction 7
Processing transaction 8
Processing transaction 9
Processing transaction 10
Processing transaction 11
Processing transaction 12
Processing transaction 13
Processing transaction 14
Processing transaction 15
Processing transaction 16
Processing transaction 17
Processing transaction 18
Processing transaction 19
Processing transaction 20
Processing transaction 21
Processing transaction 22
Processing transaction 23
Processing transaction 24
Processing transaction 25
Processing transaction 26
Processing transaction 27
Processing transaction 28
Processing transaction 29
Processing transaction 30
Processing transaction 31
Processing transaction 32
Processing transaction 33
Processing transaction 34
Processing transaction 35
Processing transaction 36
Processing transaction 37
Processing transac

ValueError: not enough values to unpack (expected 2, got 1)

In [10]:
import pgeocode

nomi = pgeocode.Nominatim('IN')
print(nomi)

<pgeocode.Nominatim object at 0x7fd02bfc7d00>
