In [1]:
import pandas as pd
from pymongo import MongoClient

# 1. Connect to your warehouse
# Open a client against the local MongoDB instance, then resolve the database
# and the collection that holds the raw transaction documents.
client = MongoClient("mongodb://localhost:27017/")
db = client.get_database("ethos_db")
collection = db.get_collection("raw_transactions")

def extract_features(records=None, output_path="wallet_features.csv"):
    """Build per-wallet behavioral features from raw transaction records.

    Transactions are grouped by their 'from' field and summarised into one
    row per sender. The resulting table is printed (first 10 rows), written
    to ``output_path`` as CSV, and returned.

    Args:
        records: Optional iterable of transaction dicts, each carrying the
            keys 'from', 'hash', 'value' and 'gasPrice'. When omitted, the
            documents are read from the module-level ``collection``.
        output_path: Destination of the CSV file. Defaults to
            "wallet_features.csv" in the current working directory.

    Returns:
        A DataFrame indexed by 'from' with the columns ``tx_count``,
        ``avg_value``, ``value_std``, ``avg_gas`` and ``is_bot_suspicion``,
        or ``None`` when there are no records to process (nothing is
        written in that case).

    Raises:
        KeyError: If any of the required fields is absent from the records.
    """
    required_fields = ['from', 'hash', 'value', 'gasPrice']

    # Load data from Mongo into Pandas
    if records is None:
        # Fetch only the fields the aggregation needs instead of whole documents.
        projection = {name: 1 for name in required_fields}
        projection['_id'] = 0
        records = collection.find({}, projection)

    data = list(records)
    if not data:
        print("Empty database! Run load_data.py first.")
        return None

    df = pd.DataFrame(data)

    # Fail early with a readable message rather than an error from deep inside pandas.
    missing = [name for name in required_fields if name not in df.columns]
    if missing:
        raise KeyError(f"Transaction records are missing required field(s): {missing}")

    # 2. Group by 'from' (the Wallet Address)
    # We want to see how EACH wallet behaves. Named aggregation ties every
    # output column to its source column, so no positional rename is needed.
    wallet_stats = df.groupby('from').agg(
        tx_count=('hash', 'count'),      # Total transactions (Frequency)
        avg_value=('value', 'mean'),     # Average money sent
        value_std=('value', 'std'),      # Variation in money sent
        avg_gas=('gasPrice', 'mean'),    # Average gas paid (Urgency)
    )

    # 3. Apply "Ethos" Logic
    # If std (standard deviation) is 0, they send the EXACT same amount every time = BOT behavior.
    # Wallets with a single transaction have an undefined (NaN) std; NaN is
    # not equal to 0, so they are labelled "LOW".
    wallet_stats['is_bot_suspicion'] = (
        wallet_stats['value_std'].eq(0).map({True: "HIGH", False: "LOW"})
    )

    print("📊 Behavioral Analysis Complete!")
    print(wallet_stats.head(10))

    # Save this for our ML model later
    wallet_stats.to_csv(output_path)
    print(f"💾 Features saved to {output_path}")

    return wallet_stats

# Run the extraction only when this code executes as the main program
# (a script or a notebook cell), not when it is imported from another module.
if __name__ == "__main__":
    extract_features()

📊 Behavioral Analysis Complete!
                                            tx_count     avg_value  \
from                                                                 
0x0000000000001EfE53a797754F094cAF01bf92C7         2  7.881854e+08   
0x0000000000008359421A71a3d1f8c72921C8504D         1  7.241728e+07   
0x00000000000124d994209fbB955E0217B5C2ECA1         1  5.145168e+14   
0x000000000121D39fc0E48F7aa4fa1fbb819c377c         1  0.000000e+00   
0x00000027F490ACeE7F11ab5fdD47209d6422C5a7         2  1.390000e+02   
0x000000AEd6068D981D76641494752acF64012032         1  0.000000e+00   
0x00003f0B7045822689cE3A6118846218A2C22cFd         2  1.943501e+14   
0x0000fDF3aF21c329851De8E981178928e472Edd9         1  1.262663e+14   
0x000342A2CD2695b7608b4c3a1bE5bcdb1b6C8D4e         1  1.000000e+00   
0x000568B3dfE407a9C6b902d53bDF7278fF8C593F         1  0.000000e+00   

                                               value_std       avg_gas  \
from                                              