# *1. Load data and initial preprocessing* 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import sqlite3


# Cambiar directorio a carpeta del proyecto
os.chdir('C:/Python/my_projects/dexboost/')
print(os.getcwd())

def conectar_db(ruta_db: str):
    try:
        conn = sqlite3.connect(ruta_db)
        print("Conexión exitosa a la base de datos.")
        return conn
    except sqlite3.Error as e:
        print(f"Error al conectar con la base de datos: {e}")
        raise

def cerrar_conexion(conn):
    try:
        if conn:
            conn.close()
            print("Conexión cerrada correctamente.")
    except sqlite3.Error as e:
        print(f"Error al cerrar la conexión: {e}")
        raise

def cargar_tabla(conn, tabla):
    try:
        query = f"SELECT * FROM {tabla}"
        df = pd.read_sql(query, conn)
        return df
    except sqlite3.DatabaseError as e:
        print(f"Error al cargar la tabla '{tabla}': {e}")
        raise

# Conexión a la base de datos
conn = conectar_db('data/main_2025-02-22_13-37-47.db')

# Cargar tabla 'Analysis'
data = cargar_tabla(conn, 'analysisLiquidityPool')

# Cerrar conexión

cerrar_conexion(conn)

C:\Python\my_projects\dexboost
Conexión exitosa a la base de datos.
Conexión cerrada correctamente.


*Here we separate into 2 different dataframe, regarding IsLP column*

In [3]:
def initial_processing(df):
    """
    Preprocessing of analysisLiquidityPool table to be able to have quality data. 
    Separates Boost data and LP data regarding IsLP column. Resulting in 2 different DataFrames
    
    Args:
        - df (pd.DataFrame): Initial data to be processed.
    
    Returns:
        - Liquidity df (pd.DataFrame): data with LP creations
        - Boosts df (pd.DataFrame): data with Boosts
    """

    df = df.copy()
    
    # Drop ID column if exists
    df.drop(columns='id', errors='ignore', inplace=True)

    # Reset index
    df.reset_index(drop=True, inplace=True)
    
    # Format initial columns
    df['DetectedAt'] = pd.to_datetime(df['DetectedAt'].astype(str).str[:19])
    df['IsLP'] = df['IsLP'].astype(bool)
    df['IsPump'] = df['IsPump'].astype(bool)
    df['TokenName'] = df['TokenName'].astype(str)
    df['TokenMint'] = df['TokenMint'].astype(str)
    df['TotalLiquidity'] = df['TotalLiquidity'].astype(int)
    df['TotalLPProviders'] = df['TotalLPProviders'].astype(int)
    df['RugScore'] = df['RugScore'].astype(int)

    # Separate into different datasets
    df_creation_lp = df[df['IsLP'] == True]
    df_boost = df[df['IsLP'] == False]
    
    return df_creation_lp, df_boost


df_lp, df_boost = initial_processing(data)


KeyError: 'IsLP'

In [16]:
def parse_price_history(df, tp=50, sl=-40):
    """
    Expands the PriceHistory column into multiple rows while retaining original columns.
    Calculates price variation percentage and time since the boost.
    """

    df = df.copy()

    # Ensure BoostTime is in datetime format without timezone
    df['DetectedAt'] = pd.to_datetime(df['DetectedAt'], errors='coerce').dt.tz_localize(None)

    # Clean and format PriceHistory column before JSON parsing
    df['PriceHistory'] = df['PriceHistory'].astype(str).str.replace(r'\\"', '"', regex=True).str.strip('"')

    # Remove NaN and empty values from PriceHistory
    df = df[df['PriceHistory'].notna() & (df['PriceHistory'] != 'nan') & (df['PriceHistory'] != '')]

    # Parse PriceHistory from string to JSON
    df['PriceHistory'] = df['PriceHistory'].apply(json.loads)

    # Remove rows where PriceHistory is an empty list
    df = df[df['PriceHistory'].apply(lambda x: isinstance(x, list) and len(x) > 0)]

    # Expand PriceHistory into separate rows while retaining original columns
    df_expanded = df.explode('PriceHistory').reset_index(drop=True)

    if df_expanded.empty:
        return pd.DataFrame(columns=['TokenMint', 'TokenName', 'DetectedAt','price', 'PriceVariation_%', 'Trigger', 'TimeSinceBoostStart'])

    # Normalize the JSON PriceHistory column into separate columns
    price_data = pd.json_normalize(df_expanded['PriceHistory'])

    if price_data.empty or 'price' not in price_data.columns or 'time' not in price_data.columns:
        return pd.DataFrame(columns=['TokenMint', 'TokenName', 'PriceTime', 'price', 'Trigger', 'TimeSinceBoostStart'])

    # Rename extracted columns
    price_data.columns = ['price', 'PriceTime']

    # Convert PriceTime and price to correct data types
    price_data['PriceTime'] = pd.to_datetime(price_data['PriceTime'], errors='coerce').dt.tz_localize(None)
    price_data['price'] = pd.to_numeric(price_data['price'], errors='coerce')

    # Merge processed data with original DataFrame
    df_expanded = df_expanded.drop(columns=['PriceHistory']).reset_index(drop=True)
    df_expanded = pd.concat([df_expanded, price_data], axis=1)

    # Ensure PriceHistory is correctly sorted before computing TimeSinceBoostStart
    df_expanded = df_expanded.sort_values(by=['TokenMint', 'PriceTime'], ascending=True)

    # Calculate time difference since boost (in minutes)
    df_expanded['TimeSinceBoostStart'] = (df_expanded['PriceTime'] - df_expanded['DetectedAt']).dt.total_seconds()
    df_expanded['TimeSinceBoostStart'] = df_expanded['TimeSinceBoostStart'].fillna(0)  # Fill NaN values with 0
    df_expanded['TimeSinceBoostStart'] = df_expanded['TimeSinceBoostStart'].clip(lower=0)  # Clip negative values
    df_expanded['TimeSinceBoostStart'] = df_expanded['TimeSinceBoostStart'].astype('Int64')  # Use Int64 which can handle NaN

    # Calculate price variation percentage
    df_expanded['PriceVariation_%'] = ((df_expanded['price'] - df_expanded['StartPrice']) / df_expanded['StartPrice']) * 100
    df_expanded['PriceVariation_%'] = df_expanded['PriceVariation_%'].round(2)

    # Assign triggers (Take Profit / Stop Loss)
    df_expanded['Trigger'] = np.select(
        [df_expanded['PriceVariation_%'] >= tp, df_expanded['PriceVariation_%'] <= sl],
        ['TP', 'SL'], default='No event'
    )

    # Select relevant columns and define data types
    cols = ['TokenMint', 'TokenName', 'price', 'PriceVariation_%', 'Trigger', 'TimeSinceBoostStart']

    dtypes = {
        "TokenMint": 'str',
        "TokenName": 'str',
        "price": 'float32',
        "TimeSinceBoostStart": 'int64',
        "Trigger": 'str'
    }

    return df_expanded[cols].astype(dtypes).reset_index(drop=True)

df_to_parse = df_analysis.copy()
df_to_parse = df_to_parse[
    (df_to_parse['StartLiquidityUSD'] > 10000) &
    (df_to_parse['RugScore'] <= 1000)
]

price_evo_lp = parse_price_history(df_to_parse)

  price_data['PriceTime'] = pd.to_datetime(price_data['PriceTime'], errors='coerce').dt.tz_localize(None)


KeyError: 'StartPrice'