In [1]:
import pandas as pd

# Fetch Data

In [3]:
import requests
import pandas as pd
from datetime import datetime
from tqdm import tqdm

BINANCE_CANDLE_COLUMNS = ['opentime', 'openprice', 'highprice', 'lowprice', 'closeprice', 'volume', 'closetime',
                          'quotevolume', 'trades', 'taker_buy_volume', 'taker_buy_quote', 'unused']

def binance_recursive_fetch_2(coins, interval, starttime, endtime, data_type='spot'):
    all_coins_result = {}
    data_list = []
    call_dict = {}
    
    for coin in tqdm(coins):
        result_list = []
        current_time = starttime
        call = 0
        timestamps = []
        
        while current_time < endtime:
            if ((int((endtime - current_time) / (1000 * 60))) + 1) >= 1000:
                limit = 1000
            else:
                limit = int((endtime - current_time) / (1000 * 60) + 1)
            
            if data_type == 'spot':
                url = (f'https://api.binance.com/api/v3/klines'
                       f'?symbol={coin}USDT'
                       f'&startTime={str(current_time)}'
                       f'&interval={interval}'
                       f'&limit={str(limit)}')
            elif data_type == 'futures':
                url = (f'https://fapi.binance.com/fapi/v1/klines'
                       f'?symbol={coin}USDT'
                       f'&startTime={str(current_time)}'
                       f'&interval={interval}'
                       f'&limit={str(limit)}')
            
            result_list += requests.get(url).json()
            
            if result_list:
                # Update current_time with the timestamp of the last data point fetched, plus 1 minute (60000 ms)
                current_time = result_list[-1][0] + 60000
                timestamps.append(current_time)
                call += 1
                
                # Check if the last fetched timestamp is greater than or equal to endtime
                if current_time >= endtime:
                    print(f"Reached endtime at {datetime.fromtimestamp(current_time / 1000).strftime('%Y-%m-%d %H:%M:%S')}. Stopping fetch.")
                    break
                
                print((datetime.fromtimestamp(current_time / 1000).strftime('%Y-%m-%d %H:%M:%S')) + 
                      f' status : {current_time < endtime}, time : {current_time}, limit : {call * 2}')
            
            # Ensure there's no continuous fetching of the same timestamp
            if len(timestamps) > 1 and timestamps[-1] == timestamps[-2]:
                print("Duplicate timestamp detected. Stopping fetch.")
                break
            
            # Sleep if needed to avoid rate limiting (adjust based on your rate limit)
            # time.sleep(0.1)  # Uncomment if needed
            
        current_df = pd.DataFrame(result_list, columns=BINANCE_CANDLE_COLUMNS)
        current_df['coin'] = coin
        current_df = current_df[['coin'] + BINANCE_CANDLE_COLUMNS]
        current_df = current_df.values.tolist()
        
        data_list += current_df
        call_dict.update({coin: call})
    
    return {'data': data_list, 'call': call_dict}

# Set endtime to the current time (today)
endtime = int(datetime.utcnow().timestamp() * 1000)

# # Example usage with today's date as the end time
# sample_spot = binance_recursive_fetch_2(
#     ['USDT'],
#     '1m',
#     starttime=int(pd.to_datetime('2023-01-01 00:00', utc=True).timestamp() * 1000),
#     endtime=endtime,
#     data_type='spot'  # Fetch spot data
# )

# print(sample_spot['data'])

  endtime = int(datetime.utcnow().timestamp() * 1000)


# Read Prediction

In [48]:
# Read prediction
df = pd.read_csv('/home/ubuntu/Charles/predict_near_volatility/csv/live_test_predictions_csv/live_test_NEAR.csv')

df['opentime'] = pd.to_datetime(df['opentime'])
df['opentime'] = df['opentime'].astype('int64') // 10**6  # Convert to milliseconds

# Define the starttime and endtime
starttime = 1735624800000  # 1 hour earlier from the original starttime
endtime = 1735862400000    # 12 hours earlier from the original endtime

# Filter the DataFrame based on the 'opentime' column
df = df[(df['opentime'] <= endtime)]

# Show the filtered DataFrame
df = df.tail(48)

df

Unnamed: 0,y_pred_logreg,y_pred_linreg,opentime
66,0,0.00974,1735693200000
67,0,0.009237,1735696800000
68,0,0.009514,1735700400000
69,0,0.009646,1735704000000
70,0,0.009053,1735707600000
71,0,0.009215,1735711200000
72,0,0.008741,1735714800000
73,0,0.011442,1735718400000
74,0,0.012486,1735722000000
75,0,0.011418,1735725600000


# Fetch Data

In [49]:
# Fetch the data from Binance using the updated time range
sample = binance_recursive_fetch_2(
    ['NEAR'],
    '1h',
    starttime=int(1735693200000),
    endtime = int(datetime.utcnow().timestamp() * 1000),
    data_type='futures'  # Fetch spot/futures data
)

# Define the column names for the DataFrame based on the Binance API response structure
columns = ['coin', 'opentime', 'openprice', 'highprice', 'lowprice', 'closeprice', 'volume', 'closetime', 
           'quotevolume', 'trades', 'taker_buy_volume', 'taker_buy_quote', 'unused']

# Convert the list of data into a DataFrame
df_actual = pd.DataFrame(sample['data'], columns=columns)

# Filter the DataFrame to include only data where opentime <= the endtime (both in Unix time)
df_actual = df_actual[df_actual['opentime'] <= 1735884000000]

# Ensure the 'highprice' and 'lowprice' columns are numeric
df_actual['highprice'] = pd.to_numeric(df_actual['highprice'], errors='coerce')
df_actual['lowprice'] = pd.to_numeric(df_actual['lowprice'], errors='coerce')

# Calculate volatility as the percentage difference between high and low prices
df_actual['vol'] = (df_actual['highprice'] - df_actual['lowprice']) / df_actual['lowprice']

# Calculate the 10-hour rolling average of volatility, shifted by 10 rows
df_actual['next_10h_avg_vol'] = df_actual['vol'].shift(10).rolling(window=10).mean()

# Keep only the 'opentime' and 'next_10h_avg_vol' columns
df_actual = df_actual[['opentime', 'next_10h_avg_vol']]

df_actual.dropna(inplace=True)

# Perform the inner join on 'opentime' column
df_combined = pd.merge(df_actual, df, on='opentime', how='inner')

# Keep the relevant columns
df_combined = df_combined[['opentime', 'next_10h_avg_vol', 'y_pred_linreg', 'y_pred_logreg']]

# Display the resulting DataFrame
df_combined

from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

# Extract the actual and predicted values
y_actual = df_combined['next_10h_avg_vol']
y_pred_linreg = df_combined['y_pred_linreg']

# Calculate MAE, MAPE, and R² for the linear regression model predictions
mae_linreg = mean_absolute_error(y_actual, y_pred_linreg)
mape_linreg = mean_absolute_percentage_error(y_actual, y_pred_linreg)
r2_linreg = r2_score(y_actual, y_pred_linreg)

# Print the results
print(f"Linear Regression Model - MAE: {mae_linreg}, MAPE: {mape_linreg * 100}%, R²: {r2_linreg}")

  endtime = int(datetime.utcnow().timestamp() * 1000),
100%|██████████| 1/1 [00:00<00:00,  5.25it/s]

2025-01-03 07:01:00 status : True, time : 1735887660000, limit : 2
2025-01-03 07:01:00 status : True, time : 1735887660000, limit : 4
Duplicate timestamp detected. Stopping fetch.
Linear Regression Model - MAE: 0.0021868618883068703, MAPE: 15.724715981577214%, R²: -1.524015308497444





Unnamed: 0,opentime,next_10h_avg_vol,y_pred_linreg,y_pred_logreg
0,1735761600000,0.009183,0.011235,0
1,1735765200000,0.009714,0.010342,0
2,1735768800000,0.011012,0.010697,0
3,1735772400000,0.011853,0.011024,0
4,1735776000000,0.011829,0.012572,0
5,1735779600000,0.012615,0.012377,0
6,1735783200000,0.012848,0.011461,0
7,1735786800000,0.013632,0.011058,0
8,1735790400000,0.01481,0.011107,0
9,1735794000000,0.01545,0.010781,0


In [35]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

# Extract the actual and predicted values
y_actual = df_combined['next_10h_avg_vol']
y_pred_linreg = df_combined['y_pred_linreg']

# Calculate MAE, MAPE, and R² for the linear regression model predictions
mae_linreg = mean_absolute_error(y_actual, y_pred_linreg)
mape_linreg = mean_absolute_percentage_error(y_actual, y_pred_linreg)
r2_linreg = r2_score(y_actual, y_pred_linreg)

# Print the results
print(f"Linear Regression Model - MAE: {mae_linreg}, MAPE: {mape_linreg * 100}%, R²: {r2_linreg}")

Linear Regression Model - MAE: 0.0021868618883068703, MAPE: 15.724715981577214%, R²: -1.524015308497444
