<a href="https://colab.research.google.com/github/saecula/notebooks/blob/main/markets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd;

In [None]:
# Load the markets CSV (path under /content/drive, i.e. a Drive mount in Colab).
df = pd.read_csv(
    '/content/drive/MyDrive/data/manifold/markets.csv',
)

In [None]:
# Lift the column limit so printed frames show every column, then preview the first rows.
pd.set_option('display.max_columns', None)
print(df.head(n=5))

In [None]:
# format
# Render floats without decimals in every pandas repr printed from here on.
pd.options.display.float_format = '{:.0f}'.format

# Coerce the liquidity / volume / pool columns to numbers; values that cannot be
# parsed become NaN (errors='coerce'). Each column is converted from its own
# values, so pool_no keeps its own data rather than a copy of pool_yes.
for column in ('totalLiquidity', 'volume', 'pool_yes', 'pool_no'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# DataFrame.info() prints its report itself and returns None, so it is called
# on its own line instead of being passed to print().
print('info')
df.info()

In [None]:
# filter

# pattern: filtered_df = df[df['field'] == 'value']

# Build the date mask from a local Series so df itself is not modified here.
# A numeric createdTime is interpreted as epoch milliseconds (the histogram cell
# converts the same column with unit='ms'); ordering comparisons between raw
# numbers and date strings would raise a TypeError.
created_time = df['createdTime']
if pd.api.types.is_numeric_dtype(created_time):
    created_time = pd.to_datetime(created_time, unit='ms')

# Half-open window [2023-01-01, 2023-07-01): an upper bound of "<= '2023-06-30'"
# ends at midnight and would drop nearly everything created on June 30.
in_window = (created_time >= '2023-01-01') & (created_time < '2023-07-01')
january_2023_to_june_2023 = df[in_window]
outcometype_binary = january_2023_to_june_2023[january_2023_to_june_2023['outcomeType'] == 'BINARY']

# Keep only the id plus the liquidity / volume / pool columns.
activity = outcometype_binary[['id', 'totalLiquidity', 'volume', 'pool_yes', 'pool_no']]

print('stats:')
print(activity.describe())

# Markets with more than 100 volume are treated as "active".
# caps at 999? NOTE(review): possibly values >= 1000 are written with thousands
# separators in the CSV and turned into NaN by the errors='coerce' conversion
# in the format cell -- confirm against the raw file.
markets = activity[activity['volume'] > 100]

print('active markets:')
print(markets.describe())

In [None]:
import matplotlib.pyplot as plt
import datetime

In [None]:
# a date histogram showing markets by createdTime, bucketed in 30-day increments.

BUCKET_SECONDS = 3600 * 24 * 30  # length of one bucket: 30 days

# createdTime is parsed as epoch milliseconds, then mapped to an integer bucket
# number (count of whole 30-day spans since the epoch).
df['createdTime'] = pd.to_datetime(df['createdTime'], unit='ms')
df['createdTime_seconds'] = df['createdTime'].apply(lambda ts: ts.timestamp())
df['createdTime_bucket'] = (df['createdTime_seconds'] // BUCKET_SECONDS).astype(int)

# Count markets per bucket, then add the buckets that contain no markets so the
# bars stay evenly spaced in time and every bar lines up with its label.
bucket_counts = df.groupby('createdTime_bucket')['id'].count()
first_bucket = bucket_counts.index.min()
last_bucket = bucket_counts.index.max()
bucket_counts = bucket_counts.reindex(range(first_bucket, last_bucket + 1), fill_value=0)

# Label each bar with the month in which its own 30-day bucket starts. Labels
# taken from a calendar-month range do not match 30-day buckets one-to-one, so
# they are derived from the bucket numbers instead.
bucket_starts = pd.to_datetime(bucket_counts.index.to_numpy() * BUCKET_SECONDS, unit='s')

# Plot the data
bucket_counts.plot(kind='bar')

# Set the x-axis tick labels (one per bar)
plt.xticks(range(len(bucket_counts)), bucket_starts.strftime('%b %y'))

plt.xlabel('Created Time (30-day buckets)')
plt.ylabel('Number of Markets')
plt.show()


In [None]:
# Distribution of volume across the active markets, drawn as a 50-bin histogram.
volume = markets.loc[:, 'volume']

plt.hist(x=volume, bins=50)
plt.ylabel('Frequency')
plt.xlabel('Total Volume')
plt.show()


In [None]:
import json
import os

In [None]:
# Directory containing the bets JSON files.
json_dir = '/content/drive/MyDrive/data/manifold/bets'

# Ids of the active markets; bets are later restricted to these.
market_ids = markets['id'].to_list()

# Bet fields kept when a file is loaded.
columns_to_include = [
    'id',
    'createdTime',
    'userId',
    'contractId',
    'amount',
    'shares',
    'outcome',
    'visibility',
    'isFilled',
    'isCancelled',
    'isChallenge',
    'orderAmount',
    'probBefore',
    'probAfter',
]

def load_bets_from_json(file_path, ids, columns=None):
    """Load one bets JSON file and keep only the bets placed on the given markets.

    Parameters
    ----------
    file_path : str
        Path to a JSON file; its content is passed to ``pd.DataFrame``, so it
        is expected to be a list of bet records.
    ids : list-like
        Market ids. A bet is kept when its 'contractId' is one of them.
    columns : list of str, optional
        Columns to keep, in this order; must include 'contractId'. Defaults to
        the module-level ``columns_to_include``.

    Returns
    -------
    pandas.DataFrame
        The matching bets restricted to ``columns``. A requested column that
        does not occur in the file is reported and filled with NaN; an empty
        file yields an empty frame with the requested columns.
    """
    if columns is None:
        columns = columns_to_include

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        bets_json = json.load(f)

    # print first bet (an empty file has none to show)
    print('new file, first bet:')
    if bets_json:
        print(bets_json[0])
    else:
        print('(no bets in file)')

    df_raw = pd.DataFrame(bets_json)

    print(df_raw.columns)

    # Report absent fields explicitly instead of failing with a KeyError.
    missing = [name for name in columns if name not in df_raw.columns]
    if missing:
        print('columns missing from', file_path, '->', missing)

    # reindex keeps the requested column order and adds absent columns as NaN.
    df_bets = df_raw.reindex(columns=columns)
    df_market_filtered = df_bets[df_bets['contractId'].isin(ids)]
    return df_market_filtered

# load files that end with json and contain string '2023'
# os.listdir returns names in arbitrary order, so they are sorted to give the
# combined frame a reproducible row order.
bets_dfs = []
for file_name in sorted(os.listdir(json_dir)):
    if file_name.endswith('.json') and '2023' in file_name:
        file_path = os.path.join(json_dir, file_name)
        # Collect the per-file frame directly; the name bets_df is reserved for
        # the combined result below.
        bets_dfs.append(load_bets_from_json(file_path, market_ids))

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the directory instead.
if not bets_dfs:
    raise FileNotFoundError(f"no '.json' files containing '2023' found in {json_dir}")

bets_df = pd.concat(bets_dfs, ignore_index=True)

In [None]:
# Print a summary of the concatenated bets frame.
bets_df.info()