# Check whether the our prices pass common sense tests. 


This notebook checks a file of prices for the entire universe to see whether the are crossovers (i.e., bid side is greater than the offered side) and other anomalies.

In [1]:
import pandas as pd
import os
import pandas as pd
import pickle
import redis
import tqdm
from google.cloud import bigquery

In [2]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/jupyter/creds.json"
bq_client = bigquery.Client()
project = "eng-reactor-287421"

In [3]:
%%time
import gcsfs
import pandas as pd

# Create a GCS file system instance
fs = gcsfs.GCSFileSystem(project='eng-reactor-287421')

# Read the first CSV file
with fs.open('gs://enitre-universe-project/20230930_ficc_25k_100k_1000k_5000k.csv') as f:
    data = pd.read_csv(f)
columns_with_ytw = [col for col in data.columns if 'ytw' in col]

# Apply the conversion to numeric with 'coerce' to each column
for col in columns_with_ytw:
    data[col] = pd.to_numeric(data[col], errors='coerce')

CPU times: user 18.2 s, sys: 1.23 s, total: 19.4 s
Wall time: 30.2 s


In [4]:
data.head()

Unnamed: 0,cusip,25_sale_to_customer_price,25_sale_to_customer_ytw,25_yield_to_worst_date,25_purchase_from_customer_price,25_purchase_from_customer_ytw,100_sale_to_customer_price,100_sale_to_customer_ytw,100_yield_to_worst_date,100_purchase_from_customer_price,...,1000_sale_to_customer_ytw,1000_yield_to_worst_date,1000_purchase_from_customer_price,1000_purchase_from_customer_ytw,5000_sale_to_customer_price,5000_sale_to_customer_ytw,5000_yield_to_worst_date,5000_purchase_from_customer_price,5000_purchase_from_customer_ytw,yield_to_worst_date
0,00036PAE2,One or more of the following dates necessary t...,,One or more of the following dates necessary t...,One or more of the following dates necessary t...,,One or more of the following dates necessary t...,,One or more of the following dates necessary t...,One or more of the following dates necessary t...,...,,One or more of the following dates necessary t...,One or more of the following dates necessary t...,,One or more of the following dates necessary t...,,One or more of the following dates necessary t...,One or more of the following dates necessary t...,,
1,000379BY2,100.666,3.723,03-01-2024,100.266,4.698,100.653,3.754,03-01-2024,100.273,...,,The quantity attempting to be priced is larger...,The quantity attempting to be priced is larger...,,The quantity attempting to be priced is larger...,,The quantity attempting to be priced is larger...,The quantity attempting to be priced is larger...,,
2,000379BZ9,101.763,,,100.82,,101.839,,,101.047,...,,,101.323,,The quantity attempting to be priced is larger...,,The quantity attempting to be priced is larger...,The quantity attempting to be priced is larger...,,
3,000379CB1,100.538,3.786,03-01-2024,100.286,4.401,100.539,3.785,03-01-2024,100.297,...,,The quantity attempting to be priced is larger...,The quantity attempting to be priced is larger...,,The quantity attempting to be priced is larger...,,The quantity attempting to be priced is larger...,The quantity attempting to be priced is larger...,,
4,000379CC9,100.633,3.68,03-01-2024,100.403,4.238,100.642,3.658,03-01-2024,100.42,...,,The quantity attempting to be priced is larger...,The quantity attempting to be priced is larger...,,The quantity attempting to be priced is larger...,,The quantity attempting to be priced is larger...,The quantity attempting to be priced is larger...,,


In [10]:
column_name = "25_sale_to_customer_ytw"

# Step 3: Create a boolean mask for NaN values in the specified column
nan_mask = data[column_name].isna()

# Step 4: Count the number of NaN values in the column
nan_count = nan_mask.sum()

# Step 5: Calculate the percentage of NaN values
percentage_nan = (nan_count / len(data)) * 100

print(f"Percentage of NaN values in '{column_name}': {percentage_nan:.2f}%")

Percentage of NaN values in '25_sale_to_customer_price': 8.98%


In [5]:
data.columns

Index(['cusip', '25_sale_to_customer_price', '25_sale_to_customer_ytw',
       '25_yield_to_worst_date', '25_purchase_from_customer_price',
       '25_purchase_from_customer_ytw', '100_sale_to_customer_price',
       '100_sale_to_customer_ytw', '100_yield_to_worst_date',
       '100_purchase_from_customer_price', '100_purchase_from_customer_ytw',
       '1000_sale_to_customer_price', '1000_sale_to_customer_ytw',
       '1000_yield_to_worst_date', '1000_purchase_from_customer_price',
       '1000_purchase_from_customer_ytw', '5000_sale_to_customer_price',
       '5000_sale_to_customer_ytw', '5000_yield_to_worst_date',
       '5000_purchase_from_customer_price', '5000_purchase_from_customer_ytw',
       'yield_to_worst_date'],
      dtype='object')

## Here we check whether there are crossed prices.

In [None]:
def convert_to_numeric(df, col_name):
    df[col_name] = pd.to_numeric(df[col_name], errors='coerce')

# Convert relevant columns to numeric with 'coerce'
for col in data.columns:
    if '_ytw' in col or '_price' in col:
        convert_to_numeric(data, col)

# Quantities to check
quantities_to_check = [25, 100, 1000, 5000]

for quantity in quantities_to_check:
    purchase_col = f"{quantity}_purchase_from_customer_price"
    sale_col = f"{quantity}_sale_to_customer_price"

    if purchase_col in data.columns and sale_col in data.columns:
        mask = (data[purchase_col] > data[sale_col])
        #mask = data[data[purchase_col] < data[sale_col]]
        if mask.any():
            print(f"For Quantity {quantity}, there are cases where purchase from customer price is higher than sale to customer price.")
        else:
            print(f"For Quantity {quantity}, there are no cases where purchase from customer price is higher than sale to customer price.")
    else:
        print(f"Quantity {quantity} columns not found.")

For Quantity 25, there are no cases where purchase from customer price is higher than sale to customer price.
For Quantity 100, there are no cases where purchase from customer price is higher than sale to customer price.
For Quantity 1000, there are no cases where purchase from customer price is higher than sale to customer price.
For Quantity 5000, there are no cases where purchase from customer price is higher than sale to customer price.


## Here we check whether there are crossed yields. 

In [7]:
for quantity in quantities_to_check:
    purchase_ytw_col = f"{quantity}_purchase_from_customer_ytw"
    sale_ytw_col = f"{quantity}_sale_to_customer_ytw"
    purchase_price_col = f"{quantity}_purchase_from_customer_price"
    sale_price_col = f"{quantity}_sale_to_customer_price"
    
    print_count = 0  # Initialize a count of printed instances
    total_cases = 0  # Initialize a count of total cases
    
    if purchase_ytw_col in data.columns and sale_ytw_col in data.columns:
        mask = (data[purchase_ytw_col] != -1) & (data[purchase_ytw_col] <= data[sale_ytw_col])
        total_cases = mask.sum()  # Count total cases
        
        if mask.any():
            print(f"For Quantity {quantity}, there are {total_cases} cases ({(total_cases / len(data)) * 100:.2f}% of total) where purchase from customer YTW is not higher than sale to customer YTW (ignoring -1 values):")
            # Print the relevant rows with cusip, YTW, and price values
            for index, row in data[mask].iterrows():
                if print_count >= 10:
                    break  # Stop printing after 10 instances
                cusip = row['cusip']
                purchase_ytw = row[purchase_ytw_col]
                sale_ytw = row[sale_ytw_col]
                purchase_price = row[purchase_price_col]
                sale_price = row[sale_price_col]
                print(f"Cusip: {cusip}, Purchase YTW: {purchase_ytw}, Sale YTW: {sale_ytw}, Purchase Price: {purchase_price}, Sale Price: {sale_price}")
                print_count += 1  # Increment the count of printed instances
        else:
            print(f"For Quantity {quantity}, purchase from customer YTW is always higher than sale to customer YTW (ignoring -1 values).")
    else:
        print(f"Quantity {quantity} columns not found.")


For Quantity 25, there are 481 cases (0.05% of total) where purchase from customer YTW is not higher than sale to customer YTW (ignoring -1 values):
Cusip: 010268CH1, Purchase YTW: 4.076, Sale YTW: 4.076, Purchase Price: 106.192, Sale Price: 106.193
Cusip: 01026CAA9, Purchase YTW: 4.776, Sale YTW: 4.776, Purchase Price: 94.396, Sale Price: 94.396
Cusip: 010869KQ7, Purchase YTW: 4.686, Sale YTW: 10.001, Purchase Price: 100.004, Sale Price: 100.019
Cusip: 01179RC35, Purchase YTW: 4.203, Sale YTW: 4.203, Purchase Price: 104.184, Sale Price: 104.185
Cusip: 013572NE8, Purchase YTW: 3.736, Sale YTW: 3.736, Purchase Price: 105.532, Sale Price: 105.532
Cusip: 013842X95, Purchase YTW: 5.453, Sale YTW: 5.453, Purchase Price: 71.488, Sale Price: 71.494
Cusip: 016249PF5, Purchase YTW: 4.532, Sale YTW: 4.532, Purchase Price: 90.486, Sale Price: 90.487
Cusip: 035438HG8, Purchase YTW: 4.593, Sale YTW: 4.593, Purchase Price: 88.039, Sale Price: 88.042
Cusip: 04052BJM7, Purchase YTW: 6.612, Sale YTW: 6

## Here we check for the case where yield and price are both lower than yield and price for another trade direction.  Logically this should not be possible: the lower the yield, the higher the price and vice versa. 

In [8]:
for quantity in quantities_to_check:
    purchase_ytw_col = f"{quantity}_purchase_from_customer_ytw"
    sale_ytw_col = f"{quantity}_sale_to_customer_ytw"
    purchase_price_col = f"{quantity}_purchase_from_customer_price"
    sale_price_col = f"{quantity}_sale_to_customer_price"
    
    print_count = 0  # Initialize a count of printed instances
    total_cases = 0  # Initialize a count of total cases
    
    if purchase_ytw_col in data.columns and sale_ytw_col in data.columns and purchase_price_col in data.columns and sale_price_col in data.columns:
        mask = (data[purchase_ytw_col] < data[sale_ytw_col]) & (data[purchase_price_col] < data[sale_price_col])
        total_cases = mask.sum()  # Count total cases
        
        if total_cases > 0:
            print(f"For Quantity {quantity}, there are {total_cases} cases ({(total_cases / len(data)) * 100:.2f}% of total) where both purchase from customer YTW and price are lower than sale to customer YTW and price:")
            # Print the relevant rows with cusip, YTW, and price values
            for index, row in data[mask].iterrows():
                if print_count >= 10:
                    break  # Stop printing after 10 instances
                cusip = row['cusip']
                purchase_ytw = row[purchase_ytw_col]
                sale_ytw = row[sale_ytw_col]
                purchase_price = row[purchase_price_col]
                sale_price = row[sale_price_col]
                print(f"Cusip: {cusip}, Purchase YTW: {purchase_ytw}, Sale YTW: {sale_ytw}, Purchase Price: {purchase_price}, Sale Price: {sale_price}")
                print_count += 1  # Increment the count of printed instances
        else:
            print(f"For Quantity {quantity}, no cases found where both purchase from customer YTW and price are lower than sale to customer YTW and price.")
    else:
        print(f"Quantity {quantity} columns not found.")

For Quantity 25, there are 91 cases (0.01% of total) where both purchase from customer YTW and price are lower than sale to customer YTW and price:
Cusip: 010869KQ7, Purchase YTW: 4.686, Sale YTW: 10.001, Purchase Price: 100.004, Sale Price: 100.019
Cusip: 04052BPC2, Purchase YTW: 9.562, Sale YTW: 10.287, Purchase Price: 90.881, Sale Price: 92.543
Cusip: 041806F62, Purchase YTW: 8.522, Sale YTW: 10.358, Purchase Price: 91.723, Sale Price: 98.091
Cusip: 072325AG8, Purchase YTW: 9.77, Sale YTW: 10.569, Purchase Price: 55.577, Sale Price: 60.14
Cusip: 073843AB0, Purchase YTW: 8.552, Sale YTW: 10.657, Purchase Price: 100.011, Sale Price: 100.017
Cusip: 08451PAV3, Purchase YTW: 9.888, Sale YTW: 10.073, Purchase Price: 46.844, Sale Price: 47.761
Cusip: 084538HU4, Purchase YTW: 9.829, Sale YTW: 11.018, Purchase Price: 83.577, Sale Price: 86.547
Cusip: 08675TAD5, Purchase YTW: 8.529, Sale YTW: 10.304, Purchase Price: 98.981, Sale Price: 99.269
Cusip: 13012TAH8, Purchase YTW: 9.984, Sale YTW: 1

## Here we check whether the purchase from customer price for a small lot is higher than for a large lot.  (Intuitively, the customer should get a better price the larger the lot sold.)  

In [9]:
quantities_to_check = [(25, 100), (25, 1000), (100, 1000)]

for small_quantity, large_quantity in quantities_to_check:
    small_quantity_purchase_price_col = f"{small_quantity}_purchase_from_customer_price"
    large_quantity_purchase_price_col = f"{large_quantity}_purchase_from_customer_price"
    
    print_count = 0  # Initialize a count of printed instances
    total_cases = 0  # Initialize a count of total cases
    
    if (
        small_quantity_purchase_price_col in data.columns
        and large_quantity_purchase_price_col in data.columns
    ):
        mask = data[small_quantity_purchase_price_col] > data[large_quantity_purchase_price_col]
        total_cases = mask.sum()  # Count total cases
        
        if total_cases > 0:
            print(f"For Quantity {small_quantity} vs. {large_quantity}, there are {total_cases} cases ({(total_cases / len(data)) * 100:.2f}% of total) where purchase from customer price at {small_quantity} is higher than {large_quantity}:")
            # Print the relevant rows with cusip and price values
            for index, row in data[mask].iterrows():
                if print_count >= 10:
                    break  # Stop printing after 10 instances
                cusip = row['cusip']
                small_quantity_price = row[small_quantity_purchase_price_col]
                large_quantity_price = row[large_quantity_purchase_price_col]
                print(f"Cusip: {cusip}, {small_quantity} Price: {small_quantity_price}, {large_quantity} Price: {large_quantity_price}")
                print_count += 1  # Increment the count of printed instances
        else:
            print(f"For Quantity {small_quantity} vs. {large_quantity}, no cases found where purchase from customer price at {small_quantity} is higher than {large_quantity}.")
    else:
        print(f"Quantity {small_quantity} vs. {large_quantity} columns not found.")


For Quantity 25 vs. 100, there are 154295 cases (15.74% of total) where purchase from customer price at 25 is higher than 100:
Cusip: 00037CWW6, 25 Price: 104.965, 100 Price: 104.898
Cusip: 00037NJY3, 25 Price: 100.048, 100 Price: 100.045
Cusip: 00037NNU6, 25 Price: 75.182, 100 Price: 75.167
Cusip: 00037NNW2, 25 Price: 81.139, 100 Price: 81.073
Cusip: 000416Y27, 25 Price: 108.967, 100 Price: 108.913
Cusip: 00109QAM4, 25 Price: 87.538, 100 Price: 87.493
Cusip: 00109QAQ5, 25 Price: 75.934, 100 Price: 75.926
Cusip: 00126QAP6, 25 Price: 97.027, 100 Price: 97.017
Cusip: 00126QAR2, 25 Price: 99.603, 100 Price: -1.0
Cusip: 001277AK8, 25 Price: 85.482, 100 Price: 85.469
For Quantity 25 vs. 1000, there are 546905 cases (55.79% of total) where purchase from customer price at 25 is higher than 1000:
Cusip: 000379BY2, 25 Price: 100.266, 1000 Price: -1.0
Cusip: 000379CB1, 25 Price: 100.286, 1000 Price: -1.0
Cusip: 000379CC9, 25 Price: 100.403, 1000 Price: -1.0
Cusip: 000379CD7, 25 Price: 100.449, 1