In [2]:
### Initial Setup: DO NOT RUN THIS CELL
## Terminals (order): michael, riley, kensho

import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

from require import require

import pandas as pd
import numpy as np
import sklearn as sk
import pyarrow as pa
import gdown
from IPython.display import display, HTML

# Google Drive file IDs, keyed by the short dataset name that is also used
# as the local parquet file stem and as the key into `df` below.
data = {
    'bank': "1dzL_SWBkBs5xrUxuGQTm04oe3USgkL9u",    # banking data
    'sales': "1QK-VgSU3AxXUw330KjYFUj8S9hzKJsG6",   # sales data
    'mcc': "1JN0bR84sgZ_o4wjKPBUmz45NeEEkVgt7",     # mcc description
}

# Download each file from Google Drive, skipping files that are already on
# disk so re-running this cell does not hit the network again.
# Delete the local .parquet file to force a fresh download.
for name, file_id in data.items():
    parquet_path = name + '.parquet'
    if not os.path.exists(parquet_path):
        gdown.download(f'https://drive.google.com/uc?id={file_id}', parquet_path, quiet=False)

# Read all files and store them in a dictionary of pandas DataFrames
df = {name: pd.read_parquet(name + '.parquet') for name in data}
    
# Set pandas option to display all columns
pd.set_option('display.max_columns', None)

# Custom CSS to enable horizontal scrolling
scrolling_css = """
<style>
    .dataframe-div {
        overflow-x: auto;
        white-space: nowrap;
    }
</style>
"""

Downloading...
From: https://drive.google.com/uc?id=1dzL_SWBkBs5xrUxuGQTm04oe3USgkL9u
To: /Users/riley/VSCode/HackBRASA/backend/bank.parquet
100%|██████████| 1.57M/1.57M [00:00<00:00, 11.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QK-VgSU3AxXUw330KjYFUj8S9hzKJsG6
To: /Users/riley/VSCode/HackBRASA/backend/sales.parquet
100%|██████████| 6.37M/6.37M [00:00<00:00, 16.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JN0bR84sgZ_o4wjKPBUmz45NeEEkVgt7
To: /Users/riley/VSCode/HackBRASA/backend/mcc.parquet
100%|██████████| 57.3k/57.3k [00:00<00:00, 1.76MB/s]


In [17]:
# Useful Functions
def display_head_with_scroll(df, num_rows=5):
    """
    Display the first rows of a DataFrame with horizontal scrolling enabled.

    The rendered table is wrapped in a ``<div class="dataframe-div">`` so the
    ``overflow-x`` rule in ``scrolling_css`` is applied to a block container.
    Putting that class on the ``<table>`` element itself generally does not
    produce a scrollbar, because ``overflow`` applies to block containers.

    Note: the ``df`` parameter shadows the notebook-level ``df`` dictionary
    inside this function; pass a single DataFrame, e.g. ``df['sales']``.

    Parameters:
    df (pd.DataFrame): The DataFrame to display.
    num_rows (int): The number of rows to display. Default is 5.
    """
    table_html = df.head(num_rows).to_html()
    display(HTML(f'{scrolling_css}<div class="dataframe-div">{table_html}</div>'))

In [36]:
# Report the sales table's shape, then preview its first rows
sales_df = df['sales']
print(sales_df.shape)
display_head_with_scroll(sales_df)

(264933, 7)


Unnamed: 0,document_id,date_time,value,card_number,type,mcc,state
0,9132021237731236867,2022-12-08 01:09:53.352600,240.0,,Pix,5499,RN
1,9132021237731236867,2022-12-04 03:04:08.115900,88.0,,Pix,5499,RN
2,9132021237731236867,2022-12-07 03:07:05.865600,76.0,,Pix,5499,RN
3,9132021237731236867,2022-12-05 02:45:01.539400,48.0,,Pix,5499,RN
4,9132021237731236867,2022-12-08 03:10:59.259900,8.0,,Pix,5499,RN


In [37]:
# Report the bank table's shape, then preview its first rows
bank_df = df['bank']
print(bank_df.shape)
display_head_with_scroll(bank_df)

(66189, 5)


Unnamed: 0,document_id,date_time,value,counterparty_document,type
0,5615027685943047372,2023-02-08 19:02:36.289545,400000.0,2701672467485454263,pix_in
1,6321206883189082161,2023-05-21 17:45:10.407340,330000.0,5674766186099233601,pix_in
2,6204525363384429949,2023-05-19 14:53:21.567099,200000.0,2193750750108086695,pix_out
3,6347736874608223396,2023-04-05 12:13:38.056087,200000.0,904790816053028747,pix_out
4,6347736874608223396,2023-04-07 23:44:04.727672,200000.0,904790816053028747,pix_out


In [40]:
# Report the MCC table's shape, then preview its first rows
mcc_df = df['mcc']
print(mcc_df.shape)
display_head_with_scroll(mcc_df)

(981, 6)


Unnamed: 0,mcc,edited_description,combined_description,usda_description,irs_description,irs_reportable
0,742,Veterinary Services,Veterinary Services,Veterinary Services,Veterinary Services,Yes
1,763,Agricultural Co-operatives,Agricultural Co-operatives,Agricultural Co-operatives,Agricultural Cooperative,Yes
2,780,"Horticultural Services, Landscaping Services","Horticultural Services, Landscaping Services",Horticultural Services,Landscaping Services,Yes
3,1520,General Contractors-Residential and Commercial,General Contractors-Residential and Commercial,General Contractors-Residential and Commercial,General Contractors,Yes
4,1711,"Air Conditioning Contractors – Sales and Installation, Heating Contractors – Sales, Service, Installation","Air Conditioning Contractors – Sales and Installation, Heating Contractors – Sales, Service, Installation",Air Conditioning Contractors – Sales and Installation,"Heating, Plumbing, A/C",Yes


In [34]:
from pprint import pprint

# Distinct values of the MCC table's 'combined_description' column
combined_descriptions = df['mcc']['combined_description']
unique_values = combined_descriptions.unique()

# Pretty-print the distinct values
pprint(unique_values)

array(['Veterinary Services', 'Agricultural Co-operatives',
       'Horticultural Services, Landscaping Services',
       'General Contractors-Residential and Commercial',
       'Air Conditioning Contractors – Sales and Installation, Heating Contractors – Sales, Service, Installation',
       'Electrical Contractors',
       'Insulation – Contractors, Masonry, Stonework Contractors, Plastering Contractors, Stonework and Masonry Contractors, Tile Settings Contractors',
       'Carpentry Contractors',
       'Roofing – Contractors, Sheet Metal Work – Contractors, Siding – Contractors',
       'Contractors – Concrete Work',
       'Contractors – Special Trade, Not Elsewhere Classified',
       'Miscellaneous Publishing and Printing',
       'Typesetting, Plate Making, & Related Services',
       'Specialty Cleaning, Polishing, and Sanitation Preparations',
       'UNITED AIRLINES', 'AMERICAN AIRLINES', 'PAN AMERICAN', 'Airlines',
       'TRANS WORLD AIRLINES', 'BRITISH AIRWAYS', 'JAPAN A

In [None]:
# 1. Check whether the sales and bank DataFrames share any document IDs

# Boolean mask: True for each sales row whose document_id also appears in bank
shared_ids = df['sales']['document_id'].isin(df['bank']['document_id'])
# Distinct document IDs present in both tables (the mask above is reused
# instead of being recomputed, and .loc avoids chained indexing)
shared_ids_list = df['sales'].loc[shared_ids, 'document_id'].unique()

# A bare expression in the middle of a cell is not rendered, so print the
# count explicitly; otherwise the result of this check is silently discarded.
print("Number of shared IDs:", len(shared_ids_list))

# Uncomment to list the shared IDs themselves:
#print("Shared IDs:", shared_ids_list)

# Left join the MCC table onto the sales table on the 'mcc' column,
# keeping every sales row even when its code has no match in the MCC table
sales_with_mcc_df = pd.merge(df['sales'], df['mcc'], on='mcc', how='left')

# Display the first rows of the resulting DataFrame
print(sales_with_mcc_df.head())