In [74]:
## Terminals (order): michael, riley, kensho

import sys
import os
import pandas as pd
import numpy as np
import sklearn as sk
import pyarrow as pa
import plotly as px
import plotly.graph_objects as go
import nbformat

import gdown
from IPython.display import display, HTML

In [65]:
### Initial Setup: DO NOT RUN THIS CELL

data = {
    'bank': "1dzL_SWBkBs5xrUxuGQTm04oe3USgkL9u",    # banking data
    'sales': "1QK-VgSU3AxXUw330KjYFUj8S9hzKJsG6",   # sales data
    'mcc': "1JN0bR84sgZ_o4wjKPBUmz45NeEEkVgt7",     # mcc description
}

# Download all files from Google Drive
for name, file_id in data.items():
    gdown.download(f'https://drive.google.com/uc?id={file_id}', name + '.parquet', quiet=False)
    
# Read all files and store on a dictionary of pandas dataframes
df = {}
for name in data.keys():
    df[name] = pd.read_parquet(name + '.parquet')
    
# Set pandas option to display all columns
pd.set_option('display.max_columns', None)

# Custom CSS to enable horizontal scrolling
scrolling_css = """
<style>
    .dataframe-div {
        overflow-x: auto;
        white-space: nowrap;
    }
</style>
"""

Downloading...
From: https://drive.google.com/uc?id=1dzL_SWBkBs5xrUxuGQTm04oe3USgkL9u
To: /Users/michael/Desktop/Personal Projects/HackBRASA/backend/bank.parquet
100%|██████████| 1.57M/1.57M [00:00<00:00, 11.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QK-VgSU3AxXUw330KjYFUj8S9hzKJsG6
To: /Users/michael/Desktop/Personal Projects/HackBRASA/backend/sales.parquet
100%|██████████| 6.37M/6.37M [00:00<00:00, 23.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JN0bR84sgZ_o4wjKPBUmz45NeEEkVgt7
To: /Users/michael/Desktop/Personal Projects/HackBRASA/backend/mcc.parquet
100%|██████████| 57.3k/57.3k [00:00<00:00, 1.78MB/s]


In [None]:
import subprocess
import sys

def require(package):
    try:
        # Try to import the package
        globals()[package] = __import__(package)
        return True
    except ImportError:
        print(f"{package} is not installed. Trying to install via Homebrew...")
        try:
            # Attempt to install the package using Homebrew
            subprocess.check_call(["brew", "install", package])
            # After installation, try importing again
            globals()[package] = __import__(package)
            return True
        except subprocess.CalledProcessError:
            print(f"Failed to install {package} via Homebrew. Trying pip with --break-system-packages...")
            try:
                # If Homebrew fails, try installing via pip with the break-system-packages flag
                subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--break-system-packages"])
                globals()[package] = __import__(package)
                return True
            except ImportError:
                print(f"Failed to install {package}.")
                return False


In [42]:
def display_head_with_scroll(df, num_rows=5):
    """
    Display a DataFrame with horizontal scrolling enabled.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to display.
    num_rows (int): The number of rows to display. Default is 5.
    """
    display(HTML(scrolling_css + df.head(num_rows).to_html(classes='dataframe-div')))

## Michael's EDA

In [43]:
print(df['sales'].shape)
display_head_with_scroll(df['sales'])

(264933, 7)


Unnamed: 0,document_id,date_time,value,card_number,type,mcc,state
0,9132021237731236867,2022-12-08 01:09:53.352600,240.0,,Pix,5499,RN
1,9132021237731236867,2022-12-04 03:04:08.115900,88.0,,Pix,5499,RN
2,9132021237731236867,2022-12-07 03:07:05.865600,76.0,,Pix,5499,RN
3,9132021237731236867,2022-12-05 02:45:01.539400,48.0,,Pix,5499,RN
4,9132021237731236867,2022-12-08 03:10:59.259900,8.0,,Pix,5499,RN


In [44]:
print(df['bank'].shape)
display_head_with_scroll(df['bank'])

(66189, 5)


Unnamed: 0,document_id,date_time,value,counterparty_document,type
0,5615027685943047372,2023-02-08 19:02:36.289545,400000.0,2701672467485454263,pix_in
1,6321206883189082161,2023-05-21 17:45:10.407340,330000.0,5674766186099233601,pix_in
2,6204525363384429949,2023-05-19 14:53:21.567099,200000.0,2193750750108086695,pix_out
3,6347736874608223396,2023-04-05 12:13:38.056087,200000.0,904790816053028747,pix_out
4,6347736874608223396,2023-04-07 23:44:04.727672,200000.0,904790816053028747,pix_out


In [45]:
print(df['mcc'].shape)
display_head_with_scroll(df['mcc'])

(981, 6)


Unnamed: 0,mcc,edited_description,combined_description,usda_description,irs_description,irs_reportable
0,742,Veterinary Services,Veterinary Services,Veterinary Services,Veterinary Services,Yes
1,763,Agricultural Co-operatives,Agricultural Co-operatives,Agricultural Co-operatives,Agricultural Cooperative,Yes
2,780,"Horticultural Services, Landscaping Services","Horticultural Services, Landscaping Services",Horticultural Services,Landscaping Services,Yes
3,1520,General Contractors-Residential and Commercial,General Contractors-Residential and Commercial,General Contractors-Residential and Commercial,General Contractors,Yes
4,1711,"Air Conditioning Contractors – Sales and Installation, Heating Contractors – Sales, Service, Installation","Air Conditioning Contractors – Sales and Installation, Heating Contractors – Sales, Service, Installation",Air Conditioning Contractors – Sales and Installation,"Heating, Plumbing, A/C",Yes


In [46]:
#Perform a left join of the MCC dictionary (mcc_df) into sales_df on the 'mcc' column
final_df = pd.merge(df['sales'], df['mcc'], on='mcc', how='left')
final_df = final_df.iloc[:, :-4]
final_df.head(5)

Unnamed: 0,document_id,date_time,value,card_number,type,mcc,state,edited_description
0,9132021237731236867,2022-12-08 01:09:53.352600,240.0,,Pix,5499,RN,Misc. Food Stores – Convenience Stores and Spe...
1,9132021237731236867,2022-12-04 03:04:08.115900,88.0,,Pix,5499,RN,Misc. Food Stores – Convenience Stores and Spe...
2,9132021237731236867,2022-12-07 03:07:05.865600,76.0,,Pix,5499,RN,Misc. Food Stores – Convenience Stores and Spe...
3,9132021237731236867,2022-12-05 02:45:01.539400,48.0,,Pix,5499,RN,Misc. Food Stores – Convenience Stores and Spe...
4,9132021237731236867,2022-12-08 03:10:59.259900,8.0,,Pix,5499,RN,Misc. Food Stores – Convenience Stores and Spe...


In [47]:
earliest_date = final_df['date_time'].min()
latest_date = final_df['date_time'].max()

print("Earliest date:", earliest_date)
print("Latest date:", latest_date)

Earliest date: 2022-12-01 02:11:50.812200
Latest date: 2023-05-31 02:03:41.934200


In [63]:
sum_values = final_df.groupby('document_id')['value'].sum()
sum_values_sorted = sum_values.sort_values(ascending=False)

document_id
5392937379751703376    83015198.12
4619042854388500300    73271707.08
8620618076836167965    71066018.52
2586717201780273273    67315595.96
5952522905932206715    56150164.56
                          ...     
648260413505740527       143935.48
6381288718598539296      141024.12
1898616423937443          85709.60
453832840298988785        85566.84
5141494222495776041       85043.32
Name: value, Length: 100, dtype: float64


In [88]:
'''
table_data = sum_values_sorted.reset_index().rename(columns={'document_id': 'Document ID', 'value': 'Total Sales Revenue'})
fig = go.Figure(data=[go.Table(header=dict(values=list(table_data.columns)),
                               cells=dict(values=[table_data['Document ID'], table_data['Total Sales Revenue']]))
                     ])
fig.show()
'''

"\ntable_data = sum_values_sorted.reset_index().rename(columns={'document_id': 'Document ID', 'value': 'Total Sales Revenue'})\nfig = go.Figure(data=[go.Table(header=dict(values=list(table_data.columns)),\n                               cells=dict(values=[table_data['Document ID'], table_data['Total Sales Revenue']]))\n                     ])\nfig.show()\n"

True