In [9]:
### Initial Setup: DO NOT RUN THIS CELL
## Terminals (order): michael, riley, kensho

import sys
import os
import pandas as pd
import numpy as np
import sklearn as sk
import pyarrow as pa
import gdown
from IPython.display import display, HTML
import altair as alt
import sys
import os

# Google Drive file ids, keyed by the dataset name that is reused both for
# the local "<name>.parquet" file and for the entry in `df`.
data = dict(
    bank="1dzL_SWBkBs5xrUxuGQTm04oe3USgkL9u",    # banking data
    sales="1QK-VgSU3AxXUw330KjYFUj8S9hzKJsG6",   # sales data
    mcc="1JN0bR84sgZ_o4wjKPBUmz45NeEEkVgt7",     # mcc description
)

# Fetch each dataset from Google Drive into the current working directory
for name, file_id in data.items():
    url = f'https://drive.google.com/uc?id={file_id}'
    gdown.download(url, name + '.parquet', quiet=False)

# Load every downloaded parquet file into a dict of DataFrames keyed by dataset name
df = {dataset: pd.read_parquet(dataset + '.parquet') for dataset in data}
    
# Never truncate columns when pandas renders a DataFrame
pd.options.display.max_columns = None

# Stylesheet prepended to rendered tables; it targets the 'dataframe-div'
# class with horizontal overflow scrolling and no line wrapping.
scrolling_css = "\n".join([
    "",
    "<style>",
    "    .dataframe-div {",
    "        overflow-x: auto;",
    "        white-space: nowrap;",
    "    }",
    "</style>",
    "",
])

Downloading...
From: https://drive.google.com/uc?id=1dzL_SWBkBs5xrUxuGQTm04oe3USgkL9u
To: /Users/riley/VSCode/HackBRASA/backend/bank.parquet
100%|██████████| 1.57M/1.57M [00:00<00:00, 6.27MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QK-VgSU3AxXUw330KjYFUj8S9hzKJsG6
To: /Users/riley/VSCode/HackBRASA/backend/sales.parquet
100%|██████████| 6.37M/6.37M [00:00<00:00, 13.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JN0bR84sgZ_o4wjKPBUmz45NeEEkVgt7
To: /Users/riley/VSCode/HackBRASA/backend/mcc.parquet
100%|██████████| 57.3k/57.3k [00:00<00:00, 2.02MB/s]


In [10]:
# Useful Functions
def display_head_with_scroll(df, num_rows=5):
    """
    Render the leading rows of a DataFrame as HTML, prefixed with the
    notebook's scrolling stylesheet.

    Parameters:
    df (pd.DataFrame): The frame whose first rows are shown.
    num_rows (int): How many rows to include. Default is 5.
    """
    preview = df.head(num_rows)
    # The 'dataframe-div' class targeted by scrolling_css is attached to the
    # generated table via to_html's `classes` argument.
    table_markup = preview.to_html(classes='dataframe-div')
    display(HTML(scrolling_css + table_markup))

import subprocess
def require(package):
    """
    Import ``package`` into this module's globals, installing it on demand.

    A plain import is tried first. If that raises ImportError, the function
    runs ``brew install <package>`` and, when that does not yield an
    importable module, falls back to
    ``<python> -m pip install <package> --break-system-packages`` using the
    interpreter that is running this notebook.

    Parameters:
    package (str): Name passed both to ``__import__`` and to the installers.

    Returns:
    bool: True if the module was imported and bound in ``globals()`` under
    ``package``; False if every attempt failed. Installer failures
    (non-zero exit status or a missing executable) are reported via print
    and never raised.
    """
    import importlib

    def _bind():
        # Packages installed while the interpreter is running may be invisible
        # to the import system until its finder caches are invalidated.
        importlib.invalidate_caches()
        try:
            globals()[package] = __import__(package)
        except ImportError:
            return False
        return True

    if _bind():
        return True

    print(f"{package} is not installed. Trying to install via Homebrew...")
    try:
        subprocess.check_call(["brew", "install", package])
        if _bind():
            return True
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: brew exited non-zero.
        # OSError (e.g. FileNotFoundError): brew is not available on this machine.
        # Either way, fall through to pip.
        pass

    print(f"Failed to install {package} via Homebrew. Trying pip with --break-system-packages...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--break-system-packages"])
        if _bind():
            return True
    except (subprocess.CalledProcessError, OSError):
        # pip failed or could not be launched; reported below.
        pass

    print(f"Failed to install {package}.")
    return False


In [11]:
# Report the dimensions of the sales table
print(df['sales'].shape)

# Count sales rows for every (document_id, state) pair and persist the result
grouped_df = (
    df['sales']
    .groupby(['document_id', 'state'])
    .size()
    .reset_index(name='transaction_count')
)
grouped_df.to_csv('./data/grouped_data.csv', index=False)

(264933, 7)


In [12]:
# Dimensions of the banking table, followed by a scrollable preview of its first rows
print(df['bank'].shape)
display_head_with_scroll(df['bank'], num_rows=5)

(66189, 5)


Unnamed: 0,document_id,date_time,value,counterparty_document,type
0,5615027685943047372,2023-02-08 19:02:36.289545,400000.0,2701672467485454263,pix_in
1,6321206883189082161,2023-05-21 17:45:10.407340,330000.0,5674766186099233601,pix_in
2,6204525363384429949,2023-05-19 14:53:21.567099,200000.0,2193750750108086695,pix_out
3,6347736874608223396,2023-04-05 12:13:38.056087,200000.0,904790816053028747,pix_out
4,6347736874608223396,2023-04-07 23:44:04.727672,200000.0,904790816053028747,pix_out


In [13]:
# Dimensions of the MCC description table, followed by a scrollable preview of its first rows
print(df['mcc'].shape)
display_head_with_scroll(df['mcc'], num_rows=5)

(981, 6)


Unnamed: 0,mcc,edited_description,combined_description,usda_description,irs_description,irs_reportable
0,742,Veterinary Services,Veterinary Services,Veterinary Services,Veterinary Services,Yes
1,763,Agricultural Co-operatives,Agricultural Co-operatives,Agricultural Co-operatives,Agricultural Cooperative,Yes
2,780,"Horticultural Services, Landscaping Services","Horticultural Services, Landscaping Services",Horticultural Services,Landscaping Services,Yes
3,1520,General Contractors-Residential and Commercial,General Contractors-Residential and Commercial,General Contractors-Residential and Commercial,General Contractors,Yes
4,1711,"Air Conditioning Contractors – Sales and Installation, Heating Contractors – Sales, Service, Installation","Air Conditioning Contractors – Sales and Installation, Heating Contractors – Sales, Service, Installation",Air Conditioning Contractors – Sales and Installation,"Heating, Plumbing, A/C",Yes


In [18]:
# `json` is imported here because this cell runs before the cell that
# otherwise imports it; without this the cell fails on a fresh kernel.
import json

# Load the Brazilian states TopoJSON. The file contains non-ASCII state names,
# so decode it explicitly as UTF-8 rather than the platform default.
with open('data/br-states.json', 'r', encoding='utf-8') as file:
    statesdata = json.load(file)

# Inspect the 'estados' geometry collection (type, bbox, geometries)
statesdata['objects']['estados']

{'type': 'GeometryCollection',
 'bbox': [-73.97741220933203,
  -33.74579986956992,
  -34.81727575186676,
  5.24981966486962],
 'geometries': [{'type': 'Polygon',
   'properties': {'nome': 'Acre'},
   'id': 'AC',
   'arcs': [[0, 1, 2]]},
  {'type': 'Polygon',
   'properties': {'nome': 'Alagoas'},
   'id': 'AL',
   'arcs': [[3, 4, 5, 6]]},
  {'type': 'Polygon',
   'properties': {'nome': 'Amazonas'},
   'id': 'AM',
   'arcs': [[7, 8, 9, 10, -1, 11]]},
  {'type': 'Polygon',
   'properties': {'nome': 'Amapá'},
   'id': 'AP',
   'arcs': [[12, 13]]},
  {'type': 'Polygon',
   'properties': {'nome': 'Bahia'},
   'id': 'BA',
   'arcs': [[-6, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]},
  {'type': 'Polygon',
   'properties': {'nome': 'Ceará'},
   'id': 'CE',
   'arcs': [[24, 25, 26, 27, 28, 29]]},
  {'type': 'Polygon',
   'properties': {'nome': 'Distrito Federal'},
   'id': 'DF',
   'arcs': [[30]]},
  {'type': 'Polygon',
   'properties': {'nome': 'Espírito Santo'},
   'id': 'ES',
   'arcs': [[31, 3

In [19]:
import altair as alt
import json
import pandas as pd
import vegafusion

# Load the TopoJSON file so its structure can be inspected below.
# Decoded explicitly as UTF-8 because it contains non-ASCII state names.
with open('data/br-states.json', 'r', encoding='utf-8') as file:
    statesdata = json.load(file)

# Enable dark theme
alt.themes.enable("dark")

# Debug output: first rows of grouped_df
print(grouped_df.head())

# Debug output: keys of the 'estados' TopoJSON object
print(statesdata['objects']['estados'].keys())

# Debug output: first few geometries, to inspect the IDs used for the lookup
print(statesdata['objects']['estados']['geometries'][:5])

# Enable vegafusion data transformer
alt.data_transformers.enable("vegafusion")

# Reference the 'estados' object of the TopoJSON file as the chart's geographic data
states = alt.topo_feature('data/br-states.json', 'estados')  # Ensure 'objects.estados' matches your TopoJSON structure

# State abbreviation -> full state name.
# NOTE(review): not referenced by the chart below; kept in case other cells use it.
state_id_map = {
    'RN': 'Rio Grande do Norte',
    'SC': 'Santa Catarina',
    'RS': 'Rio Grande do Sul',
    'PR': 'Paraná',
    'RJ': 'Rio de Janeiro',
    'SP': 'São Paulo',
    'MG': 'Minas Gerais',
    'CE': 'Ceará',
    'MT': 'Mato Grosso',
    'DF': 'Distrito Federal',
    'RR': 'Roraima',
    'AL': 'Alagoas',
    'GO': 'Goiás',
    'SE': 'Sergipe',
    'PE': 'Pernambuco',
    'PB': 'Paraíba',
    'BA': 'Bahia',
    'AC': 'Acre',
    'AM': 'Amazonas',
    'ES': 'Espírito Santo',
    'PA': 'Pará',
    'PI': 'Piauí'
}

# The geometry IDs printed above are state abbreviations, so 'state' can be
# used as the ID directly.
# NOTE(review): this column is not used by the chart; kept in case other cells read it.
grouped_df['state_id'] = grouped_df['state']

# grouped_df holds one row per (document_id, state), so 'state' is not unique
# there. A lookup keyed on 'state' needs exactly one row per state; otherwise
# each state is coloured by a single document's count instead of its total.
state_counts = grouped_df.groupby('state', as_index=False)['transaction_count'].sum()

# Choropleth: colour each state by its total transaction count
chart = alt.Chart(states).mark_geoshape().encode(
    color=alt.Color('transaction_count:Q', scale=alt.Scale(scheme='blues')),
    stroke=alt.value('#154360')
).transform_lookup(
    lookup='id',  # Use 'id' field from TopoJSON
    from_=alt.LookupData(state_counts, 'state', ['transaction_count'])
).properties(
    width=500,
    height=300
).project(
    type='mercator'  # Changed to 'mercator' for a more general projection
)

chart

          document_id state  transaction_count state_id
0    1898616423937443    AM               2982       AM
1   10825678225686420    PR               2998       PR
2   40944759436046065    SC               1819       SC
3  150845804022381629    DF               2963       DF
4  272221220503584164    SP               2191       SP
dict_keys(['type', 'bbox', 'geometries'])
[{'type': 'Polygon', 'properties': {'nome': 'Acre'}, 'id': 'AC', 'arcs': [[0, 1, 2]]}, {'type': 'Polygon', 'properties': {'nome': 'Alagoas'}, 'id': 'AL', 'arcs': [[3, 4, 5, 6]]}, {'type': 'Polygon', 'properties': {'nome': 'Amazonas'}, 'id': 'AM', 'arcs': [[7, 8, 9, 10, -1, 11]]}, {'type': 'Polygon', 'properties': {'nome': 'Amapá'}, 'id': 'AP', 'arcs': [[12, 13]]}, {'type': 'Polygon', 'properties': {'nome': 'Bahia'}, 'id': 'BA', 'arcs': [[-6, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]}]
