In [65]:
import os
from dotenv import load_dotenv
load_dotenv()
import mysql.connector
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import gspread
import json
import hubspot
from datetime import datetime, timezone

# Access the environment variables.
mydb_pass = os.getenv('MYDB_PASS')
google_cred = os.getenv('GOOGLE_CRED')
if not google_cred:
    # Fail here with an actionable message instead of an AttributeError on None below.
    raise RuntimeError(
        "GOOGLE_CRED is not set; expected the Google service-account JSON "
        "in the environment or in the .env file"
    )

# Every single quote in the stored credential is turned into a double quote before
# parsing (presumably it is saved as a Python-style dict literal - TODO confirm).
# This is the same net effect as the previous two-step replace chain.
gc = gspread.service_account_from_dict(json.loads(google_cred.replace("'", '"')))
hubspot_key = os.getenv('HUBSPOT_KEY')

In [87]:
client = hubspot.Client.create(access_token=hubspot_key)

# Company properties requested from HubSpot. The list has its own name because the
# notebook also defines a function called `pull_properties`; sharing that name meant
# re-running this cell silently replaced the function with a list.
COMPANY_PROPERTIES = [
    "name", "send_to_cs", "record_type", "total_students", 'private_public',
    "closedate", "churn_date__cs", "number_of_open_deals", "total_open_deal_value",
    "notes_last_contacted", "segment_new", "client_type_size",
    "number_of_associated_contacts", "potential_opportunity_size", "exclude_logo",
]

company_list = []
has_more = True
after_value = 0

# Page through all non-archived companies, 100 per request, following the cursor
# returned under paging -> next -> after.
while has_more:
    api_response = client.crm.companies.basic_api.get_page(
        limit=100,
        after=after_value,
        properties=COMPANY_PROPERTIES,
        archived=False,
    ).to_dict()
    company_list.extend(api_response['results'])

    # A missing or None `paging` / `next` entry marks the last page. Checking for it
    # explicitly (instead of a bare `except`) lets genuine errors surface.
    next_cursor = ((api_response.get('paging') or {}).get('next') or {}).get('after')
    if next_cursor is None:
        has_more = False
    else:
        after_value = next_cursor

print('Companies included: ' + str(len(company_list)))

# Build the frame once and derive the id column from it.
company_list_df = pd.DataFrame(company_list)
company_ids = company_list_df['id'].copy()

Companies included: 13915


In [88]:
def pull_properties(hubspot_raw_df):
    """Flatten the ``properties`` dict column of a HubSpot frame into plain columns.

    Parameters:
    - hubspot_raw_df (pandas.DataFrame): frame with a ``properties`` column whose
      cells are dicts mapping property name -> value.

    Returns:
    - pandas.DataFrame: a new frame without ``properties`` and with one column per
      property key found in any row. ``None`` values and keys missing from a row
      are stored as ``''``. A property whose name matches an existing column
      overwrites that column.

    The input frame is left untouched, so the cell calling this function can be
    re-run safely. New columns appear in first-seen key order.
    """
    # Normalise every row up front: None -> '', non-dict cells contribute no keys.
    records = [
        {key: ('' if value is None else value) for key, value in props.items()}
        if isinstance(props, dict) else {}
        for props in hubspot_raw_df['properties']
    ]

    # dict.fromkeys de-duplicates while preserving first-seen order.
    property_keys = list(dict.fromkeys(key for record in records for key in record))

    # drop() returns a new frame, so the caller's DataFrame is never modified.
    flattened = hubspot_raw_df.drop('properties', axis=1)
    for key in property_keys:
        # Build each column in one go; object dtype keeps the values exactly as received.
        flattened[key] = pd.Series(
            [record.get(key, '') for record in records],
            index=flattened.index,
            dtype=object,
        )
    return flattened

In [89]:
def convert_iso_to_datetime(iso_date):
    """Parse an ISO-8601 timestamp string and return its UTC calendar date.

    Parameters:
    - iso_date: a string such as ``'2020-05-13T17:45:12.123Z'`` or
      ``'2020-05-13T17:45:12+00:00'``; ``None`` and float NaN are accepted.

    Returns:
    - ``datetime.date`` for a parsable string. Timestamps without an offset are
      treated as UTC; timestamps with an offset are converted to UTC first.
    - the input itself when it is ``None`` or NaN.
    - ``None`` for anything else (empty string, unknown format, non-string value).
    """
    # Check for None or NaN values
    if iso_date is None or (isinstance(iso_date, float) and np.isnan(iso_date)):
        return iso_date

    # strptime raises TypeError on non-strings; treat them as "not a date" instead.
    if not isinstance(iso_date, str):
        return None

    formats = ('%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S%z')

    for fmt in formats:
        try:
            dt = datetime.strptime(iso_date, fmt)
        except ValueError:
            continue

        # The first format has a literal 'Z' and yields a naive datetime: label it UTC.
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)

        # Normalise to UTC before dropping the time part, so an offset timestamp
        # maps to the UTC date rather than the date in its own offset.
        return dt.astimezone(timezone.utc).date()

    # No known format matched.
    return None

In [90]:
# Keep companies whose send_to_cs property is the string "true" and whose
# exclude_logo property is not "true", then drop columns that are entirely
# None/NaN. Empty-string columns survive this step because '' is not NaN yet.
hubspot_clients_only = (
    pull_properties(company_list_df)
    .query('send_to_cs == "true" and exclude_logo != "true"')
    .reset_index(drop=True)
    .dropna(axis=1, how='all')
)

# Drop non-useful columns and rename the rest. Chained (no inplace=True) so the
# cell yields the same result every time it is re-run.
hubspot_df = (
    hubspot_clients_only
    .drop(['send_to_cs', 'exclude_logo', 'created_at', 'updated_at', 'archived', 'hs_lastmodifieddate', 'hs_object_id', 'createdate'], axis=1)
    .rename({'id':'hubspot_id', 'name':'hubspot_name', 'closedate':'close_date', 'private_public':'school_type', 'segment_new':'segment', 'notes_last_contacted':'last_contacted', 'churn_date__cs':'churn_date'}, axis=1)
)

# convert date columns to datetime
hubspot_df['churn_date'] = pd.to_datetime(hubspot_df['churn_date'])
hubspot_df['close_date'] = hubspot_df['close_date'].apply(convert_iso_to_datetime)
hubspot_df['last_contacted'] = hubspot_df['last_contacted'].apply(convert_iso_to_datetime)

# update all null values: cells that are exactly '' become NaN.
# regex=True is intentionally not used: an empty regex pattern matches every string,
# and the previous call only worked because pandas special-cases that pattern.
hubspot_df = hubspot_df.replace('', np.nan)

In [91]:
# Display the cleaned HubSpot company frame (bare last expression -> rich output).
hubspot_df

Unnamed: 0,hubspot_id,record_type,close_date,hubspot_name,total_students,last_contacted,segment,churn_date,school_type,potential_opportunity_size
0,4104270905,Lone Campus,2020-05-13,California Healing Arts College - Carson,181,2022-08-25,Allied Health,NaT,Private for-profit,67500
1,4104270938,Corporate Office,2023-09-27,San Joaquin Valley College - Corporate Office,2888,2023-10-10,Allied Health,NaT,Private for-profit,450000
2,4104270948,Corporate Office,2021-09-30,Lawrence & Company College of Cosmetology - Co...,115,2022-10-06,Beauty + Wellness,2023-10-01,Private for-profit,22500
3,4104275458,Lone Campus,2021-09-29,Manhattan School of Computer Technology - Broo...,392,2022-05-17,Allied Health,2022-04-25,Not-for-profit,102500
4,4104293899,Lone Campus,2022-05-26,California Career Institute - Anaheim,246,2023-04-28,Allied Health,NaT,Private for-profit,67500
...,...,...,...,...,...,...,...,...,...,...
557,17522572323,Site,,Gurnick Academy - Fresno,,,,NaT,,
558,17522572325,Site,,Gurnick Academy - Modesto,,,,NaT,,
559,17522572327,Site,,Gurnick Academy - Sacramento,,,,NaT,,
560,17522572332,Site,,Gurnick Academy - San Mateo,,,,NaT,,


In [93]:
# MySQL connection settings. Each value can be overridden through an environment
# variable (or the .env file); the defaults are the values previously hard-coded here.
host = os.getenv('MYDB_HOST', "production.cqof4esbua2o.us-west-2.rds.amazonaws.com")
user = os.getenv('MYDB_USER', "leadershipDashboard")
password = mydb_pass  # read from MYDB_PASS in the first cell
database = os.getenv('MYDB_NAME', "ckprodv2")
port = int(os.getenv('MYDB_PORT', '3306'))

In [92]:
from sqlalchemy import create_engine

def create_mysql_engine(user, password, host, port, database):
    """Create a SQLAlchemy engine for MySQL using the mysql-connector driver.

    Parameters:
    - user (str): database user name (raw, not URL-encoded).
    - password (str): database password (raw, not URL-encoded).
    - host (str): database host name.
    - port (int): database port.
    - database (str): schema to connect to.

    Returns:
    - The engine returned by ``create_engine`` for the assembled URL.
    """
    from urllib.parse import quote

    # Characters such as '@', ':' or '/' in the credentials would otherwise be read
    # as URL delimiters and corrupt the connection string, so percent-encode them.
    safe_user = quote(str(user), safe='')
    safe_password = quote(str(password), safe='')

    connection_string = (
        f"mysql+mysqlconnector://{safe_user}:{safe_password}@{host}:{port}/{database}"
    )
    return create_engine(connection_string)

In [94]:
def dataframe_to_mysql(df, table_name, engine, if_exists="fail", index=False):
    """
    Write a DataFrame to a SQL table by delegating to ``DataFrame.to_sql``.

    Parameters:
    - df (pandas.DataFrame): the rows to write.
    - table_name (str): destination table name.
    - engine: connectable handed to ``to_sql`` (a SQLAlchemy engine in this notebook).
    - if_exists (str): behaviour when the table already exists:
      "fail", "replace" or "append". Default: "fail".
    - index (bool): whether the DataFrame index is written as a column. Default: False.

    Returns nothing; the result of ``to_sql`` is discarded.
    """
    write_options = {"if_exists": if_exists, "index": index}
    df.to_sql(table_name, engine, **write_options)

In [95]:
# Write data to MySQL
# Use the configured `port` rather than a repeated literal so that the connection
# settings cell is the single source of truth.
engine = create_mysql_engine(user, password, host, port, database)
# if_exists="replace": the existing ft_hs_comp_won table is dropped and recreated.
dataframe_to_mysql(hubspot_df, 'ft_hs_comp_won', engine, if_exists="replace")