Tdl 24434 custom object support (#242)
* initial commit to add support for custom objects

* modify the endpoint for custom records

* functionality for the discover mode of custom streams

* add custom_objects schema

* add sync functionality for custom_object records

* create shared folder for the re-use of schema

* Log a warning for custom object scopes, handling exceptions.

* add doc string to definitions

* remove custom_object schema stream, as it is just the schema of custom objects without solid analytical info.

* modifying naming conventions

* include shared folder in the package data

* include shared json file names

* modified package data in setup.py

* update manifest file with the shared folder

* correct the misspelling

* make pylint happy

* add unit test for the custom object functionality

* handle the chains of .get() calls

* before sync, get the streams from the catalog instead of the API endpoint

* make pylint happy

* remove `custom_` prefix from the table name

* make pylint happy

* Add doc string

* disable pylint issue - inconsistent-return-statements

* fix unit test case

* update the discover mode for custom streams

* remove trailing space

* modify function name

* update STATE assignment post custom object sync

* move the try/except block to the `gen_request_custom_objects` function

* correct variable name

* include custom objects stream for testing all the scenarios (#243)

* initial commit to add support for custom objects

* modify the endpoint for custom records

* functionality for the discover mode of custom streams

* add custom_objects schema

* add sync functionality for custom_object records

* create shared folder for the re-use of schema

* Log a warning for custom object scopes, handling exceptions.

* add doc string to definitions

* remove custom_object schema stream, as it is just the schema of custom objects without solid analytical info.

* modifying naming conventions

* include shared folder in the package data

* include shared json file names

* modified package data in setup.py

* update manifest file with the shared folder

* include custom objects stream for testing all the scenarios

* correct the misspelling

* make pylint happy

* fix the errors

* take a more recent start date, as the custom objects were created two days earlier

* remove `custom_` prefix from the name of the custom object stream

* remove unused code

* Update start dates, as no custom objects were created in the interim.

* update test client with pagination logic

* fix all fields test for deals stream

* update STATE assignment post custom object sync

* correct variable name

* setup and changelog update
sgandhi1311 committed Dec 12, 2023
1 parent 60b00d6 commit 88b0b8e
Showing 10 changed files with 544 additions and 20 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,8 @@
# Changelog

## 2.13.0
* HubSpot Custom CRM Objects Support [#242](https://github.com/singer-io/tap-hubspot/pull/242)

## 2.12.2
* Use engagements_page_size advanced option [#234](https://github.com/singer-io/tap-hubspot/pull/234)
*
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,2 +1,3 @@
include LICENSE
include tap_hubspot/schemas/*.json
include tap_hubspot/schemas/shared/*.json
17 changes: 4 additions & 13 deletions setup.py
@@ -3,7 +3,7 @@
from setuptools import setup

setup(name='tap-hubspot',
-      version='2.12.2',
+      version='2.13.0',
      description='Singer.io tap for extracting data from the HubSpot API',
      author='Stitch',
      url='http://singer.io',
@@ -29,18 +29,9 @@
      packages=['tap_hubspot'],
      package_data = {
          'tap_hubspot/schemas': [
-             "campaigns.json",
-             "companies.json",
-             "contact_lists.json",
-             "contacts.json",
-             "deals.json",
-             "email_events.json",
-             "forms.json",
-             "keywords.json",
-             "owners.json",
-             "subscription_changes.json",
-             "workflows.json",
-         ],
+             "schemas/*.json",
+             "schemas/shared/*.json"
+         ]
      },
      include_package_data=True,
)
178 changes: 173 additions & 5 deletions tap_hubspot/__init__.py
@@ -102,6 +102,9 @@ class StateFields:

"tickets_properties": "/crm/v3/properties/tickets",
"tickets": "/crm/v4/objects/tickets",

"custom_objects_schema": "/crm/v3/schemas",
"custom_objects": "/crm/v3/objects/p_{object_name}"
}
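For context, the new `custom_objects` entry is a template whose `{object_name}` placeholder is filled at request time. A minimal sketch of the substitution, assuming `get_url` simply formats the looked-up template with its keyword arguments (the tap's real helper also prepends the API base URL):

ENDPOINTS = {"custom_objects": "/crm/v3/objects/p_{object_name}"}

def get_url(endpoint, **kwargs):
    # Assumed behavior: look up the endpoint template and fill the placeholders.
    return ENDPOINTS[endpoint].format(**kwargs)

print(get_url("custom_objects", object_name="cars"))  # -> /crm/v3/objects/p_cars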

def get_start(state, tap_stream_id, bookmark_key, older_bookmark_key=None):
@@ -197,7 +200,12 @@ def get_field_schema(field_type, extras=False):
        }
    }

-def parse_custom_schema(entity_name, data):
+def parse_custom_schema(entity_name, data, is_custom_object=False):
+    if is_custom_object:
+        return {
+            field['name']: get_field_type_schema(field['type'])
+            for field in data
+        }
    if entity_name == "tickets":
        return {
            field['name']: get_field_type_schema(field['type'])
@@ -1104,6 +1112,92 @@ def sync_deal_pipelines(STATE, ctx):
    singer.write_state(STATE)
    return STATE

def gen_request_custom_objects(tap_stream_id, url, params, path, more_key):
    """
    Cursor-based API pagination, used in the custom_objects stream implementation.
    """
    try:
        with metrics.record_counter(tap_stream_id) as counter:
            while True:
                data = request(url, params).json()

                if data.get(path) is None:
                    raise RuntimeError(
                        "Unexpected API response: {} not in {}".format(path, data.keys()))

                for row in data[path]:
                    counter.increment()
                    yield row

                if not data.get(more_key):
                    break
                params['after'] = data.get(more_key, {}).get('next', {}).get('after', None)
                if params['after'] is None:
                    break
    except SourceUnavailableException as ex:
        warning_message = str(ex).replace(CONFIG['access_token'], 10 * '*')
        LOGGER.warning(warning_message)
        return []
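For readers unfamiliar with HubSpot's cursor pagination, here is a minimal, self-contained sketch of the loop above. `fake_request` stands in for `request(url, params).json()`, and the page shape (`results` plus `paging.next.after`) mirrors the CRM v3 responses this generator consumes; all values are hypothetical.

# A toy two-page response keyed by cursor; `None` represents the first request.
PAGES = {
    None: {"results": [{"id": "1"}, {"id": "2"}], "paging": {"next": {"after": "p2"}}},
    "p2": {"results": [{"id": "3"}], "paging": {}},
}

def fake_request(params):
    # Stand-in for `request(url, params).json()` above.
    return PAGES[params.get("after")]

def gen_records(params, path="results", more_key="paging"):
    while True:
        data = fake_request(params)
        yield from data[path]
        # Follow the cursor exactly as gen_request_custom_objects does.
        after = data.get(more_key, {}).get("next", {}).get("after")
        if after is None:
            break
        params["after"] = after

print([row["id"] for row in gen_records({"limit": 100})])  # -> ['1', '2', '3']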

def sync_custom_objects(stream_id, primary_key, bookmark_key, catalog, STATE, params, is_custom_object=False):
    """
    Synchronize records for the given stream
    """
    mdata = metadata.to_map(catalog.get('metadata'))
    if is_custom_object:
        url = get_url("custom_objects", object_name=stream_id)
    else:
        url = get_url(stream_id)
    max_bk_value = bookmark_value = utils.strptime_with_tz(
        get_start(STATE, stream_id, bookmark_key))

    LOGGER.info(f"Sync record for {stream_id} from {bookmark_value}")
    schema = catalog.get('schema')
    singer.write_schema(stream_id, schema, [primary_key],
                        [bookmark_key], catalog.get('stream_alias'))

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as transformer:
        # To handle records updated between the start and the end of the table sync,
        # store the current sync start in the state and do not move the bookmark past this value.
        sync_start_time = utils.now()
        for row in gen_request_custom_objects(stream_id, url, params, 'results', "paging"):
            # parse the string-formatted date into a datetime object
            modified_time = utils.strptime_to_utc(row[bookmark_key])

            # Check that the bookmark value is present on the record and that it
            # is greater than or equal to the previous bookmark value
            if modified_time and modified_time >= bookmark_value:
                # transform the record and keep only the fields selected in the catalog
                record = transformer.transform(lift_properties_and_versions(row), schema, mdata)
                singer.write_record(stream_id, record, catalog.get(
                    'stream_alias'), time_extracted=utils.now())
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

    # Don't bookmark past the start of this sync to account for records updated during the sync.
    new_bookmark = min(max_bk_value, sync_start_time)
    STATE = singer.write_bookmark(STATE, stream_id, bookmark_key, utils.strftime(new_bookmark))
    singer.write_state(STATE)
    return STATE
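Why clamp with `min()`: a record can be updated after the sync starts but before the stream finishes, and advancing the bookmark past the sync start would skip such updates on the next run. A tiny sketch with hypothetical timestamps:

from datetime import datetime, timezone

sync_start_time = datetime(2023, 12, 1, 0, 0, tzinfo=timezone.utc)
# A record updated five minutes into the sync pushes max_bk_value past the start.
max_bk_value = datetime(2023, 12, 1, 0, 5, tzinfo=timezone.utc)

# Clamping keeps the bookmark at the sync start, so the next run re-reads
# anything modified while this sync was in flight.
assert min(max_bk_value, sync_start_time) == sync_start_time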


def sync_custom_object_records(STATE, ctx, stream_id):
    """
    Function to sync records for each `custom_object` stream
    """
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    primary_key = "id"
    bookmark_key = "updatedAt"

    params = {'limit': 100,
              'associations': 'emails,meetings,notes,tasks,calls,conversations,contacts,companies,deals,tickets',
              'properties': get_selected_property_fields(catalog, mdata),
              'archived': False
              }
    return sync_custom_objects(stream_id, primary_key, bookmark_key, catalog, STATE, params, is_custom_object=True)
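Putting the pieces together, the request issued for a hypothetical custom object named `cars` would look roughly like this sketch; the base URL, property list, and association set are assumptions for illustration:

from urllib.parse import urlencode

BASE_URL = "https://api.hubapi.com"  # assumed HubSpot API base
endpoint = "/crm/v3/objects/p_{object_name}".format(object_name="cars")  # hypothetical object
params = {
    "limit": 100,
    "associations": "emails,meetings,notes",  # abbreviated for the example
    "properties": "model,year",  # stand-in for get_selected_property_fields(...)
    "archived": False,
}
print(BASE_URL + endpoint + "?" + urlencode(params))
# -> https://api.hubapi.com/crm/v3/objects/p_cars?limit=100&...&archived=False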


@attr.s
class Stream:
    tap_stream_id = attr.ib()
@@ -1131,6 +1225,64 @@ class Stream:
    Stream('engagements', sync_engagements, ["engagement_id"], 'lastUpdated', 'FULL_TABLE')
]

# pylint: disable=inconsistent-return-statements
def generate_custom_streams(mode, catalog=None):
    """
    - In DISCOVER mode, fetch the custom schemas from the API endpoint and set the schema for the custom objects.
    - In SYNC mode, extend STREAMS with the custom objects.
    Args:
        mode (str): The mode indicating whether to DISCOVER or SYNC custom streams.
        catalog (dict): The catalog containing stream information.
    Returns:
        List[dict] or List[str]: A list of custom stream dictionaries in DISCOVER mode, or a list of custom object names in SYNC mode.
    """
    custom_objects_schema_url = get_url("custom_objects_schema")
    if mode == "DISCOVER":
        custom_streams = []
        # Load HubSpot's shared schemas
        refs = load_shared_schema_refs()
        for custom_object in gen_request_custom_objects("custom_objects_schema", custom_objects_schema_url, {}, 'results', "paging"):
            stream_id = custom_object["name"]
            schema = utils.load_json(get_abs_path('schemas/shared/custom_objects.json'))
            custom_schema = parse_custom_schema(stream_id, custom_object["properties"], is_custom_object=True)
            schema["properties"]["properties"] = {
                "type": "object",
                "properties": custom_schema,
            }

            # Move properties to the top level
            custom_schema_top_level = {'property_{}'.format(k): v for k, v in custom_schema.items()}
            schema['properties'].update(custom_schema_top_level)

            final_schema = singer.resolve_schema_references(schema, refs)
            custom_streams.append({"stream": Stream(stream_id, sync_custom_object_records, ['id'], 'updatedAt', 'INCREMENTAL'),
                                   "schema": final_schema})

        return custom_streams

    elif mode == "SYNC":
        custom_objects = [custom_object["name"] for custom_object in gen_request_custom_objects("custom_objects_schema", custom_objects_schema_url, {}, 'results', "paging")]
        if len(custom_objects) > 0:
            for stream in catalog["streams"]:
                if stream["tap_stream_id"] in custom_objects:
                    STREAMS.append(Stream(stream["tap_stream_id"], sync_custom_object_records, ['id'], 'updatedAt', 'INCREMENTAL'))
        return custom_objects

def load_shared_schema_refs():
    shared_schemas_path = get_abs_path('schemas/shared')

    shared_file_names = [f for f in os.listdir(shared_schemas_path)
                         if os.path.isfile(os.path.join(shared_schemas_path, f))]

    shared_schema_refs = {}
    for shared_file in shared_file_names:
        with open(os.path.join(shared_schemas_path, shared_file)) as data_file:
            shared_schema_refs[shared_file] = json.load(data_file)

    return shared_schema_refs
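Since the refs map is keyed by file name and the `$ref` values in the shared schemas are file names too, `singer.resolve_schema_references` (used in the DISCOVER branch above) can splice them together. A minimal sketch with abbreviated stand-ins for the shared files:

import singer

# Abbreviated stand-in for schemas/shared/associations_schema.json.
refs = {"associations_schema.json": {"type": ["null", "object"]}}
schema = {
    "type": "object",
    "properties": {
        "associations": {
            "type": ["null", "object"],
            "properties": {"emails": {"$ref": "associations_schema.json"}},
        }
    },
}

resolved = singer.resolve_schema_references(schema, refs)
# Each {"$ref": "<filename>"} node is replaced by the schema stored under that key.
assert resolved["properties"]["associations"]["properties"]["emails"]["type"] == ["null", "object"]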

def get_streams_to_sync(streams, state):
    target_stream = singer.get_currently_syncing(state)
    result = streams
@@ -1152,6 +1304,7 @@ def get_selected_streams(remaining_streams, ctx):
    return selected_streams

def do_sync(STATE, catalog):
    custom_objects = generate_custom_streams(mode="SYNC", catalog=catalog)
    # Clear out keys that are no longer used
    clean_state(STATE)

@@ -1168,7 +1321,10 @@
        singer.write_state(STATE)

        try:
-            STATE = stream.sync(STATE, ctx) # pylint: disable=not-callable
+            if stream.tap_stream_id in custom_objects:
+                STATE = stream.sync(STATE, ctx, stream.tap_stream_id)
+            else:
+                STATE = stream.sync(STATE, ctx) # pylint: disable=not-callable
        except SourceUnavailableException as ex:
            error_message = str(ex).replace(CONFIG['access_token'], 10 * '*')
            LOGGER.error(error_message)
@@ -1210,8 +1366,7 @@ def validate_dependencies(ctx):
    if errs:
        raise DependencyException(" ".join(errs))

-def load_discovered_schema(stream):
-    schema = load_schema(stream.tap_stream_id)
+def get_metadata(stream, schema):
    mdata = metadata.new()

    mdata = metadata.write(mdata, (), 'table-key-properties', stream.key_properties)
@@ -1230,8 +1385,13 @@ def load_discovered_schema(stream):
if stream.tap_stream_id == "engagements":
mdata = metadata.write(mdata, ('properties', 'engagement'), 'inclusion', 'automatic')
mdata = metadata.write(mdata, ('properties', 'lastUpdated'), 'inclusion', 'automatic')
return metadata.to_list(mdata)

return schema, metadata.to_list(mdata)
def load_discovered_schema(stream):
schema = load_schema(stream.tap_stream_id)
mdata = get_metadata(stream, schema)

return schema, mdata

def discover_schemas():
    result = {'streams': []}
@@ -1247,6 +1407,14 @@ def discover_schemas():
            # Skip discovery mode on streams where the required scopes are missing
            warning_message = str(ex).replace(CONFIG['access_token'], 10 * '*')
            LOGGER.warning(warning_message)

    for custom_stream in generate_custom_streams(mode="DISCOVER"):
        LOGGER.info('Loading schema for Custom Object - %s', custom_stream["stream"].tap_stream_id)
        result['streams'].append({'stream': custom_stream["stream"].tap_stream_id,
                                  'tap_stream_id': custom_stream["stream"].tap_stream_id,
                                  'schema': custom_stream["schema"],
                                  'metadata': get_metadata(custom_stream["stream"], custom_stream["schema"])})
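The loop above surfaces each custom object in the catalog alongside the built-in streams. A hedged sketch of the entry appended for a hypothetical `cars` object, with the schema and metadata heavily abbreviated:

# Hypothetical catalog entry for a custom object named "cars".
catalog_entry = {
    "stream": "cars",
    "tap_stream_id": "cars",
    "schema": {"type": "object", "properties": {"id": {"type": ["null", "string"]}}},
    "metadata": [{"breadcrumb": [], "metadata": {"table-key-properties": ["id"]}}],
}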

    # Load the contacts_by_company schema
    LOGGER.info('Loading schema for contacts_by_company')
    contacts_by_company = Stream('contacts_by_company', _sync_contacts_by_company, ['company-id', 'contact-id'], None, 'FULL_TABLE')
34 changes: 34 additions & 0 deletions tap_hubspot/schemas/shared/associations_schema.json
@@ -0,0 +1,34 @@
{
  "type": [
    "null",
    "object"
  ],
  "properties": {
    "results": {
      "type": [
        "null",
        "array"
      ],
      "items": {
        "type": [
          "null",
          "object"
        ],
        "properties": {
          "id": {
            "type": [
              "null",
              "string"
            ]
          },
          "type": {
            "type": [
              "null",
              "string"
            ]
          }
        }
      }
    }
  }
}
44 changes: 44 additions & 0 deletions tap_hubspot/schemas/shared/custom_objects.json
@@ -0,0 +1,44 @@
{
  "type": "object",
  "properties": {
    "id": {
      "type": [
        "null",
        "string"
      ]
    },
    "createdAt": {
      "type": ["null", "string"],
      "format": "date-time"
    },
    "updatedAt": {
      "type": ["null", "string"],
      "format": "date-time"
    },
    "archived": {
      "type": [
        "null",
        "boolean"
      ]
    },
    "associations": {
      "type": [
        "null",
        "object"
      ],
      "properties": {
        "emails": {"$ref": "associations_schema.json"},
        "meetings": {"$ref": "associations_schema.json"},
        "notes": {"$ref": "associations_schema.json"},
        "tasks": {"$ref": "associations_schema.json"},
        "calls": {"$ref": "associations_schema.json"},
        "conversation_session": {"$ref": "associations_schema.json"},
        "companies": {"$ref": "associations_schema.json"},
        "contacts": {"$ref": "associations_schema.json"},
        "deals": {"$ref": "associations_schema.json"},
        "products": {"$ref": "associations_schema.json"},
        "tickets": {"$ref": "associations_schema.json"}
      }
    }
  }
}
