# Crunchbase API v4 setup

**From CrunchBase**:
- How to Use the API - https://data.crunchbase.com/docs/using-the-api
- How to find which fields and data are available via our API - https://data.crunchbase.com/docs/available-data
- Swaggerhub (contains API calls and responses) - https://app.swaggerhub.com/apis-docs/Crunchbase/crunchbase-enterprise_api/#/Entity/get_entities_organizations__entity_id_
- API Reference - https://data.crunchbase.com/reference

### Questions for CB Contact
- How can I easily pull CB UUIDs from a list of companies? Autocomplete? Or can we access that through CB<>SFDC integration?

### Following this thread: https://medium.com/priyanshumadan/extract-data-from-crunchbase-api-using-python-8e99ed6bc73e

### Next Steps (8/8):
- Query 1: Organization --> Financial Summary, Funding Rounds, Board Members
- Query 2: Investor --> Board Affiliations, Partner Funding Rounds
- Query 3: Match-making (Corporate BoD <> Partner Investors)
- Push to Google Sheets: https://towardsdatascience.com/how-to-integrate-google-sheets-and-jupyter-notebooks-c469309aacea

### Import all necessary packages

In [178]:
# Requests allows you to send HTTP/1.1 requests extremely easily.
# There’s no need to manually add query strings to your URLs, or to form-encode your POST data. 
# Keep-alive and HTTP connection pooling are 100% automatic, thanks to urllib3.
import requests
import json
import pandas as pd
from pandas import json_normalize 
from operator import itemgetter
from user_key import userkey # Pulls in P1s Crunchbase API user key

# My own way

In [179]:
def url_count(query, query_type):
    '''
    Return the `count` field of a Crunchbase v4 search response.

    QUERY
    Search body; it is sent as the JSON payload of the POST request.

    QUERY_TYPE
    Path segment appended to the `/searches/` endpoint (this file calls it with "jobs").
    '''
    endpoint = "https://api.crunchbase.com/api/v4/searches/" + query_type
    # `userkey` comes from the local `user_key` module and is sent as the query-string parameters.
    # https://www.w3schools.com/python/ref_requests_post.asp
    response = requests.post(endpoint, params=userkey, json=query)
    payload = json.loads(response.text)
    return payload["count"]

def url_extraction(query, query_type):
    '''
    Run a Crunchbase v4 search and add the returned entities to the global `raw` dataframe.

    QUERY
    Search body; it is sent as the JSON payload of the POST request.

    QUERY_TYPE
    Path segment appended to the `/searches/` endpoint (this file calls it with "jobs").

    Returns nothing. The caller must create the module-level `raw` dataframe
    (e.g. `raw = pd.DataFrame()`) before the first call.
    '''
    # Results are accumulated in the module-level `raw` dataframe.
    global raw
    # POST method with API URL, query_type as a parameter, and passing query as json.
    # https://www.w3schools.com/python/ref_requests_post.asp
    r = requests.post("https://api.crunchbase.com/api/v4/searches/" + query_type, params=userkey, json=query)
    result = json.loads(r.text)
    # Flatten the nested entity dictionaries into dotted column names.
    normalized_raw = json_normalize(result['entities'])
    # DataFrame.append() was removed in pandas 2.0; pd.concat() is the supported replacement
    # and, with ignore_index=True, renumbers the rows the same way.
    raw = pd.concat([raw, normalized_raw], ignore_index=True)

def autocompletes(query, collection_ids_list=None, limit=None):
    '''
    Suggests matching Identifier entities based on the query and entity_def_ids provided.
    
    QUERY
    Value to perform the autocomplete search with.
    
    COLLECTION_IDS_LIST
    A list (or tuple) of collection ids to search against; it is sent to the API as the
    comma separated `collection_ids` parameter. A plain string is passed through unchanged.
    Leaving this blank means it will search across all identifiers. 
    Entity defs can be constrained to specific facets by providing them as facet collections. 
    Relationship collections will resolve to their underlying entity def.
    Collection ids are: organizations, people, funding_rounds, acquisitions, investments,
    events, press_references, funds, event_appearances, ipos, ownerships, categories, 
    category_groups, locations, jobs
    
    LIMIT
    Number of results to retrieve; default = 10, max = 25
    
    RETURNS
    A dataframe built by json_normalize from the `entities` list of the response.
    '''
    params = {**userkey, "query": query}
    if collection_ids_list:
        if isinstance(collection_ids_list, (list, tuple)):
            # The API expects one comma separated value. Handing `requests` a list would
            # instead emit the key once per element (collection_ids=a&collection_ids=b).
            params["collection_ids"] = ",".join(map(str, collection_ids_list))
        elif isinstance(collection_ids_list, str):
            params["collection_ids"] = collection_ids_list
    # bool is a subclass of int; keep rejecting True/False like the original exact-type check did.
    if limit and isinstance(limit, int) and not isinstance(limit, bool):
        params["limit"] = limit
    r = requests.get("https://api.crunchbase.com/api/v4/autocompletes", params=params)
    result = json.loads(r.text)
    # json_normalize already returns a DataFrame, so no further conversion is needed.
    return json_normalize(result["entities"])

def whoKnows(name, df, person_first=True):
    '''
    Describe who shares a company with `name`, as a single comma separated string.

    NAME
    Person to look up in the "properties.person_identifier.value" column.

    DF
    Dataframe with the columns "properties.organization_identifier.value" and
    "properties.person_identifier.value" (for example the frame filled by
    url_extraction(query, "jobs")).

    PERSON_FIRST
    If True (default), the output is grouped by person:
    'Name1 (Company1, Company2), Name2 (Company2), Name3 (Company1), ...'
    If False, the output is grouped by company:
    'Company1 (Name1, Name2, Name3), Company2 (Name1), Company3 (Name2, Name3), ...'

    Rows belonging to `name` are excluded. An empty string is returned when nobody
    else is listed at any of that person's companies.
    '''
    org_col = "properties.organization_identifier.value"
    person_col = "properties.person_identifier.value"

    # Every company the named person is affiliated with.
    own_companies = set(df[org_col][df[person_col] == name].to_list())

    # Everybody else at those companies, ordered by company.
    others = df[df[org_col].isin(own_companies)]
    others = others[others[person_col] != name].sort_values([org_col])

    people = others[person_col].to_list()
    companies = others[org_col].to_list()

    # Group either companies under each person, or people under each company.
    pairs = zip(people, companies) if person_first else zip(companies, people)
    grouped = {}
    for key, member in pairs:
        grouped.setdefault(key, []).append(member)

    return ", ".join(
        key + " (" + ", ".join(members) + ")"
        for key, members in grouped.items()
    )

def makequery_jobs(uuid, limit=1000):
    '''
    Build the job search body for Boardroom Ally current board/executive affiliations.

    UUID
    List of person UUIDs; used as the values of the `person_identifier` predicate.

    LIMIT
    Value of the `limit` entry of the query (default 1000).

    The query keeps jobs where:
    - the person is one of the `uuid` values
    - the job is current (`is_current` equals "true")
    - the job type is not `employee`
    '''
    def predicate(field_id, operator_id, values):
        # One search filter in the shape used by the `query` list.
        return {
            "type": "predicate",
            "field_id": field_id,
            "operator_id": operator_id,
            "values": values,
        }

    # Fields requested for every returned job.
    requested_fields = [
        "created_at",
        "employee_featured_order",
        "ended_on",
        "entity_def_id",
        "identifier",
        "is_current",
        "job_type",
        "name",
        "organization_identifier",
        "permalink",
        "person_identifier",
        "short_description",
        "started_on",
        "title",
        "updated_at",
        "uuid",
    ]
    filters = [
        predicate("person_identifier", "includes", uuid),
        predicate("is_current", "eq", ["true"]),
        predicate("job_type", "not_includes", ["employee"]),
    ]
    return {"field_ids": requested_fields, "limit": limit, "query": filters}

def makequery_p1_jobs(uuid, limit=1000):
    '''
    Build the job search body for P1 companies' current board affiliations.

    UUID
    List of organization UUIDs; used as the values of the `organization_identifier` predicate.

    LIMIT
    Value of the `limit` entry of the query (default 1000).

    The query keeps jobs where:
    - the organization is one of the `uuid` values
    - the job is current (`is_current` equals "true")
    - the job type is neither `employee` nor `executive`
    '''
    # (field_id, operator_id, values) for each filter, in request order.
    filter_specs = (
        ("organization_identifier", "includes", uuid),
        ("is_current", "eq", ["true"]),
        ("job_type", "not_includes", ["employee", "executive"]),
    )
    return {
        "field_ids": [
            "created_at",
            "employee_featured_order",
            "ended_on",
            "entity_def_id",
            "identifier",
            "is_current",
            "job_type",
            "name",
            "organization_identifier",
            "permalink",
            "person_identifier",
            "short_description",
            "started_on",
            "title",
            "updated_at",
            "uuid",
        ],
        "limit": limit,
        "query": [
            {
                "type": "predicate",
                "field_id": field_id,
                "operator_id": operator_id,
                "values": values,
            }
            for field_id, operator_id, values in filter_specs
        ],
    }

### Get UUID for each Boardroom Ally target

In [180]:
# Look up the Crunchbase person UUID for each search term in the CONCAT column of the tracker CSV.
job_terms_csv = "Boardroom Allies Tracker - job_search_terms.csv"
search_jobs = sorted(pd.read_csv(job_terms_csv).CONCAT.to_list())
uuid_jobs = []
for item in search_jobs:
    # Only the top autocomplete suggestion is requested (limit=1) and its UUID is kept.
    found = autocompletes(item, ["people"], limit=1)
    top_uuid = found["identifier.uuid"][0]
    uuid_jobs.append(top_uuid)

### Get UUID for each P1 Equity Company

In [203]:
# Look up the Crunchbase organization UUID for each name in the Company column of the tracker CSV.
company_terms_csv = "Boardroom Allies Tracker - company_search_terms.csv"
search_companies = pd.read_csv(company_terms_csv).Company.to_list()
uuid_companies = []
for item in search_companies:
    # Only the top autocomplete suggestion is requested (limit=1) and its UUID is kept.
    found = autocompletes(item, ["organizations"], limit=1)
    top_uuid = found["identifier.uuid"][0]
    uuid_companies.append(top_uuid)

## Boardroom Allies Tracker: Boardroom Allies Affiliations

In [185]:
# Fetch the current affiliations of every Boardroom Ally and export them to CSV.
query = makequery_jobs(uuid_jobs)
raw = pd.DataFrame()  # Global raw variable that url_extraction() appends to
comp_count = url_count(query, "jobs")  # Result count reported by the API; not read again in this cell
url_extraction(query, "jobs")

# Columns kept in the export, in output order.
affiliation_columns = [
    "properties.organization_identifier.value",
    "properties.person_identifier.value",
    "properties.title",
    "properties.job_type",
    "properties.started_on.value",
    "properties.updated_at",
]
affiliations = raw[affiliation_columns]
affiliations = affiliations.sort_values(["properties.organization_identifier.value"])
affiliations.to_csv("boardroom_allies_affiliations.csv")
affiliations.head()

Unnamed: 0,properties.organization_identifier.value,properties.person_identifier.value,properties.title,properties.job_type,properties.started_on.value,properties.updated_at
335,140 Proof,Ronald Conway,Member of the Board of Advisory,advisor,,2018-02-13T01:25:41Z
678,23andMe,Roelof Botha,Board Member,board_member,2017-09-01,2019-03-30T11:13:50Z
251,2U Inc.,Timothy Haley,Member of the Board of Directors,board_member,2009-01-01,2018-02-13T01:23:50Z
758,3Com,Sanford Robertson,Member of the Board of Directors,board_member,,2018-02-13T01:34:39Z
353,3D Robotics,Jason Mendelson,Board Member,board_member,,2018-02-13T11:48:53Z


## Boardroom Allies Tracker: Matchmaking

In [209]:
# Build the matchmaking table: for each person, who they share a company with
# (grouped by person and by company) and which P1 companies they are affiliated with.
org_col = "properties.organization_identifier.value"
person_col = "properties.person_identifier.value"

affiliations = pd.read_csv("boardroom_allies_affiliations.csv")
# Affiliation rows whose organization is one of the P1 company search terms.
affiliations_p1 = affiliations[affiliations[org_col].isin(search_companies)]
names = sorted(set(affiliations[person_col].to_list()))

knows = []
knows_byco = []
knows_byp1 = []
for item in names:
    knows.append(whoKnows(item, affiliations))
    knows_byco.append(whoKnows(item, affiliations, False))
    # P1 companies this person is affiliated with, as one comma separated string.
    knows_byp1_list = affiliations_p1[affiliations_p1[person_col] == item][org_col].to_list()
    p1_string = ", ".join(knows_byp1_list)
    knows_byp1.append(p1_string)

d = {'Name': names, 'Knows': knows, 'Knows_ByCompany': knows_byco, "Knows_P1Company": knows_byp1}
affiliations_matchmaking = pd.DataFrame(d, columns=["Name", "Knows", "Knows_ByCompany", "Knows_P1Company"])
affiliations_matchmaking.to_csv("boardroom_allies_matchmaking.csv")
affiliations_matchmaking.head()

Unnamed: 0,Name,Knows,Knows_ByCompany,Knows_P1Company
0,Ajay Agarwal,"Enrique Salem (Bain Capital Ventures, Clari), ...","Bain Capital Ventures (Enrique Salem), Clari (...",
1,Andrew Chen,"Susan Su (500 Startups), Jeff Jordan (Andreess...","500 Startups (Susan Su), Andreessen Horowitz (...",Dropbox
2,Andy Weissman,David Pakman (YouNow),YouNow (David Pakman),
3,Arne Duncan,"Ryan Hinkle (Pluralsight, Turnitin), Tim Maudl...","Pluralsight (Ryan Hinkle, Tim Maudlin), Turnit...",Pluralsight
4,Ben Horowitz,"Jeff Jordan (Andreessen Horowitz), John O'Farr...","Andreessen Horowitz (Jeff Jordan, John O'Farre...",Okta


## P1 Companies Board Members

In [210]:
# Fetch the current board-level jobs at every P1 company and export them to CSV.
query = makequery_p1_jobs(uuid_companies)
raw = pd.DataFrame()  # Global raw variable that url_extraction() appends to
comp_count = url_count(query, "jobs")  # Result count reported by the API; not read again in this cell
url_extraction(query, "jobs")

# Columns kept in the export, in output order.
board_columns = [
    "properties.organization_identifier.value",
    "properties.person_identifier.value",
    "properties.title",
    "properties.job_type",
    "properties.started_on.value",
    "properties.updated_at",
]
board_of_p1_companies = raw[board_columns]
board_of_p1_companies = board_of_p1_companies.sort_values(["properties.organization_identifier.value"])
board_of_p1_companies.to_csv("board_of_p1_companies_affiliations.csv")
board_of_p1_companies.head()

Unnamed: 0,properties.organization_identifier.value,properties.person_identifier.value,properties.title,properties.job_type,properties.started_on.value,properties.updated_at
32,Atlassian,Steve Sordello,Director,board_member,2015-11-01,2019-09-06T13:26:35Z
177,Atlassian,Enrique Salem,Director,board_member,2013-07-01,2019-09-06T13:19:08Z
105,Atlassian,Heather Mirjahangir,Director,board_member,2015-11-01,2019-09-06T13:26:35Z
114,Atlassian,Shona Brown,Director,board_member,2015-11-01,2019-09-06T13:26:35Z
20,Atlassian,Mike Cannon-Brookes,Director,board_member,2002-02-01,2019-09-06T12:48:01Z


# ----------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------


### Walk Through Example: Los Angeles Companies query

The `query` dictionary below is the query that will be posted to the request API to receive the data.

- `field_ids: []` -- List of all the entities needed from Crunchbase

- `limit: 1000` -- Defines the number of results the query returns. Crunchbase Pro has a maximum limit of 1000.

- `query: {}` -- Defines the actual query part. 
    - This is an example query for companies in Los Angeles. `location_identifier` is the universally unique identifier (UUID) of Los Angeles. 
    - The second part of the query is what we want Crunchbase to return, which in this case is company data. Hence, we set `facet_ids` value as `company`.

In [86]:
# UUID that Crunchbase assigns to the location "Los Angeles".
los_angeles_uuid = "4ce61f42-f6c4-e7ec-798d-44813b58856b"

# Fields returned for every matching organization.
organization_fields = [
    "identifier",
    "location_identifiers",
    "short_description",
    "categories",
    "num_employees_enum",
    "revenue_range",
    "operating_status",
    "website",
    "linkedin",
]

# First filter: the organization's locations include Los Angeles.
in_los_angeles = {
    "type": "predicate",
    "field_id": "location_identifiers",
    "operator_id": "includes",
    "values": [los_angeles_uuid],
}

# Second filter: only return company data.
is_company = {
    "type": "predicate",
    "field_id": "facet_ids",
    "operator_id": "includes",
    "values": ["company"],
}

query = {
    "field_ids": organization_fields,
    "limit": 1000,  # 1000 is the maximum described in the walk-through above
    "query": [in_los_angeles, is_company],
}

### Extract data with user-defined functions

With the query set up, the following code defines two functions: `company_count` returns the number of matching companies, and `url_extraction` extracts the data and saves it in a pandas DataFrame.

`company_count`:
- Returns `count`
- Equal to the total number of query results, which may be more than 1000

`url_extraction`
- Posts the query, normalizes the returned JSON data, and appends it to the global `raw` DataFrame:
- Keys of `result`: `count`, `entities`
- `entities` -- A list of dictionaries of the top 1000 results. Each list item has keys `uuid` and `properties`

In [92]:
def company_count(query):
    '''
    Return the `count` field of an organizations search response.

    QUERY
    Search body; it is sent as the JSON payload of the POST request.
    '''
    endpoint = "https://api.crunchbase.com/api/v4/searches/organizations"
    # `userkey` is sent as the query-string parameters of the request.
    response = requests.post(endpoint, params=userkey, json=query)
    payload = json.loads(response.text)
    return payload["count"]

def url_extraction(query):
    '''
    Run an organizations search and add the returned entities to the global `raw` dataframe.

    QUERY
    Search body; it is sent as the JSON payload of the POST request.

    Returns nothing. The caller must create the module-level `raw` dataframe
    (e.g. `raw = pd.DataFrame()`) before the first call.
    '''
    # Results are accumulated in the module-level `raw` dataframe.
    global raw
    # POST method with API URL, userkey as a parameter, and passing query as json.
    r = requests.post("https://api.crunchbase.com/api/v4/searches/organizations", params=userkey, json=query)
    result = json.loads(r.text)
    # Flatten the nested entity dictionaries into dotted column names.
    normalized_raw = json_normalize(result['entities'])
    # DataFrame.append() was removed in pandas 2.0; pd.concat() is the supported replacement
    # and, with ignore_index=True, renumbers the rows the same way.
    raw = pd.concat([raw, normalized_raw], ignore_index=True)

## For looping over > 1000 entries

Since there are more than 1000 companies, I had to loop my query until I got all the results. I did this by adding an `after_id` key to the query with the last fetched UUID as its value, so each request fetches the data that comes after that UUID.

In [75]:
# Page through every result of `query`, one request per iteration, accumulating rows in `raw`.
raw = pd.DataFrame()  # Global raw variable that url_extraction() appends to

# Drop any cursor left over from a previous run so paging starts at the first result.
# (The earlier `query = query.pop("after_id")` replaced the whole query dict with the
# popped UUID string, so the following request no longer carried the query.)
query.pop("after_id", None)

comp_count = company_count(query)  # How much data is there to loop through?
data_acq = 0  # Number of rows fetched so far

while data_acq < comp_count:
    if data_acq:
        # Resume after the most recently fetched entity.
        query["after_id"] = raw["uuid"].iloc[-1]
    url_extraction(query)
    fetched = len(raw)
    if fetched == data_acq:
        # The last request added nothing; stop rather than repeat it forever.
        break
    data_acq = fetched

before2: 0
after2: 1000
before0: 1000
after0: 2000
before0: 2000
after0: 3000
before0: 3000
after0: 4000
before0: 4000
after0: 5000
before0: 5000
after0: 6000
before0: 6000
after0: 7000
before0: 7000
after0: 8000
before0: 8000
after0: 8544


In [76]:
# Display the first rows of the accumulated `raw` results.
raw.head()

Unnamed: 0,uuid,properties.website.value,properties.identifier.permalink,properties.identifier.image_id,properties.identifier.uuid,properties.identifier.entity_def_id,properties.identifier.value,properties.linkedin.value,properties.short_description,properties.operating_status,properties.num_employees_enum,properties.categories,properties.location_identifiers,properties.revenue_range
0,2bc2d5c9-37d3-86b4-ceba-f5cf3f34dc82,http://www.pipe.com,pipe,morjb7ivpibxte3wlew5,2bc2d5c9-37d3-86b4-ceba-f5cf3f34dc82,organization,Pipe,https://www.linkedin.com/company/pipetechnolog...,Pipe is the new way for SaaS companies to fina...,active,c_00011_00050,"[{'entity_def_id': 'category', 'permalink': 'f...","[{'permalink': 'los-angeles-california', 'uuid...",
1,e4f41fbf-eea3-2db9-0c69-cd16f5cfc0e9,https://www.hulu.com/,hulu,bk8cux6dapq8qjzylfaj,e4f41fbf-eea3-2db9-0c69-cd16f5cfc0e9,organization,Hulu,https://www.linkedin.com/company/hulu,Hulu is an online video service that offers a ...,active,c_01001_05000,"[{'entity_def_id': 'category', 'permalink': 'c...","[{'permalink': 'los-angeles-california', 'uuid...",r_00100000
2,5df7b3c7-d0f4-2ff1-b7ad-74c67706cd2f,http://blavityinc.com/,blavity,tgtkxxka1bwmq4lak1dj,5df7b3c7-d0f4-2ff1-b7ad-74c67706cd2f,organization,"Blavity, Inc.",https://www.linkedin.com/company/blavity,"Blavity, Inc. is home to the largest network o...",active,c_00051_00100,"[{'entity_def_id': 'category', 'permalink': 's...","[{'permalink': 'los-angeles-california', 'uuid...",r_00001000
3,b2ad4b5b-1d18-2aae-9276-ae0c3fb75306,http://kernel.co/,kernel-co,vtkmcdz3nfbwkoavyqnq,b2ad4b5b-1d18-2aae-9276-ae0c3fb75306,organization,Kernel,https://www.linkedin.com/company/kernel/,Kernel is a neuroscience company that speciali...,active,c_00051_00100,"[{'entity_def_id': 'category', 'permalink': 'i...","[{'permalink': 'los-angeles-california', 'uuid...",r_00010000
4,1e349e9f-1206-6a22-434b-9ca4281a360d,https://www.bambee.com,bambee,fdmycopis7nbdph3hpu8,1e349e9f-1206-6a22-434b-9ca4281a360d,organization,Bambee,https://www.linkedin.com/company/bambee/,Bambee allows small and medium-sized businesse...,active,c_00051_00100,"[{'entity_def_id': 'category', 'permalink': 'h...","[{'permalink': 'los-angeles-california', 'uuid...",r_00000000


## Clean up!

In [81]:
# Readable labels for the `properties.revenue_range` codes.
revenue_range = {
    "r_00000000": "Less than $1M",
    "r_00001000": "$1M to $10M",
    "r_00010000": "$10M to $50M",
    "r_00050000": "$50M to $100M",
    "r_00100000": "$100M to $500M",
    "r_00500000": "$500M to $1B",
    "r_01000000": "$1B to $10B",
    "r_10000000": "$10B+",
}

# Readable labels for the `properties.num_employees_enum` codes.
employee_range = {
    "c_00001_00010": "1-10",
    "c_00011_00050": "11-50",
    "c_00051_00100": "51-100",
    "c_00101_00250": "101-250",
    "c_00251_00500": "251-500",
    "c_00501_01000": "501-1000",
    "c_01001_05000": "1001-5000",
    "c_05001_10000": "5001-10000",
    "c_10001_max": "10001+",
}


def _joined_values(cell):
    # Join the 'value' of every entry in a list cell with commas; any non-list cell becomes "Not found".
    if isinstance(cell, list):
        values = [entry['value'] for entry in cell]
    else:
        values = ["Not found"]
    return ",".join(map(str, values))


# Tidy table: one row per organization, with readable column names and labels.
master = pd.DataFrame({
    "uuid": raw["uuid"],
    "company": raw["properties.identifier.value"],
    "description": raw["properties.short_description"],
    "categories": raw["properties.categories"].apply(_joined_values),
    "location": raw["properties.location_identifiers"].apply(_joined_values),
    "revenue": raw["properties.revenue_range"].map(revenue_range),
    "num_of_employees": raw["properties.num_employees_enum"].map(employee_range),
    "linkedin": raw["properties.linkedin.value"],
    "website": raw["properties.website.value"],
    "status": raw["properties.operating_status"],
})
master = master.fillna("NA")

In [90]:
# Display the raw `properties.categories` cell of the first row.
raw["properties.categories"][0]

[{'entity_def_id': 'category',
  'permalink': 'finance',
  'uuid': 'ae5718dc-8602-c733-e62b-28e6167091ad',
  'value': 'Finance'},
 {'entity_def_id': 'category',
  'permalink': 'financial-services',
  'uuid': '90b4194f-1d4f-ff5c-d7a6-6b6f32ae4892',
  'value': 'Financial Services'},
 {'entity_def_id': 'category',
  'permalink': 'internet',
  'uuid': '2772a539-e636-09dc-30dc-b1a7deb98d0e',
  'value': 'Internet'},
 {'entity_def_id': 'category',
  'permalink': 'saas-5c4e',
  'uuid': '5c4e6926-5ff7-b188-0892-c8eb036c5ace',
  'value': 'SaaS'}]

In [91]:
# Display the first rows of the cleaned-up `master` table.
master.head()

Unnamed: 0,uuid,company,description,categories,location,revenue,num_of_employees,linkedin,website,status
0,2bc2d5c9-37d3-86b4-ceba-f5cf3f34dc82,Pipe,Pipe is the new way for SaaS companies to fina...,"Finance,Financial Services,Internet,SaaS","Los Angeles,California,United States,North Ame...",,11-50,https://www.linkedin.com/company/pipetechnolog...,http://www.pipe.com,active
1,e4f41fbf-eea3-2db9-0c69-cd16f5cfc0e9,Hulu,Hulu is an online video service that offers a ...,"Content,Film,TV,Video,Video Streaming","Los Angeles,California,United States,North Ame...",$100M to $500M,1001-5000,https://www.linkedin.com/company/hulu,https://www.hulu.com/,active
2,5df7b3c7-d0f4-2ff1-b7ad-74c67706cd2f,"Blavity, Inc.","Blavity, Inc. is home to the largest network o...","Social Media,Software","Los Angeles,California,United States,North Ame...",$1M to $10M,51-100,https://www.linkedin.com/company/blavity,http://blavityinc.com/,active
3,b2ad4b5b-1d18-2aae-9276-ae0c3fb75306,Kernel,Kernel is a neuroscience company that speciali...,"Intelligent Systems,Life Science,Neuroscience","Los Angeles,California,United States,North Ame...",$10M to $50M,51-100,https://www.linkedin.com/company/kernel/,http://kernel.co/,active
4,1e349e9f-1206-6a22-434b-9ca4281a360d,Bambee,Bambee allows small and medium-sized businesse...,"Human Resources,SaaS,Software","Los Angeles,California,United States,North Ame...",Less than $1M,51-100,https://www.linkedin.com/company/bambee/,https://www.bambee.com,active


### Tinkering with python-crunchbase module

https://github.com/anglinb/python-crunchbase

In [162]:
"""
Python library for the CrunchBase api.
Copyright (c) 2010 Apurva Mehta <mehta.apurva@gmail.com> for CrunchBase class
Edit made by Brian Anglin <brianranglin@gmail.com> to...
  * Update wrapper for API version 2.0 https://developer.crunchbase.com/docs
Edits made by Alexander Pease <alexander@usv.com> to...
  * Ensure compliance with 2013 API key requirement
  * Fix namespace conventions (ex: 'Kapor Capital' is sent as 'kapor+capital')
  * Functions requiring parsing of CrunchBase-return JSON (ex. list investors)
  * If HTTP request fails, return None instead of raising Exception
  * Set strict=false for json.loads(). Avoids some errors in the CB API.
  * Sanitize strings used as argument for __webRequest
"""

__author__ = 'Brian Anglin, Apurva Mehta, Patrick Reilly, Daniel Mendalka'
__version__ = '2.0.1'

import urllib
import json
import unicodedata
from urllib import parse, request
from urllib.error import HTTPError

# Prefix of every request URL built by the CrunchBase class below.
# NOTE(review): API_URL evaluates to 'http://api.crunchbase.com/v4/', whereas the
# requests-based helpers earlier in this file call 'https://api.crunchbase.com/api/v4/'.
# Confirm which address is correct before relying on this wrapper.
API_BASE_URL = 'http://api.crunchbase.com/'
API_VERSION = '4' #UPDATE
API_URL = API_BASE_URL + 'v' + API_VERSION + '/'


class CrunchBase:
    """Small client that builds CrunchBase request URLs and decodes the JSON replies.

    Every ``get*`` method returns the decoded JSON document, or ``None`` when the
    HTTP request failed with an ``HTTPError``.
    """

    def __init__(self, api_key, cache=None):
        """Store the API key and the response cache.

        Args:
            api_key: Key appended to every URL as the ``user_key`` parameter.
            cache: Optional dict mapping request URL to a cache entry. A fresh
                dict is created per instance when omitted, so instances do not
                silently share one mutable default.
        """
        self.api_key = api_key
        self.__cache = {} if cache is None else cache

    def __webRequest(self, url):
        """Fetch ``url`` and return the raw response body.

        Returns the cached body on a 304 response (``None`` if nothing was cached
        for that URL) and ``None`` when the request fails with ``HTTPError``.
        """
        print('Making request to: ', url) #UPDATE
        try:
            opener = request.build_opener(NotModifiedHandler()) #UPDATE
            req = request.Request(url) #UPDATE
            # The context manager closes the connection once the body has been read.
            with opener.open(req) as url_handle:
                if getattr(url_handle, 'code', None) == 304:
                    print('Got 304 response, no body send') #UPDATE
                    cached = self.__cache.get(url)
                    return cached['response'] if cached else None
                response = url_handle.read()
        except HTTPError: #UPDATE
            print('HTTPError calling ' + url) #UPDATE
            return None

        self.__cache[url] = {
            'response': response,
            # Keep the key out of the stored URL. URLs built by this class carry it as
            # '?user_key=', so the previous '?api_key=' pattern never matched.
            'url': url.replace('?user_key=' + self.api_key, ''),
        }
        return response

    @staticmethod
    def _parse(response):
        """Decode a JSON body; pass the ``None`` of a failed request through unchanged."""
        return None if response is None else json.loads(response)

    @staticmethod
    def _quote(value):
        """URL-quote ``value``, converting non-string values (e.g. ``page=2``) to text first."""
        if not isinstance(value, (str, bytes)):
            value = str(value)
        return parse.quote_plus(value)

    def _getCollection(self, collection, args):
        """Request a list endpoint such as ``people`` with the given query arguments."""
        url = API_URL + collection + '/?user_key=' + self.api_key + self.createQueryArgs(args)
        return self._parse(self.__webRequest(url))

    def createQueryArgs(self, kwargs):
        """Return ``kwargs`` encoded as ``&key=value`` pairs (empty string for no arguments)."""
        return ''.join(
            '&' + self._quote(key) + '=' + self._quote(value)
            for key, value in kwargs.items()
        )

    def getSingleObjectForPath(self, path, namespace):
        """This returns result of a single path in JSON format"""
        if not path.startswith(namespace+'/'):
            path = namespace+'/'+path
        url = API_URL + path + '/?user_key='+ self.api_key
        return self._parse(self.__webRequest(url))

    def getOrganizations(self, query, **kwargs):
        """This returns result of an organization search query in JSON format. 
        Optional: name, domain_name, organization_types, location_uuids, category_uuids, page, order [created_at DESC/ASC, updated_at DESC/ASC]"""
        return self._getCollection('organizations', {'query': query, **kwargs})

    def getOrganization(self, path):
        """This returns result of a single organization in JSON format"""
        return self.getSingleObjectForPath(path, 'organization')

    def getPeople(self, **kwargs):
        """This returns result of people in JSON format. Optional: page,  order [created_at DESC/ASC, updated_at DESC/ASC]"""
        return self._getCollection('people', kwargs)

    def getPerson(self, path):
        """This returns result of a single person in JSON format"""
        return self.getSingleObjectForPath(path, 'person')

    def getProducts(self, **kwargs):
        """This returns result of products in JSON format. Optional: page,  order [created_at DESC/ASC, updated_at DESC/ASC]"""
        return self._getCollection('products', kwargs)

    def getProduct(self, path):
        """This returns result of a single product in JSON format"""
        return self.getSingleObjectForPath(path, 'product')

    def getFundingRound(self, path):
        """This returns result of a single funding-round in JSON format"""
        return self.getSingleObjectForPath(path, 'funding-round')

    def getAcquisition(self, path):
        """This returns result of a single acquisition in JSON format"""
        return self.getSingleObjectForPath(path, 'acquisition')

    def getIPO(self, path):
        """This returns result of a single ipo in JSON format"""
        return self.getSingleObjectForPath(path, 'ipo')

    def getFundRaise(self, path):
        """This returns result of a single fund-raise in JSON format"""
        return self.getSingleObjectForPath(path, 'fund-raise')

    def getLocations(self, **kwargs):
        """This returns result of locations in JSON format. Optional: page"""
        return self._getCollection('locations', kwargs)

    def getCategories(self, **kwargs):
        """This returns result of categories in JSON format. Optional: page"""
        return self._getCollection('categories', kwargs)

# organization/dropbox
class CrunchBaseResponse(object):
    """Attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        vars(self).update(kwargs)

    def __repr__(self):
        # Rendered as ClassName({...attributes...}).
        return '{}({!r})'.format(type(self).__name__, vars(self))


class CrunchBaseError(Exception):
    """Exception type for this CrunchBase wrapper.

    NOTE(review): nothing in the visible code raises or catches it.
    """
    
    pass

class NotModifiedHandler(urllib.request.BaseHandler): #UPDATE
    """urllib handler that returns a response object for HTTP 304."""

    def http_error_304(self, req, fp, code, message, headers):
        """Wrap the 304 reply in an ``addinfourl`` whose ``code`` is the status code."""
        not_modified = urllib.response.addinfourl(fp, headers, req.get_full_url())
        not_modified.code = code
        return not_modified