In [43]:
from pathlib import Path

import gspread
import pandas as pd
from gspread_dataframe import set_with_dataframe
from oauth2client.service_account import ServiceAccountCredentials

# Auth: load service-account credentials from the local JSON key file and
# build an authorised gspread client from them.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]
creds = ServiceAccountCredentials.from_json_keyfile_name(
    '../gspread_creds.json',
    scope,
)
client = gspread.authorize(creds)

In [44]:
# Read the "validated" tab of the "Handbuch 1932" spreadsheet; every row comes
# back as a dict, and the list of dicts becomes one DataFrame.
worksheet = client.open("Handbuch 1932").worksheet("validated")
records = worksheet.get_all_records()
hb1932 = pd.DataFrame(records)

# Indicator columns that are turned into real booleans below.
bool_cols = [
    'Grossaktionär is a US comp',
    'Director attached to a US company',
    'US location linked to a director',
    'US-sounding names',
    'Bond issuance in the US',
    'Other indicators are strong',
]

# Only the exact string "TRUE" maps to True; every other cell value
# (including "FALSE" and blanks) maps to False.
hb1932[bool_cols] = hb1932[bool_cols].eq("TRUE")

In [45]:
def assign_link_type_HB(row):
    """Return the link-type code for one Handbuch row.

    The indicator columns are checked in a fixed priority order and the code
    of the first truthy one wins:

    1. 'Grossaktionär is a US comp'          -> 'USO'
    2. 'Director attached to a US company'   -> 'USOP'
    3. 'Other indicators are strong'         -> 'USOP'
    4. 'US location linked to a director'    -> 'USC'
    5. 'Bond issuance in the US'             -> 'USB'

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must support ``row[column_name]`` for the columns listed above.
        Columns are looked up lazily, in order, until one is truthy.

    Returns
    -------
    str
        One of 'USO', 'USOP', 'USC', 'USB', or 'Others' when no indicator
        is set.
    """
    # Order matters: earlier entries take precedence over later ones.
    precedence = (
        ('Grossaktionär is a US comp', 'USO'),
        ('Director attached to a US company', 'USOP'),
        ('Other indicators are strong', 'USOP'),
        ('US location linked to a director', 'USC'),
        ('Bond issuance in the US', 'USB'),
    )

    for indicator, code in precedence:
        if row[indicator]:
            return code

    return 'Others'

In [46]:
# Enrich the frame in one chain:
#   1. classify every row with assign_link_type_HB and tag it as org_type 'AG',
#   2. rename the source columns to the output headers,
#   3. build the 'Vol' citation string from the 'band' and 'firmname_page' columns.
hb1932 = (
    hb1932
    .assign(
        link_type=lambda frame: frame.apply(assign_link_type_HB, axis=1),
        org_type='AG',
    )
    .rename(
        columns={
            'Master US firm name': 'US Company',
            'Master name': 'German subsidiary',
            'corrected firm name': 'Affiliated German firm (as in book)',
            'US parent': 'US firm (as in book)',
        }
    )
    .assign(
        Vol=lambda frame: (
            "Vol " + frame['band'].astype(str)
            + ", p. " + frame['firmname_page'].astype(str)
        )
    )
)

# Keep only the output columns, in their final order.
hb1932_result = hb1932[[
    'German subsidiary',
    'US Company',
    'link_type',
    'org_type',
    'US firm (as in book)',
    'Affiliated German firm (as in book)',
    'Vol',
    'Grossaktionär is a US comp',
    'Director attached to a US company',
    'US location linked to a director',
    'US-sounding names',
    'Bond issuance in the US',
    'Other indicators',
    'Other indicators are strong',
]]

# Drop rows for which no indicator produced a link type.
hb1932_cleaned = hb1932_result.loc[hb1932_result['link_type'].ne('Others')]

In [None]:
# Write the filtered table to disk without the index column. The parent
# directory is created first, because to_csv does not create missing
# directories and would otherwise fail on a fresh checkout without ../output.
output_path = Path('../output/hb32_cleaned.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
hb1932_cleaned.to_csv(output_path, index=False)