In [18]:
import pandas as pd
import gspread
from gspread_dataframe import set_with_dataframe
from oauth2client.service_account import ServiceAccountCredentials

# Authorise a gspread client from the service-account key file.
# NOTE(review): oauth2client is deprecated upstream; consider migrating to
# gspread's google-auth based helpers — confirm against the installed gspread.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]
creds = ServiceAccountCredentials.from_json_keyfile_name(
    '../gspread_creds.json',
    scope,
)
client = gspread.authorize(creds)

In [19]:
# Open the "tfr500_summarized" spreadsheet and pick its "classified" tab.
spreadsheet = client.open("tfr500_summarized")
worksheet = spreadsheet.worksheet("classified")

# Fetch every record (a list of dicts) and load it into a DataFrame.
records = worksheet.get_all_records()
tfr500 = pd.DataFrame(records)

In [21]:
# Short codes for the organisation types that are tracked individually.
org_type_mapping = {
    'Corporation (Aktiengesellschaft)': 'AG',
    'GmbH': 'GmbH',
}

# Short codes for the investment types; keys are matched exactly
# (case-sensitive) against the 'Type of Investment' column.
link_type_mapping = {
    'ownership': 'USO',
    'unknown': 'USOP',
    'Partnership': 'USC',
    'branch': 'USO',
}

# Translate each source column through its mapping. Values with no entry in
# the mapping come back as NaN from .map() and are then labelled "Others".
org_type_codes = tfr500['Type of Organization'].map(org_type_mapping)
link_type_codes = tfr500['Type of Investment'].map(link_type_mapping)

tfr500['org_type'] = org_type_codes.fillna("Others")
tfr500['link_type'] = link_type_codes.fillna("Others")

In [None]:
# Relabel the two company-name columns before building the export table.
column_renames = {
    'American Owner - Name': 'US Company',
    'Master German firm name': 'German subsidiary',
}
tfr500 = tfr500.rename(columns=column_renames)

# Keep only the columns listed below, in this order.
tfr500_cleaned = tfr500[[
    'German subsidiary',
    'US Company',
    'Owned Through (Allied Foreign Organization)',
    'link_type',
    'org_type',
    'Type of Investment',
    'Type of Organization',
    'Percent Owned',
]]

In [None]:
# Write the cleaned table to CSV without the DataFrame index column.
tfr500_cleaned.to_csv(
    path_or_buf='../output/tfr_cleaned.csv',
    index=False,
)