In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import openpyexcel
from openpyexcel import workbook, load_workbook
from openpyxl.worksheet.table import Table, TableStyleInfo
from openpyexcel.utils import get_column_letter
from itertools import islice
from os import listdir
import re
import csv
import datetime
import pickle

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
pd.options.display.float_format = '{:.0f}'.format

# Map Line Items

In [3]:
df = pd.read_csv('CSV/balance_line_items.csv')

In [4]:
df.set_index('line_item', inplace=True)
df

Unnamed: 0_level_0,type
line_item,Unnamed: 1_level_1
Investment securities,X
Other Assets,
Deferred revenue from Sanofi,
"Plant, property and equipment, net",
"Other long-term assets, net",
...,...
"Common stock, par value $.01 per share, authorized 600,000,000 shares, issued and outstanding 207,470,000 and 213,420,000 shares, respectively",
"Common stock, $1 par value - authorized 1.2 billion shares; issued 355.2 million and 354.3 million shares",
Paid-in capital,
Total Biogen Inc. shareholders’ equity,


In [11]:
# Keep These lines - Used to set type in line items

def set_line_item_type():
    
df[df.index.str.contains(r"Total.*assets",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"current|other|non|deferred|discontinued",case=False, regex=True, na=False)] = 'TA'

df[df.index.str.contains(r"Total.*current.*assets",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"other|non|deferred|discontinued",case=False, regex=True, na=False)] = 'TCA'

df[df.index.str.contains(r"Total.*liabilities$",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"capitalization|long|current|other|non|deferred|discontinued",case=False, regex=True, na=False)] = 'TL'

df[df.index.str.contains(r"Total.*current.*liabilities$",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"capitalization|long|other|non|deferred|discontinued",case=False, regex=True, na=False)] = 'TCL'

df[df.index.str.contains(r"Total non.*current.*liabilities$",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"capitalization|long|other|deferred|discontinued",case=False, regex=True, na=False)] = 'TNCL'

df[df.index.str.contains(r"Total.*liabilities.*shareholder.*$",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"capitalization|long|other|deferred|discontinued",case=False, regex=True, na=False)] = 'TLSE'

df[df.index.str.contains(r"Total shareholder.*$",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"capitalization|long|other|deferred|discontinued",case=False, regex=True, na=False)] = 'TSE'

df[df.index.str.contains(r".*Intangible",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"capitalization|long|other|deferred|discontinued",case=False, regex=True, na=False)] = 'IA'

df[df.index.str.contains(r"Cash",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"hedges|restricted|obligation|long|other|deferred|discontinued",case=False, regex=True, na=False)] = 'TC'

df[df.index.str.contains(r"property",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"hedges|lease|restricted|gross|obligation|leases|other|deferred|discontinued",case=False, regex=True, na=False)] = 'NPPE'

df[df.index.str.contains(r"Total non",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"hedges|leases|other|deferred|liabilities|discontinued",case=False, regex=True, na=False)] = 'TNCA'

df[df.index.str.contains(r"Debt",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"hedges|leases|other|deferred|long|non|short|due|VIE|year|securities",case=False, regex=True, na=False)] = 'D'

df[df.index.str.contains(r"short.*Debt",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"hedges|leases|other|deferred|long|non|VIE|year|securities",case=False, regex=True, na=False)] = 'STD'

df[df.index.str.contains(r"long.*Debt",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"interest|leases|other|deferred|short|non|VIE|year|securities",case=False, regex=True, na=False)] = 'LTD'

df[df.index.str.contains(r"property",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"interest|leases|other|deferred|short|non|VIE|year|securities",case=False, regex=True, na=False)] = 'NPPE'


In [None]:
#Balance Sheet Mappings:
    
#TA   Total Assets
#TCA  Total Current Assets
#TL   Total Liabilities
#TCL  Total Current Liabilities
#TNCL Total Non Current Liabilities
#TLSE Total Liabilities and Shareholder Equity
#TSE  Total Shareholder Equity
#IA   Intangible assets
#TC   Total Cash
#NPPE Net property, plant and equipment
#TNCA Total non current assets
#D    Debt
#STD  Short term debt
#LTD  Long term debt

In [65]:
df[df.index.str.contains(r"property",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"interest|leases|other|deferred|short|non|VIE|year|securities",case=False, regex=True, na=False)] = 'NPPE'

                       type
line_item                  
('Net revenues', nan)   NaN
('Net Revenue', nan)    NaN
('Net revenues:', nan)  NaN


In [66]:
# Save to file

def save_line_items(df):
    
  df.to_csv('CSV/balance_map_line_items.csv')

In [63]:
#Check rows which are not set to type 

df[df['type'].isna()]

Unnamed: 0_level_0,type
line_item,Unnamed: 1_level_1
Other Assets,
Deferred revenue from Sanofi,
"Plant, property and equipment, net",
"Other long-term assets, net",
"Accrued salaries, benefits and payroll taxes",
...,...
"Common stock, par value $.01 per share, authorized 600,000,000 shares, issued and outstanding 207,470,000 and 213,420,000 shares, respectively",
"Common stock, $1 par value - authorized 1.2 billion shares; issued 355.2 million and 354.3 million shares",
Paid-in capital,
Total Biogen Inc. shareholders’ equity,


In [67]:
# Select data which have a type set

df = df[~df['type'].isna()]

In [68]:
save_line_items(df)

# Map Headings

In [11]:
df = pd.read_csv('CSV/balance_headings.csv')


In [16]:
df.set_index('heading', inplace=True)
df

Unnamed: 0_level_0,type
heading,Unnamed: 1_level_1
Current liabilities [Abstract],X
"Best Buy Co., Inc. Shareholders Equity",
NONCURRENT LIABILITIES,
Other Assets,
Retained Earnings [Member],
"Best Buy Co., Inc. Shareholders’ Equity",
CURRENT LIABILITIES,
"Class A common stock, par value $.01 per share; 104.9 million and 102.9 million shares issued; 47.6 million and 52.2 million shares outstanding",
"Preferred stock, 1,000,000 shares authorized, no shares issued",
Automotive [Member],


In [7]:
#Balance Sheet Mappings for headings:

#CA   Current Assets
#CL   Liabilities
#PE   Property and Equipment
#SB   Short term borrowings
#LDY  Long term debt due within one year
#LD   Long term debt
#SI   Shareholders Investments
#SE   Shareholder Equity

In [12]:
# Keep these lines - Used to set type in headings

def set_headings():
    
df[df.index.str.contains(r"Asset",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"long|other|non|deferred|discontinued",case=False, regex=True, na=False)] = 'CA'

df[df.index.str.contains(r"liabilities",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"long|other|non|deferred|shareholder|equity|discontinued",case=False, regex=True, na=False)] = 'CL'

df[df.index.str.contains(r"property",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"long|other|non|deferred|lease|shareholder|equity|discontinued",case=False, regex=True, na=False)] = 'PE'

df[df.index.str.contains(r"short.*borrowings",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"long|other|non|deferred|lease|shareholder|equity|discontinued",case=False, regex=True, na=False)] = 'SB'

df[df.index.str.contains(r"long.*debt.*within",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"other|non|deferred|lease|shareholder|equity|discontinued",case=False, regex=True, na=False)] = 'LDY'

df[df.index.str.contains(r"investments",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"GAAP|within|other|non|deferred|lease|discontinued",case=False, regex=True, na=False)] = 'SI'

df[df.index.str.contains(r"equity",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"GAAP|within|other|non|deferred|lease|discontinued",case=False, regex=True, na=False)] = 'SE'


SyntaxError: invalid syntax (<ipython-input-12-62590dbd654e>, line 6)

In [51]:
df[df.index.str.contains(r"equity",case=False, regex=True, na=False) & \
   ~ df.index.str.contains(r"GAAP|within|other|non|deferred|lease|discontinued",case=False, regex=True, na=False)]


In [52]:
# Save to file

def save_headings(df):
    
  df.to_csv('CSV/balance_map_headings.csv')

In [54]:
#Check rows which are not set to type 

df[df['type'].isna()]

Unnamed: 0_level_0,type
heading,Unnamed: 1_level_1
NONCURRENT LIABILITIES,
Other Assets,
Retained Earnings [Member],
"Class A common stock, par value $.01 per share; 104.9 million and 102.9 million shares issued; 47.6 million and 52.2 million shares outstanding",
"Preferred stock, 1,000,000 shares authorized, no shares issued",
Automotive [Member],
NONCURRENT ASSETS,
Property under capital lease and financing obligations:,
"Liabilities, Noncurrent [Abstract]",
Long-Term Liabilities,


In [55]:
# Select data which have a type set

df = df[~df['type'].isna()]
df

Unnamed: 0_level_0,type
heading,Unnamed: 1_level_1
Current liabilities [Abstract],CL
"Best Buy Co., Inc. Shareholders Equity",SE
"Best Buy Co., Inc. Shareholders’ Equity",SE
CURRENT LIABILITIES,CL
Investments,SI
LIABILITIES AND EQUITY,SE
Oracle Corporation stockholders' equity:,SE
LONG-TERM DEBT,LD
Shareholders equity,SE
CURRENT ASSETS:,CA


In [56]:
save_headings(df)

In [95]:
#Balance SHeet Mappings for line items:
    
#TA   Total Assets
#TCA  Total Current Assets
#TL   Total Liabilities
#TCL  Total Current Liabilities
#TNCL Total Non Current Liabilities
#TLSE Total Liabilities and Shareholder Equity
#TSE  Total Shareholder Equity
#IA   Intangible assets
#TC   Total Cash
#NPPE Net property, plant and equipment
#TNCA Total non current assets
#D    Debt
#STD  Short term debt
#LTD  Long term debt

#Balance Sheet Mappings for headings:

#CA   Current Assets
#CL   Liabilities
#PE   Property and Equipment
#SB   Short term borrowings
#LDY  Long term debt due within one year
#LD   Long term debt
#SI   Shareholders Investments
#SE   Shareholder Equity

# Create mapping for Headings and Line Items

In [57]:
cash_table_map = {'CA':  ['TA','TCA','TNCA'],
                  'CL':  ['TL','TCL','TNCL','TLSE'],
                  'PE':  ['NPPE'],
                  'LDY': ['STD'],
                  'LD':  ['LTD'],
                  'SI':  ['TSE','TLSE'],
                  'SE':  ['TSE','TLSE'],
                  'No Heading': ['D']}
        

# Create lookup for Headings

In [104]:
df

Unnamed: 0_level_0,type
heading,Unnamed: 1_level_1
Adjustments to reconcile consolidated net earnings to net cash provided by operating activities:,ARNI
Investing Activities,CIA
Cash flows from operating activities:,CFOA
Cash Flows from Financing Activities,CFA
Operating activities:,CFOA
Adjustments to arrive at cash provided by operating activities:,ARNI
CASH PROVIDED BY / (USED IN) OPERATING ACTIVITIES,CFOA
Significant Non-cash Investing and Financing Activity,CFA
Changes in Operating Assets and Liabilities [Abstract],COAL
Adjustments to reconcile net income (loss) to net cash flows from operating activities:,ARNI


In [58]:
x = df.to_dict()
balance_heading_lookup = x['type']
balance_heading_lookup

{'Current liabilities [Abstract]': 'CL',
 'Best Buy Co., Inc. Shareholders Equity': 'SE',
 'Best Buy Co., Inc. Shareholders’ Equity': 'SE',
 'CURRENT LIABILITIES': 'CL',
 'Investments': 'SI',
 'LIABILITIES AND EQUITY': 'SE',
 "Oracle Corporation stockholders' equity:": 'SE',
 'LONG-TERM DEBT': 'LD',
 'Shareholders equity': 'SE',
 'CURRENT ASSETS:': 'CA',
 'SHAREHOLDERS’ EQUITY': 'SE',
 'Equity': 'SE',
 "STOCKHOLDER'S EQUITY": 'SE',
 "Shareholders' equity:": 'SE',
 "Stockholders' Equity": 'SE',
 "Stockholders' equity": 'SE',
 'Current Liabilities:': 'CL',
 "SHAREHOLDERS' EQUITY": 'SE',
 'Current Liabilities': 'CL',
 'Biogen Inc. shareholders’ equity': 'SE',
 'Property and Equipment': 'PE',
 'Cisco shareholders’ equity:': 'SE',
 'ASSETS': 'CA',
 'PepsiCo Common Shareholders’ Equity': 'SE',
 'Long-term debt due after one year:': 'LD',
 'LIABILITIES AND SHAREHOLDERS EQUITY': 'SE',
 "SHAREHOLDERS' EQUITY:": 'SE',
 'Temporary Equity [Abstract]': 'SE',
 'Property, Plant and Equipment': 'PE',


# Create Lookup for Line Items


In [60]:
df = pd.read_csv('CSV/balance_map_line_items.csv')
df.set_index('line_item', inplace=True)
df

Unnamed: 0_level_0,type
line_item,Unnamed: 1_level_1
Investment securities,X
"Plant, property and equipment, net",NPPE
Long-term debt due currently,LTD
"Property, plant and equipment, net of accumulated depreciation of $2,694.5 and $2,172.0, respectively",NPPE
Debt and capital lease obligations,D
Total shareholders' investment,TSE
"Property, Plant and Equipment, Net",NPPE
"Property and equipment, gross",NPPE
Cash and temporary cash investments,TC
Total non-current liabilities,TNCL


In [61]:
x = df.to_dict()
balance_line_item_lookup = x['type']
balance_line_item_lookup

{'Investment securities': 'X',
 'Plant, property and equipment, net': 'NPPE',
 'Long-term debt due currently': 'LTD',
 'Property, plant and equipment, net of accumulated depreciation of $2,694.5 and $2,172.0, respectively': 'NPPE',
 'Debt and capital lease obligations': 'D',
 "Total shareholders' investment": 'TSE',
 'Property, Plant and Equipment, Net': 'NPPE',
 'Property and equipment, gross': 'NPPE',
 'Cash and temporary cash investments': 'TC',
 'Total non-current liabilities': 'TNCL',
 'Property, net of accumulated depreciation': 'NPPE',
 'Intangible assets, net': 'IA',
 'Long-term Debt and Lease Obligation': 'LTD',
 'Long-term debt, net': 'LTD',
 'Property and equipment, net': 'NPPE',
 'Total Liabilities, Temporary Equity, and Shareholders’ Equity': 'TLSE',
 'Total shareholders’ equity': 'TSE',
 'Property, plant and equipment — net': 'NPPE',
 'Gross property and equipment': 'NPPE',
 'TOTAL LIABILITIES AND SHAREHOLDERS’ EQUITY': 'TLSE',
 'TOTAL LIABILITIES AND SHAREHOLDERS EQUITY'

# Combine Lookups for heading and line items into one dictionary and save

In [62]:
balance_lookup = {'heading': balance_heading_lookup,
                  'line_item' : balance_line_item_lookup}

In [63]:
# Save to file

with open('pickle/balance_map_lookup.pkl', 'wb') as fp:
  pickle.dump(balance_lookup, fp)
