# Merge all data sets

Earnings from yahoo.com, metrics from stockrow.com, and sector/industries from marketwatch.com

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tickerlist import make_list
import re
import pickle
import json

## Clean sector and industry data set

In [2]:
# Lookup table: sector key -> list of industry names assigned to that sector.
# The cell that builds `sect_ind_df` tests each ticker's industry for membership
# in these lists and stores the matching key(s) as `true_sector`.
# NOTE(review): the key 'conumser_staples' looks like a misspelling of
# 'consumer_staples'. It is written verbatim into `true_sector`, so renaming it
# would change the labels in the saved data — confirm with downstream users first.
sectors = {'consumer_discr': ['Computers/Consumer Electronics', 'Mixed Retailing', 'Home Goods Retail', 'Broadcasting', 'Restaurants', 'Tourism', 'Footwear', 'Automobiles', 'Toys & Games', 'Clothing Retail', 'Hotels', 'Auto & Commercial Vehicle Parts', 'Commercial Vehicles', 'Housewares', 'Specialty Retail', 'Clothing', 'Advertising/Marketing/Public Relations', 'Publishing', 'Gambling Industries', 'Residential Building Construction', 'Tires', 'Furniture'],
           'conumser_staples': ['Personal Care Products/Appliances', 'Non-Alcoholic Beverages/Drinks', 'Tobacco', 'Drug Retail', 'Food Retail', 'Food Products', 'Nondurable Household Products', 'Alcoholic Beverages/Drinks'],
           'energy': ['Major Oil & Gas', 'Oil & Gas Products/Services', 'Oil Extraction', 'Pipeline Transportation'],
           'financials': ['Reinsurance', 'Major International Banks', 'Banking', 'Diversified Holding Companies', 'Consumer Finance', 'Life Insurance', 'Securities', 'Finance Companies', 'Non-Life Insurance', 'Investment Advisors', 'Full-Line Insurance', 'Accounting', 'Insurance Brokering', 'Savings Institutions'],
           'health_care': ['Pharmaceuticals', 'Biotechnology', 'Medical Equipment/Supplies', 'Precision Products', 'Healthcare Provision', ],
           'industrials': ['Aerospace Products/Parts', 'Railroads', 'Air Freight', 'Defense Equipment/Products', 'Mobile Machinery', 'Farming', 'Industrial Machinery', 'Passenger Airlines', 'Environment/Waste Management', 'Industrial Products', 'Building Materials/Products', 'Wholesalers', 'General Services', 'Transportation Services', 'Trucking', 'Technical Services', 'Employment/Training Services', 'Construction'],
           'info_tech': ['Software', 'Internet/Online', 'Semiconductors', 'Networking', 'Computer Services', 'Diversified Business Services', 'Industrial Electronics'],
           'materials': ['Chemicals', 'Paper/Pulp', 'Gold', 'Non-Ferrous Metals', 'Iron/Steel', 'Containers/Packaging'],
           'real_estate': ['Specialty REITs', 'Retail REITs', 'Industrial/Office REITs', 'Healthcare REITs', 'Residential REITs', 'Hotel/Lodging REITs', 'Diversified REITs', 'Real Estate Developers'],
           'telecomm': ['Wireless Telecommunications Services', 'Wired Telecommunications Services'],
           'utilities': ['Electric Utilities', 'Multiutilities', 'Gas Utilities', 'Water Utilities']}

### Replace MarketWatch sectors with official GICS sectors

In [3]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only point
# this at a file you produced yourself.
with open('/Users/samfunk/ds/metis/project_mcnulty/code/sector_industries.pkl', 'rb') as f:
    sect_ind_dict = pickle.load(f)

# The pickled dict's keys become the row index (treated as tickers below).
sector_industry_raw = pd.DataFrame.from_dict(sect_ind_dict, orient='index')
sect_ind_df = pd.DataFrame(columns=['mw_sector', 'industry', 'true_sector'])

for ticker in sector_industry_raw.index:
    raw_row = sector_industry_raw.loc[ticker]
    # Entry 1 of the raw row is what gets looked up in `sectors`
    # (presumably the industry name — it lands in the 'industry' column).
    industry = raw_row[1]
    # All sector keys whose industry list contains this industry; the row only
    # fits the three target columns when exactly one key matches.
    true_sector = [sector for sector, industries in sectors.items() if industry in industries]
    row_values = raw_row.tolist() + true_sector
    # Store the same values four times, under '<ticker>_1' .. '<ticker>_4'.
    for quarter in range(1, 5):
        sect_ind_df.loc['%s_%d' % (ticker, quarter)] = row_values

In [4]:
# Row-count check: the loop that fills sect_ind_df writes four rows per ticker.
len(sect_ind_df)

1984

In [5]:
# Number of rows assigned to each `true_sector` value.
sect_ind_df.true_sector.value_counts()

consumer_discr      392
financials          340
health_care         228
industrials         212
info_tech           208
conumser_staples    148
real_estate         128
utilities           116
energy              116
materials            80
telecomm             16
Name: true_sector, dtype: int64

## Merge time

In [6]:
# NOTE: both files are unpickled as-is; pickle.load should only be used on
# trusted, locally produced files.
with open('/Users/samfunk/ds/metis/project_mcnulty/code/earnings_df.pkl', 'rb') as f:
    earnings_df = pickle.load(f)

with open('/Users/samfunk/ds/metis/project_mcnulty/code/metrics.pkl', 'rb') as f:
    metrics_df = pickle.load(f)

# Index-on-index left joins: the metrics rows are kept, and earnings and
# sector/industry columns are attached where the index labels match.
master = (
    metrics_df
    .join(earnings_df, how='left')
    .join(sect_ind_df, how='left')
)


### Fill and remove various nans

In [7]:
# Move the join index into an ordinary column; a column that comes out named
# 'index' is renamed to 'ticker_quarter'.
master = master.reset_index().rename(columns={'index': 'ticker_quarter'})

# Column-specific defaults applied before the blanket NaN drop.
nan_defaults = {'currentR': 1, 'payout': 0}
for column, default in nan_defaults.items():
    master[column] = master[column].fillna(default)

# Any row still holding a NaN in any column is removed, in place.
master.dropna(axis=0, inplace=True)

## Segment various features

### Dividends (payout ratio)

In [8]:
def divs(row):
    """Encode a row's payout ratio as a three-way code.

    Reads ``row['payout']`` and returns -1 when it is negative, 0 when it is
    exactly zero, and 1 in every other case.
    """
    payout = row['payout']
    if payout < 0:
        return -1
    return 0 if payout == 0 else 1

# Row-wise: store the -1/0/1 code returned by divs in a new 'dividends' column.
master['dividends'] = master.apply(divs, axis=1)

### Interest coverage (industry standard = 1.5)

In [9]:
def interest_coverage(row):
    """Binary flag for a row's interest figure against the 1.5 cut-off.

    Returns 0 when ``row['interest']`` is less than or equal to 1.5,
    otherwise 1.
    """
    return 0 if row['interest'] <= 1.5 else 1

# Row-wise: store interest_coverage's 0/1 flag in a column of the same name.
master['interest_coverage'] = master.apply(interest_coverage, axis=1)

### Current ratio (industry threshold = 1; values below 1 are flagged 0)

In [10]:
# Hard-coded current ratios keyed by ticker symbol (presumably collected by hand
# for tickers whose source data lacked the figure — confirm the provenance).
missing_currents = {'BRKB': 1.3,'GE': 2.16,'AXP': 1.96,'MET': 0.98,'SCHW': 0.33,'SPG': 0.27,'PRU': 1.19,'CI': 0.90,'AIG': 2.82,'DE': 2.12,'TRV': 0.40,'PLD': 1.14,'ALL': 0.45,'AFL': 0.40,'PSA': 1.11,'PGR': 1.00,'DLR': 0.77,'HCN': 1.53,'EQR': 0.26,'AVB': 0.92,'DFS': 1.53,'AMP': 2.58,'TROW': 3.36,'VTR': 0.92,'HIG': 4.35,'BXP': 2.96,'PFG': 2.66,'LNC': 3.24,'ESS': 0.70,'O': 0.61,'DHI': 7.22,'VNO': 6.28,'L': 0.52,'GGP': 1.58,'ETFC': 1.40,'HCP': 4.01,'MAA': 0.17,'UNM': 8.98,'CINF': 1.22,'LEN': 6.62,'X': 0.50,'ARE': 1.45,'RJF': 3.31,'SLG': 4.16,'DRE': 10.30,'UDR': 0.18,'EXR': 0.99,'RE': 2.00,'REG': 1.37,'FRT': 1.40,'WU': 0.29,'TMK': 0.32,'KIM': 2.57,'PHM': 4.27,'AIV': 0.18,'MAC': 0.68,'AIZ': 0.93,'NAVI': 34.22,
}

# Overwrite `currentR` for every row whose ticker appears in the table above.
# The ticker is the leading run of capital letters in `ticker_quarter`, found
# with the same pattern as before but compiled once. Rows are handled in a
# single pass instead of rebuilding a whole-column mask per row.
leading_caps = re.compile(r'([A-Z]*)')
row_tickers = master['ticker_quarter'].map(lambda label: leading_caps.search(label)[1])
# Tickers absent from the table map to NaN and are left untouched.
override_values = row_tickers.map(missing_currents)
has_override = override_values.notna()
# `.values` makes the assignment positional, so it does not depend on index alignment.
master.loc[has_override, 'currentR'] = override_values[has_override].values
        
def current_ratio(row):
    """Binary flag for a row's current ratio.

    Returns 0 when ``row['currentR']`` is below 1, otherwise 1.
    """
    return 0 if row['currentR'] < 1 else 1

# Row-wise: store current_ratio's 0/1 flag in a column of the same name.
master['current_ratio'] = master.apply(current_ratio, axis=1)

### Debt to Equity ratio (industry standard = 2)

In [11]:
def debt_to_equity(row):
    """Bucket a row's debt-to-equity figure into three classes.

    Returns 0 when ``row['debt_equity']`` is negative, 1 when it is
    non-negative but below 2, and 2 otherwise.
    """
    ratio = row['debt_equity']
    if ratio < 0:
        return 0
    return 1 if ratio < 2 else 2

# Row-wise: store debt_to_equity's 0/1/2 bucket in a column of the same name.
master['debt_to_equity'] = master.apply(debt_to_equity, axis=1)

### Quality of income/earnings (industry standard ~ 1)

In [12]:
def income_quality(row):
    """Binary flag for a row's income-quality figure.

    Returns 0 when ``row['incomeQ']`` is below 1, otherwise 1.
    """
    if row['incomeQ'] < 1:
        return 0
    return 1

# Row-wise: store income_quality's 0/1 flag in a column of the same name.
master['income_quality'] = master.apply(income_quality, axis=1)

### Research and Development as percentage of revenue (greater than or equal to 0)

In [13]:
def r_and_d(row):
    """Binary flag for whether a row reports a positive R&D figure.

    Returns 0 when ``row['rd']`` is zero or negative, otherwise 1.
    """
    if row['rd'] <= 0:
        return 0
    return 1

# Row-wise: store r_and_d's 0/1 flag in a column of the same name.
master['r_and_d'] = master.apply(r_and_d, axis=1)

### Continuous variables (Bookvalue/share, FCF/share, Cash/share)

In [14]:
# The log is undefined for negative inputs, so each negative book value is first
# replaced by the small positive number 0.01 / |value|.
# NOTE(review): values of exactly 0 are not selected by the `< 0` mask, so
# np.log returns -inf for them — confirm that is acceptable downstream.
negative_bv = master['bookvalue'] < 0
master.loc[negative_bv, 'bookvalue'] = master.loc[negative_bv, 'bookvalue'].apply(lambda bv: 0.01 / abs(bv))

# Natural log of the adjusted book value, element by element.
master['log_bv'] = master['bookvalue'].apply(lambda bv: np.log(bv))

In [15]:
# The log is undefined for negative inputs, so each negative free-cash-flow value
# is first replaced by the small positive number 0.01 / |value|.
# NOTE(review): values of exactly 0 are not selected by the `< 0` mask, so
# np.log returns -inf for them — confirm that is acceptable downstream.
negative_fcf = master['fcf'] < 0
master.loc[negative_fcf, 'fcf'] = master.loc[negative_fcf, 'fcf'].apply(lambda cash_flow: 0.01 / abs(cash_flow))

# Natural log of the adjusted value, element by element.
master['log_fcf'] = master['fcf'].apply(lambda cash_flow: np.log(cash_flow))

In [16]:
'''master['fcf/cash'] = master.fcf / master.cash
master['fcf/bookvalue'] = master.fcf / master.bookvalue
master['cash/bookvalue'] = master.cash / master.bookvalue
master['cash/fcf'] = master.cash / master.bookvalue
master['bookvalue/fcf'] = master.bookvalue / master.fcf
master['bookvalue/cash'] = master.bookvalue / master.cash

inters = ['fcf/cash','fcf/bookvalue','cash/bookvalue','cash/fcf','bookvalue/fcf','bookvalue/cash']'''

"master['fcf/cash'] = master.fcf / master.cash\nmaster['fcf/bookvalue'] = master.fcf / master.bookvalue\nmaster['cash/bookvalue'] = master.cash / master.bookvalue\nmaster['cash/fcf'] = master.cash / master.bookvalue\nmaster['bookvalue/fcf'] = master.bookvalue / master.fcf\nmaster['bookvalue/cash'] = master.bookvalue / master.cash\n\ninters = ['fcf/cash','fcf/bookvalue','cash/bookvalue','cash/fcf','bookvalue/fcf','bookvalue/cash']"

### Save final/master df

In [17]:
# Serialize the final `master` frame to disk with pickle.
master_path = '/Users/samfunk/ds/metis/project_mcnulty/code/master_df.pkl'
with open(master_path, 'wb') as out_file:
    pickle.dump(master, out_file)