## This code creates a dictionary called final_parent_dict.txt that maps owner/entity/company name to a "parent" owner
## The dictionary can be used to analyze other micro datasets 

In [27]:
import pandas as pd
import matplotlib as plt
import re
import json
%matplotlib inline
dataDir = '/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/'
 
with open(dataDir + 'JSONFiles/parent_utility_dict.txt') as outfile:
    parent_dict = json.load(outfile)

In [4]:
parent_dict = dict((k.lower(), v.lower()) for k,v in parent_dict.items())

In [5]:
# read in other parent list and merge them
parent_map = {}
subs_data = pd.read_csv(dataDir + 'company_data/good_jobs_subs_data.csv')
subs_data.columns = subs_data.columns.str.lower().str.replace(" ", "_")
subs_data.company = subs_data.company.str.lower()
subs_data.parent_company = subs_data.parent_company.str.lower()

parent_map = dict(zip(subs_data.company, subs_data.parent_company))


In [6]:
clearway_subs = pd.read_csv(dataDir + 'company_data/clearway_subs.csv', names=['entity_name','','state']).entity_name.str.lower()
for x in clearway_subs:
    parent_map[x] = 'clearway energy'

In [7]:
parent_map.update(parent_dict)

## make master list of utility names from utility dataset, generator dataset, and owners dataset

### Read in relevant datasets

In [8]:
owners = pd.read_csv(dataDir + 'master_company_list_2018/owners2018.csv')
gen_data = pd.read_csv(dataDir + 'master_company_list_2018/gen_2018.csv')
utils = pd.read_csv(dataDir + 'master_company_list_2018/utils_2018.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
dfList = [owners, gen_data, utils]


In [10]:
owners.columns

Index(['Unnamed: 0', 'utility_id', 'utility_name', 'plant_code', 'plant_name',
       'state', 'generator_id', 'status', 'owner_name', 'owner_street_address',
       'owner_city', 'owner_state', 'owner_zip', 'ownership_id',
       'percent_owned', 'perc_owned', 'num_owners', 'idx', 'owner_1',
       'owner_2', 'owner_3', 'owner_4', 'perc_owner_1', 'perc_owner_2',
       'perc_owner_3', 'perc_owner_4'],
      dtype='object')

In [11]:
owners.rename(columns = {'owner_name':'company_name', 
                         'owner_street_address':'company_address',
                         'owner_state': 'company_state',
                          'owner_city': 'company_city',
                          'owner_zip': 'company_zip',
                           'ownership_id': 'company_id'},inplace=True)
gen_data.rename(columns = {'utility_name':'company_name'}, inplace=True)
utils.rename(columns = {'utility_name':'company_name',
                       'utility_id': 'company_id',
                        'street_address':'company_address',
                         'state':'company_state',
                         'city': 'company_city', 
                          'zip':'company_zip'},inplace=True)

In [15]:
companies.head(10)

Unnamed: 0,company_name,company_address,company_state,company_city,company_zip,company_id
0,Alabama Power Co,P O Box 2641600 North 18th Street,AL,Birmingham,35291,195
1,Mississippi Power Co,2992 West Beach Boulevard,MS,Gulfport,39501,12686
5,Georgia Power Co,P O Box 4545,GA,Atlanta,30302,7140
14,Cleco Power LLC,2030 Donahue Ferry RoadP.O. Box 5000,LA,Pineville,71361,3265
15,"Northeast Texas Elec Coop, Inc",1127 Judson Road,TX,Longview,75601,13670
16,Oklahoma Municipal Power Authority,P O Box 1960,OK,Edmond,73083,14077
17,Southwestern Electric Power Co,212 E. 6th Street,OK,Tulsa,74119,17698
18,City of Hastings - (NE),1228 North Denver Avenue,NE,Hastings,68902,8245
19,City of Nebraska City,100 Central Ave,NE,Nebraska City,68410,13334
20,Municipal Energy Agency of NE,PO BOX 951241111 O Street Suite 200,NE,Lincoln,68509,21352


In [16]:
# make df with master list of companies (utilities + owners)
companyVals = ['name', 'address','state','city','zip','id']
companyVals = ['company_' + x for x in companyVals]
companies = owners[companyVals].append(utils[companyVals])
# select unique combos
companies.drop_duplicates(inplace=True)

In [17]:
names = gen_data['company_name'].drop_duplicates()
# I found that no names in gen data are not contained in the other datasets, so I don't merge it in

In [18]:
def shorten_rd(address):
    '''Completes the road type. I.e. Rd becomes Road, st becomes Street as per Google etc.'''
    address = address.title()
    address = re.sub(r" Street(?=$| [NE(So|S$)(We|W$)])", ' St', address)
    address = re.sub(r" Road(?=$| [NE(So|S$)(We|W$)])", ' Rd', address)
    address = re.sub(r"(?<!The) Avenue(?=$| [NE(So|S$)(We|W$)])", ' Ave', address)
    address = re.sub(r" Close(?=$| [NE(So|S$)(We|W$)])", ' Cl', address)
    address = re.sub(r" Court(?=$| [NE(So|S$)(We|W$)])", ' Ct', address)
    address = re.sub(r"(?<!The) Crescent(?=$| [NE(So|S$)(We|W$)])", ' Cres', address)
    address = re.sub(r" Boulevarde?(?=$| [NE(So|S$)(We|W$)])", ' Blvd', address)
    address = re.sub(r" Drive(?=$| [NE(So|S$)(We|W$)])", ' Dr', address)
    address = re.sub(r" Lane(?=$| [NE(So|S$)(We|W$)])", ' Ln', address)
    address = re.sub(r" Place(?=$| [NE(So|S$)(We|W$)])", ' Pl', address)
    address = re.sub(r" Square(?=$| [NE(So|S$)(We|W$)])", ' Sq', address)
    address = re.sub(r"(?<!The) Parade(?=$| [NE(So|S$)(We|W$)])", ' Pde', address)
    address = re.sub(r" Circuit(?=$| [NE(So|S$)(We|W$)])", ' Cct', address)
    return address

def lengthen_rd(address):
    address = address.title()
    address = re.sub(r" St(?=$| [NE(So|S$)(We|W$)])", " Street", address)
    address = re.sub(r" Rd(?=$| [NE(So|S$)(We|W$)])", " Road", address)
    address = re.sub(r" Ave(?=$| [NE(So|S$)(We|W$)])", " Avenue", address)
    address = re.sub(r" Cl(?=$| [NE(So|S$)(We|W$)])", " Close", address)
    address = re.sub(r" Ct(?=$| [NE(So|S$)(We|W$)])", " Court", address)
    address = re.sub(r" Cres(?=$| [NE(So|S$)(We|W$)])", " Crescent", address)
    address = re.sub(r" Blvd(?=$| [NE(So|S$)(We|W$)])", " Boulevard", address)
    address = re.sub(r" Dr(?=$| [NE(So|S$)(We|W$)])", " Drive", address)
    address = re.sub(r" Ln(?=$| [NE(So|S$)(We|W$)])", " Lane", address)
    address = re.sub(r" Pl(?=$| [NE(So|S$)(We|W$)])", " Place", address)
    address = re.sub(r" Sq(?=$| [NE(So|S$)(We|W$)])", " Square", address)
    address = re.sub(r" Pde(?=$| [NE(So|S$)(We|W$)])", " Parade", address)
    address = re.sub(r" Cct(?=$| [NE(So|S$)(We|W$)])", " Circuit", address)
    return address

def standard_addr(address):
    '''Checks for unit numbers and street addresses and puts them in the standard format''' 
    #print("################################")
    #print("### Address: ", address)
    unit_nums = re.findall(r"(?<=Unit )\w?\d+\w?|(?<=U)\d+\w?|\w?\d+\w?(?=\s*/)", address)
    unit_num = unit_nums[0] if len(unit_nums)==1 else ""
    #print("Unit Number: ", unit_num)
    proc_addr = re.sub(r"Unit \w?\d+\w?/?|U\d+\w?/?|\w?\d+\w?\s*/", "", address)
    proc_addr = re.sub(r"^[,\- ]+|[,\- ]+$", "", proc_addr)
    #print("Unitless address: ", proc_addr)
    type_opts = r"Terrace|Way|Walk|St|Rd|Ave|Cl|Ct|Cres|Blvd|Dr|Ln|Pl|Sq|Pde|Cct"
    road_attrs_pattern = r"(?P<rd_no>\w?\d+(\-\d+)?\w?\s+)(?P<rd_nm>[a-zA-z \d\-]+)\s+(?P<rd_tp>" + type_opts + ")"
    #print("Road Attr Pattern: ", road_attrs_pattern)
    road_attrs = re.search(road_attrs_pattern, proc_addr)
    try:
        road_num = road_attrs.group('rd_no').strip()
    except AttributeError:
        road_num = ""
    #print("Road number: ", road_num)
    try:
        road_name = road_attrs.group('rd_nm').strip()
    except AttributeError:
        road_name = ""
    #print("Road name: ", road_name)
    try:
        road_type = road_attrs.group('rd_tp').strip()
    except AttributeError:
        road_type = ""
    #print("Road type: ", road_type)
    proc_addr = lengthen_rd(re.sub(r"^[,\- ]+|[,\- ]+$", "", re.sub(road_attrs_pattern, "", proc_addr)))
    #print("Leftover: ", proc_addr)

    unit_seg = (unit_num + "/" if unit_num!="" else "") if road_num != "" else ("Unit " + unit_num + ", " if unit_num!="" else "")
    road_seg = ((road_num + " " if road_num!="" else "") + road_name + " " + road_type).strip()
    post_road_seg = " " + proc_addr if proc_addr != "" else ""
    proc_addr = (unit_seg + road_seg) + post_road_seg
    #print("### Processed Address: ", proc_addr)
    return proc_addr

In [19]:
# clean addresses
companies['raw_address'] = companies['company_address']
companies['company_address'] =companies['raw_address'].apply(lambda x: standard_addr(str(x)))

# first assign company name a common var
companies['name_group'] = companies.groupby('company_name').grouper.group_info[0]

# now assign clean address common var
companies['address_group']=companies.groupby('company_address').grouper.group_info[0]

In [20]:
companies['company_name'] = companies.company_name.str.lower()

In [21]:
def apply_parent_dict(df, name, par_dict):
    df['parent_group'] = df[name].str.lower().map(par_dict)
    df['parent_group']=df['parent_group'].fillna(df[name])
    df['parent_group'] = df['parent_group'].apply(lambda x: "duke energy" if "duke energy" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "nextera energy" if "nextera" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "nrg energy" if "nrg" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "solar star" if "solar star" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "nrg" if "agua caliente solar" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "nextera energy" if "blythe solar" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "8me" if "8me" in x.lower() else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "exelon" if "av solar" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "exelon" if "exelon" in x else x)
    return(df)

In [51]:
# now do parent dict... can drop those and deal with remainder

companies = apply_parent_dict(companies, 'company_name', parent_map)

In [53]:
companies = companies.sort_values('name_group')
name_dict = dict(zip(companies.company_name, companies.name_group))
address_dict = dict(zip(companies.company_name, companies.address_group))
par_dict =  dict(zip(companies.company_name, companies.parent_group))



In [43]:
def group_two_col(df, col1, col2):
    address_map = {} # is col1
    name_map = {} # is col2 
    group_id = [] # maps to each row in rows
    maximum_group = 0

    
    for  index, row in df.iterrows():

        if address_map.get(row[col1]) is not None:
            group_id.append(address_map[row[col1]]) 
            if name_map.get(row[col2]) is None:
                name_map[row[col2]] = address_map[row[col1]]
            continue

        if name_map.get(row[col2]) is not None:
            group_id.append(name_map[row[col2]])
        
            if address_map.get(row[col1]) is None:
                address_map[row[col1]] = name_map[row[col2]]
            continue
  
  # If we're here no group found, insert new group
        address_map[row[col1]] = maximum_group + 1 # new group id
        name_map[row[col2]] = maximum_group + 1 # new group id

        group_id.append(maximum_group + 1)

        maximum_group += 1
    
    return(group_id)
    

In [55]:
companies['id'] = group_two_col(companies,'company_address','company_name')
companies['id'] = group_two_col(companies,'id','parent_group')


In [57]:
#companies.drop_duplicates(subset='id',keep='first')

Unnamed: 0,company_name,company_address,company_state,company_city,company_zip,company_id,raw_address,name_group,address_group,parent_group,id
4964,"10 briggs solar ng, llc","267 Water St Reet, 2Nd Floor",RI,Warren,02885,62685,"267 Water Street, 2nd Floor",0,2984,"10 briggs solar ng, llc",1
4899,"1025 traveller solar, llc",1447 S Tryon St Ste 201,NC,Charlotte,28203,62139,"1447 S Tryon St, Ste 201",1,2387,"1025 traveller solar, llc",2
3816,126 grove solar llc,315 Post Road West,CT,Westport,06880,60517,315 Post Road West,3,647,126 grove solar llc,3
3673,158th fighter wing,105 NCO Dr Ive,VT,South Burlington,05403,60301,105 NCO Drive,4,2040,158th fighter wing,4
4453,"1634 solar, llc",1519 King St Reet,SC,Charleston,29405,61541,1519 King Street,5,2444,"1634 solar, llc",5
...,...,...,...,...,...,...,...,...,...,...,...
1519,ziegler power systems,8050 State Highway 101East,MN,Shakopee,55379,21144,8050 State Highway 101East,6090,1116,ziegler power systems,3989
1522,zion energy llc,5701 Ninth St,IL,Zion,60099,21191,5701 Ninth St,6091,3614,zion energy llc,3990
2371,zotos international,300 Forge Ave Nue,NY,Geneva,14456,56977,300 Forge Avenue,6092,3049,zotos international,3991
3914,ebay - south jordan,6614 West Crimson View Dr,UT,South Jordan,84095,59095,6614 West Crimson View Dr,6094,3759,ebay - south jordan,3992


In [27]:
## make company_name --> id  (with parent name) dictionary

In [67]:
nextera_subs = companies[companies['id']==95].company_name

In [68]:
for x in nextera_subs:
    final_par_dict[x] = 'nextera energy'

### Now save company_name --> parent_company dictionary so I can use with the generator data

In [29]:
final_par_dict = dict(zip(companies.company_name, companies.parent_group))

In [30]:
with open(dataDir + 'JSONFiles/final_parent_dict.json','w') as outfile:
    json.dump(final_par_dict,outfile)

In [None]:
['adelanto solar ii, llc', 'adelanto solar, llc', 'bluebell solar, llc']

In [44]:
final_par_dict['adelanto solar ii, llc'] = 'nextera energy'
final_par_dict['adelanto solar, llc'] = 'nextera energy'

In [62]:
companies[companies['id']==95]

Unnamed: 0,company_name,company_address,company_state,company_city,company_zip,company_id,raw_address,name_group,address_group,parent_group,id
3197,"adelanto solar ii, llc",700 Universe Blvd,FL,Juno Beach,33408,59212,700 Universe Blvd,127,3805,"adelanto solar ii, llc",95
3196,"adelanto solar, llc",700 Universe Blvd,FL,Juno Beach,33408,59211,700 Universe Blvd,128,3805,"adelanto solar, llc",95
1786,altagas blythe operations inc.,700 Universe Blvd,CA,Juno Beach,33408,55713,700 Universe Blvd,211,3805,altagas blythe operations inc.,95
2251,ashtabula wind iii llc,700 Universe Blvd,FL,Juno Beach,33408,56727,700 Universe Blvd,341,3805,ashtabula wind iii llc,95
2228,baldwin wind llc,700 Universe Blvd,ND,Juno Beach,33408,56688,700 Universe Blvd,441,3805,baldwin wind llc,95
...,...,...,...,...,...,...,...,...,...,...,...
1476,west texas wind energy partners llc,700 Universe Blvd,FL,Juno Beach,33408,20424,700 Universe Blvd,5917,3805,west texas wind energy partners llc,95
4950,"wheatridge wind energy, llc",700 Universe Blvd,FL,Juno Beach,33408,62668,700 Universe Blvd,5960,3805,"wheatridge wind energy, llc",95
3472,"white oak solar, llc",700 Universe Blvd,FL,Juno Beach,33408,59831,700 Universe Blvd,5971,3805,"white oak solar, llc",95
3459,"white pine solar, llc",700 Universe Blvd,FL,Juno Beach,33408,59803,700 Universe Blvd,5973,3805,"white pine solar, llc",95


In [70]:
final_par_dict['adelanto solar, llc']

'nextera energy'

## hand code nextera

In [33]:
final_par_dict[]

{'10 briggs solar ng, llc': '10 briggs solar ng, llc',
 '1025 traveller solar, llc': '1025 traveller solar, llc',
 '1047 little mountain solar, llc': '1047 little mountain solar, llc',
 '126 grove solar llc': '126 grove solar llc',
 '158th fighter wing': '158th fighter wing',
 '1634 solar, llc': '1634 solar, llc',
 '174 power global corp.': '174 power global corp.',
 '180 raritan energy solutions, llc': '180 raritan energy solutions, llc',
 '1951 hamburg turnpike, llc': '1951 hamburg turnpike, llc',
 '2016 esa project company, llc': '2016 esa project company, llc',
 '225dd 8me llc': '8me',
 '226hc 8me llc': '8me',
 '231 dixon 74 solar i, llc': '231 dixon 74 solar i, llc',
 '231rc 8me llc': '8me',
 '301 chestnut solar ng, llc': '301 chestnut solar ng, llc',
 '31st street energy llc': '31st street energy llc',
 '325mk 8me llc': '8me',
 '33ui 8me llc': '8me',
 '350 clark solar ng, llc': '350 clark solar ng, llc',
 '380 middlesex solar llc': '380 middlesex solar llc',
 '3880 north mission 