## This code creates a dictionary called final_parent_dict.txt that maps owner/entity/company name to a "parent" owner
## The dictionary can be used to analyze other micro datasets 

In [1]:
import pandas as pd
import matplotlib as plt
import re
import json
%matplotlib inline
dataDir = '/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/'
 
with open(dataDir + 'JSONFiles/parent_utility_dict.txt') as outfile:
    parent_dict = json.load(outfile)

In [2]:
parent_dict = dict((k.lower(), v.lower()) for k,v in parent_dict.items())

In [3]:
# read in other parent list and merge them
parent_map = {}
subs_data = pd.read_csv(dataDir + 'company_data/good_jobs_subs_data.csv')
subs_data.columns = subs_data.columns.str.lower().str.replace(" ", "_")
subs_data.company = subs_data.company.str.lower()
subs_data.parent_company = subs_data.parent_company.str.lower()

parent_map = dict(zip(subs_data.company, subs_data.parent_company))
parent_map.update(parent_dict)

## make master list of utility names from utility dataset, generator dataset, and owners dataset

### Read in relevant datasets

In [7]:
owners = pd.read_csv(dataDir + 'master_company_list_2018/owners2018.csv')
gen_data = pd.read_csv(dataDir + 'master_company_list_2018/gen_2018.csv')
utils = pd.read_csv(dataDir + 'master_company_list_2018/utils_2018.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
dfList = [owners, gen_data, utils]

In [9]:
owners.columns

Index(['Unnamed: 0', 'utility_id', 'utility_name', 'plant_code', 'plant_name',
       'state', 'generator_id', 'status', 'owner_name', 'owner_street_address',
       'owner_city', 'owner_state', 'owner_zip', 'ownership_id',
       'percent_owned', 'perc_owned', 'num_owners', 'idx', 'owner_1',
       'owner_2', 'owner_3', 'owner_4', 'perc_owner_1', 'perc_owner_2',
       'perc_owner_3', 'perc_owner_4'],
      dtype='object')

In [10]:
owners.rename(columns = {'owner_name':'company_name', 
                         'owner_street_address':'company_address',
                         'owner_state': 'company_state',
                          'owner_city': 'company_city',
                          'owner_zip': 'company_zip',
                           'ownership_id': 'company_id'},inplace=True)
gen_data.rename(columns = {'utility_name':'company_name'}, inplace=True)
utils.rename(columns = {'utility_name':'company_name',
                       'utility_id': 'company_id',
                        'street_address':'company_address',
                         'state':'company_state',
                         'city': 'company_city', 
                          'zip':'company_zip'},inplace=True)

In [13]:
# make df with master list of companies (utilities + owners)
companyVals = ['name', 'address','state','city','zip','id']
companyVals = ['company_' + x for x in companyVals]
companies = owners[companyVals].append(utils[companyVals])
# select unique combos
companies.drop_duplicates(inplace=True)

In [14]:
names = gen_data['company_name'].drop_duplicates()
# I found that no names in gen data are not contained in the other datasets, so I don't merge it in

In [15]:
def shorten_rd(address):
    '''Completes the road type. I.e. Rd becomes Road, st becomes Street as per Google etc.'''
    address = address.title()
    address = re.sub(r" Street(?=$| [NE(So|S$)(We|W$)])", ' St', address)
    address = re.sub(r" Road(?=$| [NE(So|S$)(We|W$)])", ' Rd', address)
    address = re.sub(r"(?<!The) Avenue(?=$| [NE(So|S$)(We|W$)])", ' Ave', address)
    address = re.sub(r" Close(?=$| [NE(So|S$)(We|W$)])", ' Cl', address)
    address = re.sub(r" Court(?=$| [NE(So|S$)(We|W$)])", ' Ct', address)
    address = re.sub(r"(?<!The) Crescent(?=$| [NE(So|S$)(We|W$)])", ' Cres', address)
    address = re.sub(r" Boulevarde?(?=$| [NE(So|S$)(We|W$)])", ' Blvd', address)
    address = re.sub(r" Drive(?=$| [NE(So|S$)(We|W$)])", ' Dr', address)
    address = re.sub(r" Lane(?=$| [NE(So|S$)(We|W$)])", ' Ln', address)
    address = re.sub(r" Place(?=$| [NE(So|S$)(We|W$)])", ' Pl', address)
    address = re.sub(r" Square(?=$| [NE(So|S$)(We|W$)])", ' Sq', address)
    address = re.sub(r"(?<!The) Parade(?=$| [NE(So|S$)(We|W$)])", ' Pde', address)
    address = re.sub(r" Circuit(?=$| [NE(So|S$)(We|W$)])", ' Cct', address)
    return address

def lengthen_rd(address):
    address = address.title()
    address = re.sub(r" St(?=$| [NE(So|S$)(We|W$)])", " Street", address)
    address = re.sub(r" Rd(?=$| [NE(So|S$)(We|W$)])", " Road", address)
    address = re.sub(r" Ave(?=$| [NE(So|S$)(We|W$)])", " Avenue", address)
    address = re.sub(r" Cl(?=$| [NE(So|S$)(We|W$)])", " Close", address)
    address = re.sub(r" Ct(?=$| [NE(So|S$)(We|W$)])", " Court", address)
    address = re.sub(r" Cres(?=$| [NE(So|S$)(We|W$)])", " Crescent", address)
    address = re.sub(r" Blvd(?=$| [NE(So|S$)(We|W$)])", " Boulevard", address)
    address = re.sub(r" Dr(?=$| [NE(So|S$)(We|W$)])", " Drive", address)
    address = re.sub(r" Ln(?=$| [NE(So|S$)(We|W$)])", " Lane", address)
    address = re.sub(r" Pl(?=$| [NE(So|S$)(We|W$)])", " Place", address)
    address = re.sub(r" Sq(?=$| [NE(So|S$)(We|W$)])", " Square", address)
    address = re.sub(r" Pde(?=$| [NE(So|S$)(We|W$)])", " Parade", address)
    address = re.sub(r" Cct(?=$| [NE(So|S$)(We|W$)])", " Circuit", address)
    return address

def standard_addr(address):
    '''Checks for unit numbers and street addresses and puts them in the standard format''' 
    #print("################################")
    #print("### Address: ", address)
    unit_nums = re.findall(r"(?<=Unit )\w?\d+\w?|(?<=U)\d+\w?|\w?\d+\w?(?=\s*/)", address)
    unit_num = unit_nums[0] if len(unit_nums)==1 else ""
    #print("Unit Number: ", unit_num)
    proc_addr = re.sub(r"Unit \w?\d+\w?/?|U\d+\w?/?|\w?\d+\w?\s*/", "", address)
    proc_addr = re.sub(r"^[,\- ]+|[,\- ]+$", "", proc_addr)
    #print("Unitless address: ", proc_addr)
    type_opts = r"Terrace|Way|Walk|St|Rd|Ave|Cl|Ct|Cres|Blvd|Dr|Ln|Pl|Sq|Pde|Cct"
    road_attrs_pattern = r"(?P<rd_no>\w?\d+(\-\d+)?\w?\s+)(?P<rd_nm>[a-zA-z \d\-]+)\s+(?P<rd_tp>" + type_opts + ")"
    #print("Road Attr Pattern: ", road_attrs_pattern)
    road_attrs = re.search(road_attrs_pattern, proc_addr)
    try:
        road_num = road_attrs.group('rd_no').strip()
    except AttributeError:
        road_num = ""
    #print("Road number: ", road_num)
    try:
        road_name = road_attrs.group('rd_nm').strip()
    except AttributeError:
        road_name = ""
    #print("Road name: ", road_name)
    try:
        road_type = road_attrs.group('rd_tp').strip()
    except AttributeError:
        road_type = ""
    #print("Road type: ", road_type)
    proc_addr = lengthen_rd(re.sub(r"^[,\- ]+|[,\- ]+$", "", re.sub(road_attrs_pattern, "", proc_addr)))
    #print("Leftover: ", proc_addr)

    unit_seg = (unit_num + "/" if unit_num!="" else "") if road_num != "" else ("Unit " + unit_num + ", " if unit_num!="" else "")
    road_seg = ((road_num + " " if road_num!="" else "") + road_name + " " + road_type).strip()
    post_road_seg = " " + proc_addr if proc_addr != "" else ""
    proc_addr = (unit_seg + road_seg) + post_road_seg
    #print("### Processed Address: ", proc_addr)
    return proc_addr

In [16]:
# clean addresses
companies['raw_address'] = companies['company_address']
companies['company_address'] =companies['raw_address'].apply(lambda x: standard_addr(str(x)))

# first assign company name a common var
companies['name_group'] = companies.groupby('company_name').grouper.group_info[0]

# now assign clean address common var
companies['address_group']=companies.groupby('company_address').grouper.group_info[0]

In [17]:
companies['company_name'] = companies.company_name.str.lower()

In [18]:
def apply_parent_dict(df, name, par_dict):
    df['parent_group'] = df[name].str.lower().map(par_dict)
    df['parent_group']=df['parent_group'].fillna(df[name])
    df['parent_group'] = df['parent_group'].apply(lambda x: "duke energy" if "duke energy" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "nextera energy" if "nextera" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "nrg energy" if "nrg" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "solar star" if "solar star" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "nrg" if "agua caliente solar" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "nextera energy" if "blythe solar" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "8me" if "8me" in x.lower() else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "exelon" if "av solar" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "exelon" if "exelon" in x else x)
    return(df)

In [19]:
# now do parent dict... can drop those and deal with remainder

companies = apply_parent_dict(companies, 'company_name', parent_map)

In [20]:
companies = companies.sort_values('name_group')
name_dict = dict(zip(companies.company_name, companies.name_group))
address_dict = dict(zip(companies.company_name, companies.address_group))
par_dict =  dict(zip(companies.company_name, companies.parent_group))



In [21]:
def group_two_col(df, col1, col2):
    address_map = {} # is col1
    name_map = {} # is col2 
    group_id = [] # maps to each row in rows
    maximum_group = 0

    
    for  index, row in df.iterrows():

        if address_map.get(row[col1]) is not None:
            group_id.append(address_map[row[col1]]) 
            if name_map.get(row[col2]) is None:
                name_map[row[col2]] = address_map[row[col1]]
            continue

        if name_map.get(row[col2]) is not None:
            group_id.append(name_map[row[col2]])
        
            if address_map.get(row[col1]) is None:
                address_map[row[col1]] = name_map[row[col2]]
            continue
  
  # If we're here no group found, insert new group
        address_map[row[col1]] = maximum_group + 1 # new group id
        name_map[row[col2]] = maximum_group + 1 # new group id

        group_id.append(maximum_group + 1)

        maximum_group += 1
    
    return(group_id)
    

In [22]:
companies['id'] = group_two_col(companies,'company_address','company_name')
companies['id'] = group_two_col(companies,'id','parent_group')

In [27]:
companies[companies['id']==95]

Unnamed: 0,company_name,company_address,company_state,company_city,company_zip,company_id,raw_address,name_group,address_group,parent_group,id
3197,"adelanto solar ii, llc",700 Universe Blvd,FL,Juno Beach,33408,59212,700 Universe Blvd,127,3805,"adelanto solar ii, llc",95
3196,"adelanto solar, llc",700 Universe Blvd,FL,Juno Beach,33408,59211,700 Universe Blvd,128,3805,"adelanto solar, llc",95
1786,altagas blythe operations inc.,700 Universe Blvd,CA,Juno Beach,33408,55713,700 Universe Blvd,211,3805,altagas blythe operations inc.,95
2251,ashtabula wind iii llc,700 Universe Blvd,FL,Juno Beach,33408,56727,700 Universe Blvd,341,3805,ashtabula wind iii llc,95
2228,baldwin wind llc,700 Universe Blvd,ND,Juno Beach,33408,56688,700 Universe Blvd,441,3805,baldwin wind llc,95
...,...,...,...,...,...,...,...,...,...,...,...
1476,west texas wind energy partners llc,700 Universe Blvd,FL,Juno Beach,33408,20424,700 Universe Blvd,5917,3805,west texas wind energy partners llc,95
4950,"wheatridge wind energy, llc",700 Universe Blvd,FL,Juno Beach,33408,62668,700 Universe Blvd,5960,3805,"wheatridge wind energy, llc",95
3472,"white oak solar, llc",700 Universe Blvd,FL,Juno Beach,33408,59831,700 Universe Blvd,5971,3805,"white oak solar, llc",95
3459,"white pine solar, llc",700 Universe Blvd,FL,Juno Beach,33408,59803,700 Universe Blvd,5973,3805,"white pine solar, llc",95


In [176]:
## make company_name --> id  (with parent name) dictionary

In [24]:
companies['parent_group'] = companies.parent_group.str.replace('nrg energy','nrg')

### Now save company_name --> parent_company dictionary so I can use with the generator data

In [30]:
final_par_dict = dict(zip(companies.company_name, companies.id))

In [31]:
with open(dataDir + 'JSONFiles/final_parent_dict.json','w') as outfile:
    json.dump(final_par_dict,outfile)

In [37]:
companies.groupby('id').company_name.nunique().reset_index().sort_values(by='company_name',ascending=False)

Unnamed: 0,id,company_name
57,58,154
94,95,116
4,5,78
62,63,45
136,137,44
...,...,...
1489,1490,1
1490,1491,1
1491,1492,1
1493,1494,1


In [73]:
companies[(companies['id']==24) | (companies['id']==3363)]

Unnamed: 0,company_name,company_address,company_state,company_city,company_zip,company_id,raw_address,name_group,address_group,parent_group,id
3650,8point3 operating company llc,77 Rio Robles,CA,San Jose,95134,60024,77 Rio Robles,36,1089,8point3 operating company llc,24
4426,"pgc plano i, llc",77 Rio Robles,CA,San Jose,95134,61509,77 Rio Robles,4096,1089,"pgc plano i, llc",24
4428,"redstone solar i, llc",77 Rio Robles,CA,San Jose,95134,61513,77 Rio Robles,4484,1089,"redstone solar i, llc",24
2920,solar star arizona ii llc,1414 Harbour Way,CA,South Richmond,94804,58649,1414 Harbour Way,4937,2368,solar star,3363
2845,solar star arizona iii llc,1414 Harbour Way,CA,South Richmond,94804,58507,1414 Harbour Way,4938,2368,solar star,3363
4503,"solar star california lxvi, llc",77 Rio Robles,CA,San Jose,95134,61600,77 Rio Robles,4942,1089,solar star,24
3495,"solar star california xix, llc",15367 Avenue A,CA,Rosamond,93560,59873,15367 Avenue A,4943,288,solar star,3363
4505,"solar star california xlii, llc",77 Rio Robles,CA,San Jose,95134,61602,77 Rio Robles,4945,1089,solar star,24
4504,"solar star california xlvii, llc",77 Rio Robles,CA,San Jose,95134,61601,77 Rio Robles,4946,1089,solar star,24
3494,"solar star california xx, llc",11936 Rosamond Blvd,CA,Rosamond,93560,59872,11936 Rosamond Blvd,4949,2183,solar star,3363


In [44]:
decode_parent_map = {
    95: 'nextera',
    5: 'southern current',
    63: 'tenaska capital',
    137: 'exelon'
}