## This code creates a dictionary called final_parent_dict.txt that maps owner/entity/company name to a "parent" owner
## The dictionary can be used to analyze other micro datasets 

In [157]:
import pandas as pd
import matplotlib as plt
import re
import json
%matplotlib inline
dataDir = '/Users/rachelanderson/Dropbox (Princeton)/Data for Tax Equity Project/'
 
with open(dataDir + 'JSONFiles/parent_utility_dict.txt') as outfile:
    parent_dict = json.load(outfile)

In [158]:
parent_dict = dict((k.lower(), v.lower()) for k,v in parent_dict.items())

In [159]:
# read in other parent list and merge them
parent_map = {}
subs_data = pd.read_csv(dataDir + 'company_data/good_jobs_subs_data.csv')
subs_data.columns = subs_data.columns.str.lower().str.replace(" ", "_")
subs_data.company = subs_data.company.str.lower()
subs_data.parent_company = subs_data.parent_company.str.lower()

parent_map = dict(zip(subs_data.company, subs_data.parent_company))
parent_map.update(parent_dict)

## make master list of utility names from utility dataset, generator dataset, and owners dataset

### Read in relevant datasets

In [160]:
owners = pd.read_csv(dataDir + 'master_company_list_2018/owners2018.csv')
gen_data = pd.read_csv(dataDir + 'master_company_list_2018/gen_2018.csv')
utils = pd.read_csv(dataDir + 'master_company_list_2018/utils_2018.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [161]:
dfList = [owners, gen_data, utils]


In [162]:
owners.columns

Index(['Unnamed: 0', 'utility_id', 'utility_name', 'plant_code', 'plant_name',
       'state', 'generator_id', 'status', 'owner_name', 'owner_street_address',
       'owner_city', 'owner_state', 'owner_zip', 'ownership_id',
       'percent_owned', 'perc_owned', 'num_owners', 'idx', 'owner_1',
       'owner_2', 'owner_3', 'owner_4', 'perc_owner_1', 'perc_owner_2',
       'perc_owner_3', 'perc_owner_4'],
      dtype='object')

In [163]:
owners.rename(columns = {'owner_name':'company_name', 
                         'owner_street_address':'company_address',
                         'owner_state': 'company_state',
                          'owner_city': 'company_city',
                          'owner_zip': 'company_zip',
                           'ownership_id': 'company_id'},inplace=True)
gen_data.rename(columns = {'utility_name':'company_name'}, inplace=True)
utils.rename(columns = {'utility_name':'company_name',
                       'utility_id': 'company_id',
                        'street_address':'company_address',
                         'state':'company_state',
                         'city': 'company_city', 
                          'zip':'company_zip'},inplace=True)

In [164]:
companies.head(10)

Unnamed: 0,company_name,company_address,company_state,company_city,company_zip,company_id,raw_address,name_group,address_group,parent_group,id
4964,"10 briggs solar ng, llc","267 Water St Reet, 2Nd Floor",RI,Warren,2885,62685,"267 Water St Reet, 2Nd Floor",0,2971,"10 briggs solar ng, llc",1
4899,"1025 traveller solar, llc",1447 S Tryon St St E 201,NC,Charlotte,28203,62139,1447 S Tryon St Ste 201,1,2365,"1025 traveller solar, llc",2
4900,"1047 little mountain solar, llc",1447 S Tryon St St E 201,NC,Charlotte,28203,62140,1447 S Tryon St Ste 201,2,2365,"1047 little mountain solar, llc",2
3816,126 grove solar llc,315 Post Road West,CT,Westport,6880,60517,315 Post Road West,3,636,126 grove solar llc,3
498,126 grove solar llc,315 Post Road West,CT,Westport,6880,60517,315 Post Road West,3,636,126 grove solar llc,3
3673,158th fighter wing,105 NCO Dr Ive,VT,South Burlington,5403,60301,105 NCO Dr Ive,4,2018,158th fighter wing,4
4453,"1634 solar, llc",1519 King St Reet,SC,Charleston,29405,61541,1519 King St Reet,5,2422,"1634 solar, llc",5
4397,174 power global corp.,300 Spectrum Center Dr . Ste1020,CA,Irvine,92618,61222,300 Spectrum Center Dr . Ste1020,6,3048,174 power global corp.,6
2137,"180 raritan energy solutions, llc","633 Division St Reet, Suite 2",NJ,Elizabeth,7201,58164,"633 Division St Reet, Suite 2",7,3707,"180 raritan energy solutions, llc",7
2671,"180 raritan energy solutions, llc","633 Division St Reet, Suite 2",NJ,Elizabeth,7201,58164,"633 Division St Reet, Suite 2",7,3707,"180 raritan energy solutions, llc",7


In [165]:
# make df with master list of companies (utilities + owners)
companyVals = ['name', 'address','state','city','zip','id']
companyVals = ['company_' + x for x in companyVals]
companies = owners[companyVals].append(utils[companyVals])
# select unique combos
companies.drop_duplicates(inplace=True)

In [166]:
names = gen_data['company_name'].drop_duplicates()
# I found that no names in gen data are not contained in the other datasets, so I don't merge it in

In [167]:
def shorten_rd(address):
    '''Completes the road type. I.e. Rd becomes Road, st becomes Street as per Google etc.'''
    address = address.title()
    address = re.sub(r" Street(?=$| [NE(So|S$)(We|W$)])", ' St', address)
    address = re.sub(r" Road(?=$| [NE(So|S$)(We|W$)])", ' Rd', address)
    address = re.sub(r"(?<!The) Avenue(?=$| [NE(So|S$)(We|W$)])", ' Ave', address)
    address = re.sub(r" Close(?=$| [NE(So|S$)(We|W$)])", ' Cl', address)
    address = re.sub(r" Court(?=$| [NE(So|S$)(We|W$)])", ' Ct', address)
    address = re.sub(r"(?<!The) Crescent(?=$| [NE(So|S$)(We|W$)])", ' Cres', address)
    address = re.sub(r" Boulevarde?(?=$| [NE(So|S$)(We|W$)])", ' Blvd', address)
    address = re.sub(r" Drive(?=$| [NE(So|S$)(We|W$)])", ' Dr', address)
    address = re.sub(r" Lane(?=$| [NE(So|S$)(We|W$)])", ' Ln', address)
    address = re.sub(r" Place(?=$| [NE(So|S$)(We|W$)])", ' Pl', address)
    address = re.sub(r" Square(?=$| [NE(So|S$)(We|W$)])", ' Sq', address)
    address = re.sub(r"(?<!The) Parade(?=$| [NE(So|S$)(We|W$)])", ' Pde', address)
    address = re.sub(r" Circuit(?=$| [NE(So|S$)(We|W$)])", ' Cct', address)
    return address

def lengthen_rd(address):
    address = address.title()
    address = re.sub(r" St(?=$| [NE(So|S$)(We|W$)])", " Street", address)
    address = re.sub(r" Rd(?=$| [NE(So|S$)(We|W$)])", " Road", address)
    address = re.sub(r" Ave(?=$| [NE(So|S$)(We|W$)])", " Avenue", address)
    address = re.sub(r" Cl(?=$| [NE(So|S$)(We|W$)])", " Close", address)
    address = re.sub(r" Ct(?=$| [NE(So|S$)(We|W$)])", " Court", address)
    address = re.sub(r" Cres(?=$| [NE(So|S$)(We|W$)])", " Crescent", address)
    address = re.sub(r" Blvd(?=$| [NE(So|S$)(We|W$)])", " Boulevard", address)
    address = re.sub(r" Dr(?=$| [NE(So|S$)(We|W$)])", " Drive", address)
    address = re.sub(r" Ln(?=$| [NE(So|S$)(We|W$)])", " Lane", address)
    address = re.sub(r" Pl(?=$| [NE(So|S$)(We|W$)])", " Place", address)
    address = re.sub(r" Sq(?=$| [NE(So|S$)(We|W$)])", " Square", address)
    address = re.sub(r" Pde(?=$| [NE(So|S$)(We|W$)])", " Parade", address)
    address = re.sub(r" Cct(?=$| [NE(So|S$)(We|W$)])", " Circuit", address)
    return address

def standard_addr(address):
    '''Checks for unit numbers and street addresses and puts them in the standard format''' 
    #print("################################")
    #print("### Address: ", address)
    unit_nums = re.findall(r"(?<=Unit )\w?\d+\w?|(?<=U)\d+\w?|\w?\d+\w?(?=\s*/)", address)
    unit_num = unit_nums[0] if len(unit_nums)==1 else ""
    #print("Unit Number: ", unit_num)
    proc_addr = re.sub(r"Unit \w?\d+\w?/?|U\d+\w?/?|\w?\d+\w?\s*/", "", address)
    proc_addr = re.sub(r"^[,\- ]+|[,\- ]+$", "", proc_addr)
    #print("Unitless address: ", proc_addr)
    type_opts = r"Terrace|Way|Walk|St|Rd|Ave|Cl|Ct|Cres|Blvd|Dr|Ln|Pl|Sq|Pde|Cct"
    road_attrs_pattern = r"(?P<rd_no>\w?\d+(\-\d+)?\w?\s+)(?P<rd_nm>[a-zA-z \d\-]+)\s+(?P<rd_tp>" + type_opts + ")"
    #print("Road Attr Pattern: ", road_attrs_pattern)
    road_attrs = re.search(road_attrs_pattern, proc_addr)
    try:
        road_num = road_attrs.group('rd_no').strip()
    except AttributeError:
        road_num = ""
    #print("Road number: ", road_num)
    try:
        road_name = road_attrs.group('rd_nm').strip()
    except AttributeError:
        road_name = ""
    #print("Road name: ", road_name)
    try:
        road_type = road_attrs.group('rd_tp').strip()
    except AttributeError:
        road_type = ""
    #print("Road type: ", road_type)
    proc_addr = lengthen_rd(re.sub(r"^[,\- ]+|[,\- ]+$", "", re.sub(road_attrs_pattern, "", proc_addr)))
    #print("Leftover: ", proc_addr)

    unit_seg = (unit_num + "/" if unit_num!="" else "") if road_num != "" else ("Unit " + unit_num + ", " if unit_num!="" else "")
    road_seg = ((road_num + " " if road_num!="" else "") + road_name + " " + road_type).strip()
    post_road_seg = " " + proc_addr if proc_addr != "" else ""
    proc_addr = (unit_seg + road_seg) + post_road_seg
    #print("### Processed Address: ", proc_addr)
    return proc_addr

In [168]:
# clean addresses
companies['raw_address'] = companies['company_address']
companies['company_address'] =companies['raw_address'].apply(lambda x: standard_addr(str(x)))

# first assign company name a common var
companies['name_group'] = companies.groupby('company_name').grouper.group_info[0]

# now assign clean address common var
companies['address_group']=companies.groupby('company_address').grouper.group_info[0]

In [169]:
companies['company_name'] = companies.company_name.str.lower()

In [170]:
def apply_parent_dict(df, name, par_dict):
    df['parent_group'] = df[name].str.lower().map(par_dict)
    df['parent_group']=df['parent_group'].fillna(df[name])
    df['parent_group'] = df['parent_group'].apply(lambda x: "duke energy" if "duke energy" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "nextera energy" if "nextera" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "nrg energy" if "nrg" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "solar star" if "solar star" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "nrg" if "agua caliente solar" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "nextera energy" if "blythe solar" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "8me" if "8me" in x.lower() else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "exelon" if "av solar" in x else x)
    df['parent_group'] = df['parent_group'].apply(lambda x: "exelon" if "exelon" in x else x)
    return(df)

In [171]:
# now do parent dict... can drop those and deal with remainder

companies = apply_parent_dict(companies, 'company_name', parent_map)

In [172]:
companies = companies.sort_values('name_group')
name_dict = dict(zip(companies.company_name, companies.name_group))
address_dict = dict(zip(companies.company_name, companies.address_group))
par_dict =  dict(zip(companies.company_name, companies.parent_group))



In [173]:
def group_two_col(df, col1, col2):
    address_map = {} # is col1
    name_map = {} # is col2 
    group_id = [] # maps to each row in rows
    maximum_group = 0

    
    for  index, row in df.iterrows():

        if address_map.get(row[col1]) is not None:
            group_id.append(address_map[row[col1]]) 
            if name_map.get(row[col2]) is None:
                name_map[row[col2]] = address_map[row[col1]]
            continue

        if name_map.get(row[col2]) is not None:
            group_id.append(name_map[row[col2]])
        
            if address_map.get(row[col1]) is None:
                address_map[row[col1]] = name_map[row[col2]]
            continue
  
  # If we're here no group found, insert new group
        address_map[row[col1]] = maximum_group + 1 # new group id
        name_map[row[col2]] = maximum_group + 1 # new group id

        group_id.append(maximum_group + 1)

        maximum_group += 1
    
    return(group_id)
    

In [174]:
companies['id'] = group_two_col(companies,'company_address','company_name')
companies['id'] = group_two_col(companies,'id','parent_group')

In [175]:
companies.drop_duplicates(subset='id',keep='first')

Unnamed: 0,company_name,company_address,company_state,company_city,company_zip,company_id,raw_address,name_group,address_group,parent_group,id
4964,"10 briggs solar ng, llc","267 Water St Reet, 2Nd Floor",RI,Warren,02885,62685,"267 Water Street, 2nd Floor",0,2984,"10 briggs solar ng, llc",1
4899,"1025 traveller solar, llc",1447 S Tryon St Ste 201,NC,Charlotte,28203,62139,"1447 S Tryon St, Ste 201",1,2387,"1025 traveller solar, llc",2
3816,126 grove solar llc,315 Post Road West,CT,Westport,06880,60517,315 Post Road West,3,647,126 grove solar llc,3
3673,158th fighter wing,105 NCO Dr Ive,VT,South Burlington,05403,60301,105 NCO Drive,4,2040,158th fighter wing,4
4453,"1634 solar, llc",1519 King St Reet,SC,Charleston,29405,61541,1519 King Street,5,2444,"1634 solar, llc",5
...,...,...,...,...,...,...,...,...,...,...,...
1519,ziegler power systems,8050 State Highway 101East,MN,Shakopee,55379,21144,8050 State Highway 101East,6090,1116,ziegler power systems,4001
1522,zion energy llc,5701 Ninth St,IL,Zion,60099,21191,5701 Ninth St,6091,3614,zion energy llc,4002
2371,zotos international,300 Forge Ave Nue,NY,Geneva,14456,56977,300 Forge Avenue,6092,3049,zotos international,4003
3914,ebay - south jordan,6614 West Crimson View Dr,UT,South Jordan,84095,59095,6614 West Crimson View Dr,6094,3759,ebay - south jordan,4004


In [176]:
## make company_name --> id  (with parent name) dictionary

In [189]:
companies[companies['parent_group'].str.contains('nrg')]['parent_group'] = 'nrg energy'
companies[companies['parent_group'].str.contains('nrg')]['parent_group']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


3276           nrg
957     nrg energy
1972    nrg energy
565     nrg energy
3292    nrg energy
           ...    
1865    nrg energy
3429    nrg energy
3560    nrg energy
3083    nrg energy
2641    nrg energy
Name: parent_group, Length: 63, dtype: object

### Now save company_name --> parent_company dictionary so I can use with the generator data

In [178]:
final_par_dict = dict(zip(companies.company_name, companies.parent_group))

In [182]:
with open(dataDir + 'JSONFiles/final_parent_dict.json','w') as outfile:
    json.dump(final_par_dict,outfile)