In [452]:
import pandas as pd
import statsmodels.formula.api as smf
from collections import defaultdict

In [453]:
# Load the per-facility network table. primary_naics is read as a string column
# so that its leading digits can be sliced off with .str later on.
data = pd.read_csv(
    "output/describeNetwork.csv",
    dtype={'primary_naics': object},
)

In [454]:
# Load the NAICS crosswalk. The code column is kept as strings so it can be
# compared against the string prefixes derived from primary_naics.
naics = pd.read_csv(
    "data/naics.csv",
    dtype={'2012 NAICS US Code': object},
)

In [455]:
# Discard the crosswalk columns at positions 0, 3 and 4; the columns that remain
# are renamed in the next cell.
# NOTE: this cell is not idempotent - re-running it drops by position again.
unused_positions = [0, 3, 4]
naics = naics.drop(naics.columns[unused_positions], axis=1)

In [456]:
# Give the two remaining crosswalk columns short names: the code ("naics") and
# the industry title ("indName").
naics.columns = ["naics", "indName"]

In [457]:
# data["naics2dig"] = data['primary_naics'].str[:2]
# data["naics3dig"] = data['primary_naics'].str[:3]

In [458]:
# data = data.merge(naics, left_on=["naics2dig"], right_on="naics").merge(naics, left_on=["naics3dig"], right_on="naics", suffixes=("2", "3"))
# data = data.drop(["naics2dig", "naics3dig"], axis=1)

In [459]:
# List the crosswalk rows whose code is exactly 2 digits long. Not every 2-digit
# prefix has such a row (e.g. 32 and 33, which occur in the facility data, are
# absent from this listing), which is why 2-digit codes could not be used for the
# merge and 3-digit codes are used instead.
two_digit_rows = naics['naics'].str.len() == 2
naics.loc[two_digit_rows]

Unnamed: 0,naics,indName
1,11,"Agriculture, Forestry, Fishing and Hunting"
132,21,"Mining, Quarrying, and Oil and Gas Extraction"
180,22,Utilities
205,23,Construction
930,42,Wholesale Trade
1402,51,Information
1480,52,Finance and Insurance
1569,53,Real Estate and Rental and Leasing
1624,54,"Professional, Scientific, and Technical Services"
1718,55,Management of Companies and Enterprises


In [460]:
# Derive the 2- and 3-digit NAICS prefixes of each facility's primary code.
primary_codes = data['primary_naics']
data["naics2"] = primary_codes.str.slice(stop=2)
data["naics3"] = primary_codes.str.slice(stop=3)

In [461]:
# Attach the industry name for each facility's 3-digit prefix. A left join keeps
# facilities whose prefix has no crosswalk row (their indName is left missing).
data = pd.merge(
    data,
    naics,
    how="left",
    left_on=["naics3"],
    right_on="naics",
)

In [462]:
# The crosswalk's join key duplicates naics3 after the merge, so remove it.
data = data.drop(["naics"], axis=1)

In [463]:
# Preview the first rows to check the naics2 / naics3 / indName columns added above.
data.head()

Unnamed: 0.1,Unnamed: 0,Facility,Degrees,Clustering Coefficient,Betweeness,Closeness,Eigenvector,primary_naics,parent_company_name,Community,naics2,naics3,indName
0,0,IBM CORP,236,0.264443,0.060645,0.728713,0.056001,334111,IBM CORP,0,33,334,Computer and Electronic Product Manufacturing
1,1,IBM CORP,236,0.264443,0.060645,0.728713,0.056001,334413,IBM CORP,0,33,334,Computer and Electronic Product Manufacturing
2,2,EASTMAN KODAK CO EASTMAN BUSINESS PARK,228,0.284218,0.058787,0.717349,0.123391,325992,EASTMAN KODAK CO,0,32,325,Chemical Manufacturing
3,3,NORLITE LLC,225,0.243294,0.10651,0.720157,0.211257,327992,TRADEBE ENVIRONMENTAL SERVICES LLC,1,32,327,Nonmetallic Mineral Product Manufacturing
4,4,ALCOA INC,193,0.332038,0.05108,0.667877,0.05454,331313,ALCOA INC,4,33,331,Primary Metal Manufacturing


In [464]:
# Save the facility table with the NAICS prefixes and industry names attached.
# NOTE(review): index=False is not passed, so the row index is written as an
# extra unnamed column - confirm downstream readers expect that.
data.to_csv('output/facilitiesWithNaics.csv')

In [465]:
# Maps community id -> dict of DataFrames ('ungroupedData', 'groupedData'),
# filled in by the loop in the next cell.
communities = defaultdict(dict)

In [466]:
# Build, for every community, a dict of DataFrames: the community's raw rows
# ('ungroupedData') and its industry breakdown ('groupedData'). The grouped
# frames are written to individual .csv files in the following cell.
for i in data['Community'].unique():
    commData = data[data['Community'] == i]
    communities[i]['ungroupedData'] = commData
    # Count facilities per (3-digit code, industry name), largest first.
    # Series.order() no longer exists in pandas; sort_values() is its replacement.
    # reset_index(name=...) names the count column directly instead of relying on
    # the implicit column label 0.
    industries = (
        commData.groupby(['naics3', 'indName'])
        .size()
        .sort_values(ascending=False)
        .reset_index(name='countPerComm')
    )
    # The share is relative to ALL rows of the community, including rows the
    # groupby leaves out because indName is missing.
    industries['percentPerComm'] = (industries['countPerComm'] / len(commData)) * 100
    communities[i]['groupedData'] = industries



In [467]:
# Write each community's industry breakdown to its own file, output/community<id>.csv.
for i, frames in communities.items():
    out_path = 'output/community{}.csv'.format(i)
    frames['groupedData'].to_csv(out_path)

In [468]:
# Maps community id -> industry-breakdown DataFrame; filled by the next cell and
# concatenated into a single table afterwards.
commDict = {}

In [469]:
# Same per-community industry breakdown as the loop above, but each frame is
# tagged with its community id so that all of them can be combined into one
# master DataFrame / .csv instead of separate files.
for i in data['Community'].unique():
    commData = data[data['Community'] == i]
#     communities[i]['ungroupedData'] = commData
    # Count facilities per (3-digit code, industry name), largest first.
    # Series.order() no longer exists in pandas; sort_values() is its replacement.
    # reset_index(name=...) names the count column directly instead of relying on
    # the implicit column label 0.
    industries = (
        commData.groupby(['naics3', 'indName'])
        .size()
        .sort_values(ascending=False)
        .reset_index(name='countPerComm')
    )
    # The share is relative to ALL rows of the community, including rows the
    # groupby leaves out because indName is missing.
    industries['percentPerComm'] = (industries['countPerComm'] / len(commData)) * 100
    industries['community'] = i
    commDict[i] = industries
    
#     communities[i]['groupedData'] = industries



In [470]:
# Stack the per-community breakdowns into one table, in commDict's iteration order.
commDF = pd.concat(list(commDict.values()))

In [471]:
# Rotate the column order by one so the last column ('community', added last in
# the loop above) becomes the first.
current_order = list(commDF.columns)
cols = current_order[-1:] + current_order[:-1]
commDF = commDF.loc[:, cols]

In [472]:
# Each concatenated piece carried its own 0..n-1 index; replace the repeated
# labels with a single fresh index and discard the old ones.
commDF = commDF.reset_index(drop=True)

In [473]:
# Display the combined per-community industry table.
commDF

Unnamed: 0,community,naics3,indName,countPerComm,percentPerComm
0,0,325,Chemical Manufacturing,30,21.126761
1,0,332,Fabricated Metal Product Manufacturing,21,14.788732
2,0,334,Computer and Electronic Product Manufacturing,14,9.859155
3,0,221,Utilities,13,9.154930
4,0,327,Nonmetallic Mineral Product Manufacturing,13,9.154930
5,0,331,Primary Metal Manufacturing,10,7.042254
6,0,311,Food Manufacturing,7,4.929577
7,0,333,Machinery Manufacturing,4,2.816901
8,0,323,Printing and Related Support Activities,4,2.816901
9,0,562,Waste Management and Remediation Services,4,2.816901


In [474]:
# Save the combined per-community industry table as a single file.
# NOTE(review): index=False is not passed, so the row index is written as an
# extra unnamed column - confirm downstream readers expect that.
commDF.to_csv('output/allCommunities.csv')