In [494]:
import pandas as pd
from collections import defaultdict

In [495]:
# Load the two inputs. The NAICS code column of each file is read as `object`
# so the codes stay strings (later cells use the .str accessor on them).
data = pd.read_csv(
    "output/describeNetwork.csv",
    dtype={"primary_naics": object},
)
naics = pd.read_csv(
    "data/naics.csv",
    dtype={"2012 NAICS US Code": object},
)

In [496]:
# Discard the crosswalk columns at positions 0, 3 and 4, then give the two
# remaining columns short names (code -> "naics", title -> "indName").
naics = naics.drop(labels=naics.columns[[0, 3, 4]], axis=1)
naics.columns = ["naics", "indName"]

In [497]:
# Why 2-digit NAICS codes are not used as the join key: the crosswalk does not
# contain every 2-digit code. The rows below are all of its 2-digit entries;
# note the gaps (e.g. nothing between 23 and 42).
naics.loc[naics["naics"].str.len() == 2]

Unnamed: 0,naics,indName
1,11,"Agriculture, Forestry, Fishing and Hunting"
132,21,"Mining, Quarrying, and Oil and Gas Extraction"
180,22,Utilities
205,23,Construction
930,42,Wholesale Trade
1402,51,Information
1480,52,Finance and Insurance
1569,53,Real Estate and Rental and Leasing
1624,54,"Professional, Scientific, and Technical Services"
1718,55,Management of Companies and Enterprises


In [498]:
# Join key for the crosswalk: the first three characters of each facility's NAICS code.
data["naics3"] = data["primary_naics"].str.slice(stop=3)

In [499]:
# Left-join the crosswalk onto the facilities by 3-digit code so every facility
# row is kept; facilities without a crosswalk match get a missing indName.
# The crosswalk's own "naics" key duplicates naics3 after the join, so drop it.
data = pd.merge(
    data,
    naics,
    how="left",
    left_on="naics3",
    right_on="naics",
).drop("naics", axis=1)

In [500]:
# Persist the facility table, now carrying naics3 and indName. to_csv is called
# with its defaults, so the DataFrame index is written as the first column.
data.to_csv('output/facilitiesWithNaics.csv')

In [501]:
# Build a dict keyed by community id. Each entry holds two DataFrames:
#   'ungroupedData' - the facility rows belonging to that community
#   'groupedData'   - facility counts per industry, which get written to
#                     individual .csv files
communities = defaultdict(dict)

for i in data['Community'].unique():
    commData = data[data['Community']==i]
    communities[i]['ungroupedData'] = commData
    # Count facilities per (naics3, indName) pair, largest first.
    # Series.order() was removed from pandas; sort_values() is its replacement.
    # reset_index(name=...) names the count column directly instead of relying
    # on the default column label 0.
    # NOTE: groupby leaves out rows whose indName is missing (no crosswalk
    # match), while len(commData) below still counts them, so the percentages
    # of a community can sum to less than 100.
    industries = (
        commData.groupby(['naics3', 'indName'])
        .size()
        .sort_values(ascending=False)
        .reset_index(name='countPerComm')
    )
    industries['percentPerComm'] = (industries['countPerComm'] / len(commData))*100
    communities[i]['groupedData'] = industries



In [502]:
# One CSV per community, holding that community's per-industry counts.
for commId, frames in communities.items():
    outPath = 'output/community{}.csv'.format(commId)
    frames['groupedData'].to_csv(outPath)

In [503]:
# Same per-community industry counts as above, but each frame is tagged with its
# community id so they can be combined into one master DataFrame / .csv
# instead of separate files.
commDict = {}

for i in data['Community'].unique():
    commData = data[data['Community']==i]
    # Count facilities per (naics3, indName) pair, largest first.
    # Series.order() was removed from pandas; sort_values() is its replacement.
    # reset_index(name=...) names the count column directly instead of relying
    # on the default column label 0.
    # NOTE: groupby leaves out rows whose indName is missing (no crosswalk
    # match), while len(commData) below still counts them, so the percentages
    # of a community can sum to less than 100.
    industries = (
        commData.groupby(['naics3', 'indName'])
        .size()
        .sort_values(ascending=False)
        .reset_index(name='countPerComm')
    )
    industries['percentPerComm'] = (industries['countPerComm'] / len(commData))*100
    industries['community'] = i
    commDict[i] = industries



In [504]:
# Stack the per-community frames (in commDict's iteration order) into one table.
commDF = pd.concat(list(commDict.values()))

# Rotate the last column ('community', added last in the loop) to the front.
cols = commDF.columns.tolist()
cols.insert(0, cols.pop())

# Apply the new column order and replace the repeated per-frame row labels
# with a fresh 0..n-1 index.
commDF = commDF[cols].reset_index(drop=True)

In [505]:
# Write the combined per-community industry table to a single file. to_csv is
# called with its defaults, so the DataFrame index is written as the first column.
commDF.to_csv('output/allCommunities.csv')

In [506]:
# Show the first three rows of every community. Each community's rows were
# sorted by count (descending) before concatenation, so these are its three
# most common industries.
commDF.groupby('community').head(3)

Unnamed: 0,community,naics3,indName,countPerComm,percentPerComm
0,0,325,Chemical Manufacturing,30,21.126761
1,0,332,Fabricated Metal Product Manufacturing,21,14.788732
2,0,334,Computer and Electronic Product Manufacturing,14,9.859155
22,1,424,"Merchant Wholesalers, Nondurable Goods",28,35.0
23,1,325,Chemical Manufacturing,17,21.25
24,1,326,Plastics and Rubber Products Manufacturing,6,7.5
40,2,326,Plastics and Rubber Products Manufacturing,6,75.0
41,2,336,Transportation Equipment Manufacturing,1,12.5
42,2,325,Chemical Manufacturing,1,12.5
43,3,324,Petroleum and Coal Products Manufacturing,13,27.659574


In [507]:
# Set commOfInterest to the community id you want to inspect; the cell then
# shows every facility in that community, minus the columns in colsToDrop.
commOfInterest = 5
colsToDrop = [
    'Unnamed: 0',
    'primary_naics',
    'Betweeness',
    'Closeness',
    'Eigenvector',
]
data.loc[data['Community'] == commOfInterest].drop(colsToDrop, axis=1)

Unnamed: 0,Facility,Degrees,Clustering Coefficient,parent_company_name,Community,naics3,indName
354,FENNER PRECISION,9,0.583333,,5,326,Plastics and Rubber Products Manufacturing
359,UTICA CUTLERY CO,5,1.0,,5,332,Fabricated Metal Product Manufacturing
360,API HEAT TRANSFER INC,5,1.0,API HEAT TRANSFER INC,5,332,Fabricated Metal Product Manufacturing
361,GASSER & SONS INC,5,1.0,,5,332,Fabricated Metal Product Manufacturing
