In [1]:
import pandas as pd

In [2]:
# Load the dataset
file = ("ghgp_data_2010.csv")
GHG_2010_df = pd.read_csv(file, low_memory=False)

In [3]:
print(GHG_2010_df.columns)

Index(['Facility Id', 'FRS Id', 'Facility Name', 'City', 'State', 'Zip Code',
       'Address', 'County', 'Latitude', 'Longitude', 'Primary NAICS Code',
       'Industry Type (subparts)', 'Industry Type (sectors)',
       'Total reported direct emissions', 'CO2 emissions (non-biogenic) ',
       'Methane (CH4) emissions ', 'Nitrous Oxide (N2O) emissions ',
       'HFC emissions', 'PFC emissions', 'SF6 emissions ', 'NF3 emissions',
       'Other Fully Fluorinated GHG emissions', 'HFE emissions',
       'Very Short-lived Compounds emissions', 'Other GHGs (metric tons CO2e)',
       'Biogenic CO2 emissions (metric tons)', 'Stationary Combustion',
       'Electricity Generation', 'Adipic Acid Production',
       'Aluminum Production', 'Ammonia Manufacturing', 'Cement Production',
       'Ferroalloy Production', 'Glass Production',
       'HCFC�22 Production from HFC�23 Destruction', 'Hydrogen Production',
       'Iron and Steel Production', 'Lead Production', 'Lime Production',
       'Mis

In [4]:
# Select the columns to keep for the analysis
GHG_2010_df = GHG_2010_df[['Facility Id', 'Facility Name', 'State', 
                           'Latitude', 'Longitude', 
                           'Primary NAICS Code', 'Industry Type (sectors)', 
                           'Total reported direct emissions', 'CO2 emissions (non-biogenic) ']]

In [5]:
# Check the column data type
GHG_2010_df.dtypes

Facility Id                          int64
Facility Name                       object
State                               object
Latitude                           float64
Longitude                          float64
Primary NAICS Code                   int64
Industry Type (sectors)             object
Total reported direct emissions     object
CO2 emissions (non-biogenic)        object
dtype: object

In [6]:
# Remove the null value
GHG_2010_df.isnull()

Unnamed: 0,Facility Id,Facility Name,State,Latitude,Longitude,Primary NAICS Code,Industry Type (sectors),Total reported direct emissions,CO2 emissions (non-biogenic)
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
6292,False,False,False,False,False,False,False,False,False
6293,False,False,False,False,False,False,False,False,False
6294,False,False,False,False,False,False,False,False,False
6295,False,False,False,False,False,False,False,False,False


In [7]:
GHG_2010_df.dropna()

Unnamed: 0,Facility Id,Facility Name,State,Latitude,Longitude,Primary NAICS Code,Industry Type (sectors),Total reported direct emissions,CO2 emissions (non-biogenic)
0,1004377,121 REGIONAL DISPOSAL FACILITY,TX,33.298570,-96.535860,562212,Waste,0.00,0.00
1,1000112,23rd and 3rd,NY,40.663000,-74.000000,221112,Power Plants,82959.74,82875.90
2,1006394,29-6 #2 Central Delivery Point,NM,36.745200,-107.445500,213112,Petroleum and Natural Gas Systems,25176.66,25150.90
3,1002885,30-5 Central Delivery Point Compressor Station,NM,36.811800,-107.403600,213112,Petroleum and Natural Gas Systems,38883.20,38843.20
4,1002707,31-6 Central Delivery Point,NM,36.836300,-107.419900,213112,Petroleum and Natural Gas Systems,63884.36,63818.60
...,...,...,...,...,...,...,...,...,...
6292,1003318,ZYBACH CRYOGENIC PLANT,TX,35.605000,-100.148333,211111,Petroleum and Natural Gas Systems,53508.55,53453.20
6293,1001308,Zeeland Generating Station,MI,42.820000,-85.998800,221112,Power Plants,455591.16,455128.10
6294,1000805,Zion Energy Center,IL,42.477600,-87.895000,221112,Power Plants,84045.04,83959.00
6295,1001464,Zuni,CO,39.736900,-105.016900,221330,Power Plants,44410.74,44365.80


In [8]:
GHG_2010_df = GHG_2010_df.rename(columns = {"Facility Id": "Fac_ID",
                                           "Facility Name": "Fac_Name",
                                           "Primary NAICS Code": "NAICS",
                                           "Industry Type (sectors)": "Sector",
                                           "Total reported direct emissions": "Total_Emissions",
                                           "CO2 emissions (non-biogenic) ": "CO2_Emissions"})
GHG_2010_df.head(8)

Unnamed: 0,Fac_ID,Fac_Name,State,Latitude,Longitude,NAICS,Sector,Total_Emissions,CO2_Emissions
0,1004377,121 REGIONAL DISPOSAL FACILITY,TX,33.29857,-96.53586,562212,Waste,0.0,0.0
1,1000112,23rd and 3rd,NY,40.663,-74.0,221112,Power Plants,82959.74,82875.9
2,1006394,29-6 #2 Central Delivery Point,NM,36.7452,-107.4455,213112,Petroleum and Natural Gas Systems,25176.66,25150.9
3,1002885,30-5 Central Delivery Point Compressor Station,NM,36.8118,-107.4036,213112,Petroleum and Natural Gas Systems,38883.2,38843.2
4,1002707,31-6 Central Delivery Point,NM,36.8363,-107.4199,213112,Petroleum and Natural Gas Systems,63884.36,63818.6
5,1003742,31st Street Landfill,IL,41.83491,-87.91633,562212,Waste,12869.0,
6,1002718,32-7 Central Delivery Point,NM,36.9313,-107.5604,213112,Petroleum and Natural Gas Systems,56478.59,56420.5
7,1002721,32-8 #2 CDP COMPRESSOR STATION,NM,36.9569,-107.6631,213112,Petroleum and Natural Gas Systems,55075.09,55018.1


In [9]:
GHG_2010_df.to_csv("GHG_2010.csv", index = 'False', header = 'True')