In [13]:
import pandas as pd

In [21]:
# Load the dataset
file = ("carbon_emission_analysis\ghgp_data_2010.csv")
GHG_2010_df = pd.read_csv(file, low_memory=False, thousands=',')

In [22]:
print(GHG_2010_df.columns)

Index(['Facility Id', 'FRS Id', 'Facility Name', 'City', 'State', 'Zip Code',
       'Address', 'County', 'Latitude', 'Longitude', 'Primary NAICS Code',
       'Industry Type (subparts)', 'Industry Type (sectors)',
       'Total reported direct emissions', 'CO2 emissions (non-biogenic) ',
       'Methane (CH4) emissions ', 'Nitrous Oxide (N2O) emissions ',
       'HFC emissions', 'PFC emissions', 'SF6 emissions ', 'NF3 emissions',
       'Other Fully Fluorinated GHG emissions', 'HFE emissions',
       'Very Short-lived Compounds emissions', 'Other GHGs (metric tons CO2e)',
       'Biogenic CO2 emissions (metric tons)', 'Stationary Combustion',
       'Electricity Generation', 'Adipic Acid Production',
       'Aluminum Production', 'Ammonia Manufacturing', 'Cement Production',
       'Ferroalloy Production', 'Glass Production',
       'HCFC�22 Production from HFC�23 Destruction', 'Hydrogen Production',
       'Iron and Steel Production', 'Lead Production', 'Lime Production',
       'Mis

In [23]:
# Select the columns to keep for the analysis
GHG_2010_df = GHG_2010_df[['Facility Id', 'Facility Name', 'State', 
                           'Latitude', 'Longitude', 
                           'Primary NAICS Code', 'Industry Type (sectors)', 
                           'Total reported direct emissions', 'CO2 emissions (non-biogenic) ']]
GHG_2010_df.head(5)

Unnamed: 0,Facility Id,Facility Name,State,Latitude,Longitude,Primary NAICS Code,Industry Type (sectors),Total reported direct emissions,CO2 emissions (non-biogenic)
0,1004377,121 REGIONAL DISPOSAL FACILITY,TX,33.29857,-96.53586,562212,Waste,0.0,0.0
1,1000112,23rd and 3rd,NY,40.663,-74.0,221112,Power Plants,82959.74,82875.9
2,1006394,29-6 #2 Central Delivery Point,NM,36.7452,-107.4455,213112,Petroleum and Natural Gas Systems,25176.66,25150.9
3,1002885,30-5 Central Delivery Point Compressor Station,NM,36.8118,-107.4036,213112,Petroleum and Natural Gas Systems,38883.2,38843.2
4,1002707,31-6 Central Delivery Point,NM,36.8363,-107.4199,213112,Petroleum and Natural Gas Systems,63884.36,63818.6


In [24]:
# Check the column data type
GHG_2010_df.dtypes

Facility Id                          int64
Facility Name                       object
State                               object
Latitude                           float64
Longitude                          float64
Primary NAICS Code                   int64
Industry Type (sectors)             object
Total reported direct emissions    float64
CO2 emissions (non-biogenic)       float64
dtype: object

In [37]:
# Remove the null value
GHG_2010_df.isnull()

Unnamed: 0,Fac_ID,Fac_Name,State,Latitude,Longitude,NAICS,Sector,Total_Emissions,CO2_Emissions
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
6292,False,False,False,False,False,False,False,False,False
6293,False,False,False,False,False,False,False,False,False
6294,False,False,False,False,False,False,False,False,False
6295,False,False,False,False,False,False,False,False,False


In [38]:
GHG_2010_df.dropna(inplace=True)

In [42]:
GHG_2010_df = GHG_2010_df.rename(columns = {"Facility Id": "Fac_ID",
                                           "Facility Name": "Fac_Name",
                                           "Primary NAICS Code": "NAICS",
                                           "Industry Type (sectors)": "Sector",
                                           "Total reported direct emissions": "Total_Emissions",
                                           "CO2 emissions (non-biogenic) ": "CO2_Emissions"})

In [43]:
GHG_2010_df.isnull().count()

Fac_ID             5563
Fac_Name           5563
State              5563
Latitude           5563
Longitude          5563
NAICS              5563
Sector             5563
Total_Emissions    5563
CO2_Emissions      5563
dtype: int64

In [44]:
GHG_2010_df['Fac_ID'] = GHG_2010_df['Fac_ID'].astype(int)
GHG_2010_df['NAICS'] = GHG_2010_df['NAICS'].astype(int)
GHG_2010_df.head(5)

Unnamed: 0,Fac_ID,Fac_Name,State,Latitude,Longitude,NAICS,Sector,Total_Emissions,CO2_Emissions
0,1004377,121 REGIONAL DISPOSAL FACILITY,TX,33.29857,-96.53586,562212,Waste,0.0,0.0
1,1000112,23rd and 3rd,NY,40.663,-74.0,221112,Power Plants,82959.74,82875.9
2,1006394,29-6 #2 Central Delivery Point,NM,36.7452,-107.4455,213112,Petroleum and Natural Gas Systems,25176.66,25150.9
3,1002885,30-5 Central Delivery Point Compressor Station,NM,36.8118,-107.4036,213112,Petroleum and Natural Gas Systems,38883.2,38843.2
4,1002707,31-6 Central Delivery Point,NM,36.8363,-107.4199,213112,Petroleum and Natural Gas Systems,63884.36,63818.6


In [45]:
GHG_2010_df.to_csv("GHG_2010.csv", index = False, header = True)