In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd



In [3]:
# Import our  dataset
emission_df = pd.read_csv("data/ml_database.csv")
emission_df.head()

Unnamed: 0,Facility Id,State,industry_type,year,total_direct_emissions,state_population,state_gdp_per_capita,state_policies_incentives,USDA_energy_invest_unit,USDA_energy_invest_$,%Renewables
0,1004377,TX,Waste Management and Remediation Services,2016,221014.75,27914064,58033.62062,140,176,83600842.01,13.4
1,1003188,TX,Plastics and Rubber Products Manufacturing,2016,31773.48,27914064,58033.62062,140,176,83600842.01,13.4
2,1007733,TX,Utilities,2016,22362.816,27914064,58033.62062,140,176,83600842.01,13.4
3,1002685,TX,Oil and Gas Extraction,2016,265377.77,27914064,58033.62062,140,176,83600842.01,13.4
4,1005601,TX,Nonmetallic Mineral Product Manufacturing,2016,55858.24,27914064,58033.62062,140,176,83600842.01,13.4


In [4]:
emission_df.columns

Index(['Facility Id', 'State', 'industry_type', 'year',
       'total_direct_emissions', 'state_population', 'state_gdp_per_capita',
       'state_policies_incentives', 'USDA_energy_invest_unit',
       'USDA_energy_invest_$', '%Renewables'],
      dtype='object')

In [5]:
# Generate our categorical variable list
emission_cat = emission_df.dtypes[emission_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
emission_df[emission_cat].nunique()

State            47
industry_type    50
dtype: int64

In [7]:
emission_df.State.value_counts()

TX    3916
LA    2042
CA    1981
PA    1407
OH    1257
IL    1241
MI    1079
NY    1023
IN     964
FL     869
OK     826
GA     819
IA     760
VA     688
KY     676
WI     655
MN     651
NC     628
TN     593
CO     573
WV     555
KS     547
MS     532
AR     491
SC     486
WA     459
NJ     458
AZ     448
NM     404
NE     363
WY     362
MA     357
UT     325
AK     318
ND     305
MD     305
OR     285
CT     203
NV     200
ID     184
SD     171
MT     141
HI     141
ME     128
DE      92
NH      85
VT      32
Name: State, dtype: int64

In [8]:
emission_df.industry_type.value_counts()

Utilities                                                                 6590
Waste Management and Remediation Services                                 5698
Chemical Manufacturing                                                    3091
Pipeline Transportation                                                   3086
Oil and Gas Extraction                                                    2799
Nonmetallic Mineral Product Manufacturing                                 1598
Food Manufacturing                                                        1591
Primary Metal Manufacturing                                               1242
Paper Manufacturing                                                       1047
Petroleum and Coal Products Manufacturing                                  835
Mining (except Oil and Gas)                                                691
Educational Services                                                       556
Transportation Equipment Manufacturing              

### No binning is required. We want to leave the columns alone because we don't want to bucket states together and cause confusion in the model

In [6]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(emission_df[emission_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(emission_cat)
encode_df.head()

Unnamed: 0,State_AK,State_AR,State_AZ,State_CA,State_CO,State_CT,State_DE,State_FL,State_GA,State_HI,...,industry_type_Support Activities for Agriculture and Forestry,industry_type_Support Activities for Mining,industry_type_Support Activities for Transportation,industry_type_Textile Mills,industry_type_Textile Product Mills,industry_type_Transportation Equipment Manufacturing,industry_type_Utilities,industry_type_Warehousing and Storage,industry_type_Waste Management and Remediation Services,industry_type_Wood Product Manufacturing
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Now that our categorical variables have been encoded, we need to merge them back into our original data frame and remove the unencoded columns

In [7]:
# Merge one-hot encoded features and drop the originals
emission_df = emission_df.merge(emission_df,left_index=True, right_index=True)
emission_df = emission_df.drop(emission_cat,1)
emission_df.head()

KeyError: "['State' 'industry_type'] not found in axis"