In [55]:
# Initial imports.
import pandas as pd
import os
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read by
# later cells. This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [88]:
# Read the emissions CSV from the mounted Drive and preview the first rows.
# The path is a single absolute string, so no path joining is required.
file_path = "/content/drive/MyDrive/ml_database2.csv"
df_emissions = pd.read_csv(file_path)
df_emissions.head()

Unnamed: 0.1,Unnamed: 0,Facility Id,State,industry_type,total_direct_emissions,state_population,state_gdp_per_capita,state_policies_incentives,USDA_energy_invest_unit,USDA_energy_invest_$,%Renewables
0,0,1004377,TX,Waste Management and Remediation Services,221014.75,27914064,58033.62062,140,176,83600842.01,13.4
1,1,1003188,TX,Plastics and Rubber Products Manufacturing,31773.48,27914064,58033.62062,140,176,83600842.01,13.4
2,2,1007733,TX,Utilities,22362.816,27914064,58033.62062,140,176,83600842.01,13.4
3,3,1002685,TX,Oil and Gas Extraction,265377.77,27914064,58033.62062,140,176,83600842.01,13.4
4,4,1005601,TX,Nonmetallic Mineral Product Manufacturing,55858.24,27914064,58033.62062,140,176,83600842.01,13.4


In [89]:
# Show each column's dtype; the object-typed columns (State, industry_type)
# are the non-numeric ones.
df_emissions.dtypes

Unnamed: 0                     int64
Facility Id                    int64
State                         object
industry_type                 object
total_direct_emissions       float64
state_population               int64
state_gdp_per_capita         float64
state_policies_incentives      int64
USDA_energy_invest_unit        int64
USDA_energy_invest_$         float64
%Renewables                  float64
dtype: object

In [104]:
# Build the feature matrix X from a copy of the raw frame, leaving
# df_emissions itself untouched. Dropped: the target column, the
# identifier/index columns, and the State / state_population columns.
le = LabelEncoder()
X = (
    df_emissions
    .copy()
    .drop(
        columns=[
            "total_direct_emissions",
            "Facility Id",
            "Unnamed: 0",
            "State",
            "state_population",
        ]
    )
)
# "State" is dropped above, so industry_type is the only text column left
# that has to be converted to integer codes.
X["industry_type"] = le.fit_transform(X["industry_type"])
X.head()

Unnamed: 0,industry_type,state_gdp_per_capita,state_policies_incentives,USDA_energy_invest_unit,USDA_energy_invest_$,%Renewables
0,41,58033.62062,140,176,83600842.01,13.4
1,30,58033.62062,140,176,83600842.01,13.4
2,39,58033.62062,140,176,83600842.01,13.4
3,26,58033.62062,140,176,83600842.01,13.4
4,25,58033.62062,140,176,83600842.01,13.4


In [105]:
# Define the target set as a 1-D NumPy array.
# Series.ravel() is deprecated in recent pandas releases (2.2+); to_numpy()
# returns the same 1-D array of values without the deprecation warning.
y = df_emissions["total_direct_emissions"].to_numpy()
# NOTE(review): this cast truncates the fractional part of each emission value
# (e.g. 221014.75 -> 221014). A regressor accepts float targets, so confirm
# the truncation is intentional before relying on it.
y = y.astype('int')
y[:5]

array([221014,  31773,  22362, 265377,  55858])

In [106]:
# Split features and target into train/test partitions with a fixed seed so
# the split is reproducible across runs.
# NOTE(review): test_size=0.7 holds out 70% of the rows for testing and leaves
# only 30% for training - confirm this was not meant to be a 70/30 train/test
# split (i.e. test_size=0.3).
random_state_num = 0
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.7,
    random_state=random_state_num,
)

In [107]:
# Create a StandardScaler instance; it is not fitted here - fitting happens in
# a later cell using the training split.
scaler = StandardScaler()

In [108]:
# Fit the scaler on the training split only, so the test rows do not influence
# the scaling statistics. The value returned by fit() is kept as X_scaler and
# used for the transforms that follow.
X_scaler = scaler.fit(X_train)

In [109]:
# Apply the train-fitted scaler to both partitions (training first, then test).
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

### Random Forest Regressor

In [110]:
# Create a random forest *regressor* (not a classifier) with 10 estimators,
# seeded with the same random_state_num used for the train/test split.
rf_model = RandomForestRegressor(n_estimators=10, random_state=random_state_num)

In [111]:
# Fit the regressor on the scaled training features and the training target;
# the value returned by fit() is re-bound to rf_model.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [112]:
# Predict the target for the scaled test features and display the resulting
# array of predicted emission values.
predictions = rf_model.predict(X_test_scaled)
predictions

array([1123468.60541574,  139392.51786064, 1321206.43958874, ...,
       1571152.750525  ,  570250.38964513,  521179.00083333])

In [113]:
# Report the model's score on the training and testing partitions.
# For a scikit-learn regressor, .score() is the coefficient of determination
# (R^2), not a classification accuracy, so accuracy_score does not apply here.
print(f"Training Data Score: {rf_model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf_model.score(X_test_scaled, y_test)}")

Training Data Score: 0.24794803307283386
Testing Data Score: 0.10075163288690658


In [114]:
# Read the fitted forest's feature importances (one value per feature column;
# the next cell pairs them with X.columns) and display the raw array.
importances = rf_model.feature_importances_
importances

array([0.50756342, 0.22449421, 0.05049421, 0.09332376, 0.08304148,
       0.04108292])

In [115]:
# Pair each importance with its column name and sort in descending order of
# importance (equal importances would fall back to comparing column names).
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.5075634155084585, 'industry_type'),
 (0.224494210981882, 'state_gdp_per_capita'),
 (0.09332376168386801, 'USDA_energy_invest_unit'),
 (0.08304147651322805, 'USDA_energy_invest_$'),
 (0.05049421463534417, 'state_policies_incentives'),
 (0.04108292067721926, '%Renewables')]