In [1]:
# Initial imports.
import pandas as pd
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Loading data
file_path = os.path.join("/content/drive/MyDrive/ml_database.csv")
df_emissions = pd.read_csv(file_path)
df_emissions.head()

Unnamed: 0,Facility Id,State,industry_type,year,total_direct_emissions,state_population,state_gdp_per_capita,state_policies_incentives,USDA_energy_invest_unit,USDA_energy_invest_$,%Renewables
0,1004377,TX,Waste Management and Remediation Services,2016,221014.75,27914064,58033.62062,140,176,83600842.01,13.4
1,1003188,TX,Plastics and Rubber Products Manufacturing,2016,31773.48,27914064,58033.62062,140,176,83600842.01,13.4
2,1007733,TX,Utilities,2016,22362.816,27914064,58033.62062,140,176,83600842.01,13.4
3,1002685,TX,Oil and Gas Extraction,2016,265377.77,27914064,58033.62062,140,176,83600842.01,13.4
4,1005601,TX,Nonmetallic Mineral Product Manufacturing,2016,55858.24,27914064,58033.62062,140,176,83600842.01,13.4


In [4]:
df_emissions.dtypes

Facility Id                    int64
State                         object
industry_type                 object
year                           int64
total_direct_emissions       float64
state_population               int64
state_gdp_per_capita         float64
state_policies_incentives      int64
USDA_energy_invest_unit        int64
USDA_energy_invest_$         float64
%Renewables                  float64
dtype: object

In [5]:
# Define the features set.
le = LabelEncoder()
X = df_emissions.copy()
X["State"] = le.fit_transform(X["State"])
X["industry_type"] = le.fit_transform(X["industry_type"])
X = X.drop(columns=["total_direct_emissions","Facility Id"])
X.head()

Unnamed: 0,State,industry_type,year,state_population,state_gdp_per_capita,state_policies_incentives,USDA_energy_invest_unit,USDA_energy_invest_$,%Renewables
0,39,48,2016,27914064,58033.62062,140,176,83600842.01,13.4
1,39,33,2016,27914064,58033.62062,140,176,83600842.01,13.4
2,39,46,2016,27914064,58033.62062,140,176,83600842.01,13.4
3,39,29,2016,27914064,58033.62062,140,176,83600842.01,13.4
4,39,28,2016,27914064,58033.62062,140,176,83600842.01,13.4


In [6]:
# Define the target set.
y = df_emissions["total_direct_emissions"].ravel()
y = y.astype('int')
y[:5]

array([221014,  31773,  22362, 265377,  55858])

In [7]:
# Split into training and testing
random_state_num = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=random_state_num)

In [8]:
# Create a StandardScaler instance
scaler = StandardScaler()

In [9]:
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

In [10]:
# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Random Forest Classifier

In [11]:
# create a random forest instance using the random forest classifier
rf_model = RandomForestClassifier(n_estimators=10, random_state=random_state_num)

In [12]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [13]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [15]:
# # Calculating the confusion matrix.
# cm = confusion_matrix(y_test, predictions)

# # Create a DataFrame from the confusion matrix.
# cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# cm_df

In [16]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [19]:
# Displaying results
# print("Confusion Matrix")
# display(cm_df)
print(f"Accuracy Score : {acc_score}")
# print("Classification Report")
# print(classification_report(y_test, predictions))

Accuracy Score : 4.6044755502348285e-05


In [20]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.02638116, 0.6894076 , 0.02625574, 0.04568744, 0.04747942,
       0.0351983 , 0.04638054, 0.04370352, 0.03950627])

In [21]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.6894076022602069, 'industry_type'),
 (0.0474794181234497, 'state_gdp_per_capita'),
 (0.046380542227896504, 'USDA_energy_invest_unit'),
 (0.04568744175648686, 'state_population'),
 (0.04370352399373131, 'USDA_energy_invest_$'),
 (0.03950627467059371, '%Renewables'),
 (0.03519830437747642, 'state_policies_incentives'),
 (0.026381157075151644, 'State'),
 (0.026255735515006963, 'year')]

### Logistic Regression Model

In [22]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [23]:
classifier.fit(X_train_scaled, y_train)

LogisticRegression()

In [24]:
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.022563661759965617
Testing Data Score: 0.0023482825306197623
