# Model Experimentation with Minimum Features

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report



In [2]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [3]:
from catboost import CatBoostClassifier, CatBoostRegressor

In [4]:

min_features = pd.read_csv('/Users/sabrinasayed/Documents/GitHub/Data_Center_Sites/Data/Minimum_Features.csv', index_col=0)
min_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7455 entries, 60 to 9556
Data columns (total 14 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                7455 non-null   object 
 1   County                               7455 non-null   object 
 2   Risk_Score                           7455 non-null   object 
 3   Risk_Rating                          7455 non-null   object 
 4   IXP_Count                            7455 non-null   float64
 5   Region                               7455 non-null   object 
 6   Regional Electricity Demand          7455 non-null   object 
 7   State Emissions Level                7455 non-null   object 
 8   Utility Annual Energy Savings (MWh)  7455 non-null   object 
 9   SAIFI Major Events                   7455 non-null   object 
 10  CAIDI w/o Major Events               7455 non-null   float64
 11  CAIDI w/ Major Events             

In [5]:
min_features.drop(columns=['Risk_Score'], inplace=True)
min_features.head()

Unnamed: 0,State,County,Risk_Rating,IXP_Count,Region,Regional Electricity Demand,State Emissions Level,Utility Annual Energy Savings (MWh),SAIFI Major Events,CAIDI w/o Major Events,CAIDI w/ Major Events,Commercial Generation Rank,Total Generation Rank
60,AL,Autauga,Relatively Low,8.0,SE,Low,Moderate,Low,Moderate,122.8,186.9,Low,High
61,AL,Autauga County,Unknown,8.0,SE,Low,Moderate,Low,Moderate,122.8,186.9,Low,High
62,AL,Baldwin,Relatively High,8.0,SE,Low,Moderate,Low,Moderate,122.8,186.9,Low,High
63,AL,Baldwin County,Unknown,8.0,SE,Low,Moderate,Low,Moderate,122.8,186.9,Low,High
64,AL,Barbour,Relatively Low,8.0,SE,Low,Moderate,Low,Moderate,122.8,186.9,Low,High


In [6]:
min_features = min_features.replace('Unknown', np.nan).dropna(how='all')

In [7]:
min_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7455 entries, 60 to 9556
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                7455 non-null   object 
 1   County                               7455 non-null   object 
 2   Risk_Rating                          2189 non-null   object 
 3   IXP_Count                            7455 non-null   float64
 4   Region                               7455 non-null   object 
 5   Regional Electricity Demand          7455 non-null   object 
 6   State Emissions Level                7413 non-null   object 
 7   Utility Annual Energy Savings (MWh)  7455 non-null   object 
 8   SAIFI Major Events                   7455 non-null   object 
 9   CAIDI w/o Major Events               7455 non-null   float64
 10  CAIDI w/ Major Events                7455 non-null   float64
 11  Commercial Generation Rank        

In [8]:
min_features.drop(columns=['Risk_Rating'], inplace=True)
min_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7455 entries, 60 to 9556
Data columns (total 12 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                7455 non-null   object 
 1   County                               7455 non-null   object 
 2   IXP_Count                            7455 non-null   float64
 3   Region                               7455 non-null   object 
 4   Regional Electricity Demand          7455 non-null   object 
 5   State Emissions Level                7413 non-null   object 
 6   Utility Annual Energy Savings (MWh)  7455 non-null   object 
 7   SAIFI Major Events                   7455 non-null   object 
 8   CAIDI w/o Major Events               7455 non-null   float64
 9   CAIDI w/ Major Events                7455 non-null   float64
 10  Commercial Generation Rank           6808 non-null   object 
 11  Total Generation Rank             

In [9]:
min_features.dropna(inplace=True)
min_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6766 entries, 60 to 9556
Data columns (total 12 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                6766 non-null   object 
 1   County                               6766 non-null   object 
 2   IXP_Count                            6766 non-null   float64
 3   Region                               6766 non-null   object 
 4   Regional Electricity Demand          6766 non-null   object 
 5   State Emissions Level                6766 non-null   object 
 6   Utility Annual Energy Savings (MWh)  6766 non-null   object 
 7   SAIFI Major Events                   6766 non-null   object 
 8   CAIDI w/o Major Events               6766 non-null   float64
 9   CAIDI w/ Major Events                6766 non-null   float64
 10  Commercial Generation Rank           6766 non-null   object 
 11  Total Generation Rank             

In [10]:
print(min_features['Regional Electricity Demand'].value_counts())
print(min_features['State Emissions Level'].value_counts())
print(min_features['Utility Annual Energy Savings (MWh)'].value_counts())
print(min_features['SAIFI Major Events'].value_counts())
print(min_features['Commercial Generation Rank'].value_counts())
print(min_features['Total Generation Rank'].value_counts())


Regional Electricity Demand
Low         2314
Moderate    2248
High        2204
Name: count, dtype: int64
State Emissions Level
Low          2571
Moderate     1956
Very High    1234
High         1005
Name: count, dtype: int64
Utility Annual Energy Savings (MWh)
Low         3882
High        2255
Moderate     629
Name: count, dtype: int64
SAIFI Major Events
Moderate    5116
Low         1620
High          30
Name: count, dtype: int64
Commercial Generation Rank
High        3864
Moderate    2029
Low          873
Name: count, dtype: int64
Total Generation Rank
High        4643
Moderate    1439
Low          684
Name: count, dtype: int64


In [11]:
min_features.head()

Unnamed: 0,State,County,IXP_Count,Region,Regional Electricity Demand,State Emissions Level,Utility Annual Energy Savings (MWh),SAIFI Major Events,CAIDI w/o Major Events,CAIDI w/ Major Events,Commercial Generation Rank,Total Generation Rank
60,AL,Autauga,8.0,SE,Low,Moderate,Low,Moderate,122.8,186.9,Low,High
61,AL,Autauga County,8.0,SE,Low,Moderate,Low,Moderate,122.8,186.9,Low,High
62,AL,Baldwin,8.0,SE,Low,Moderate,Low,Moderate,122.8,186.9,Low,High
63,AL,Baldwin County,8.0,SE,Low,Moderate,Low,Moderate,122.8,186.9,Low,High
64,AL,Barbour,8.0,SE,Low,Moderate,Low,Moderate,122.8,186.9,Low,High


# Feature encoding

In [12]:
categorical_cols = {
    #Ordinal categories
    'Regional Electricity Demand': ['Low', 'Moderate', 'High'],
    'State Emissions Level': ['Low', 'Moderate', 'High', 'Very High'],
    'Utility Annual Energy Savings (MWh)': ['Low', 'Moderate', 'High'],
    'SAIFI Major Events': ['Low', 'Moderate', 'High'],
    'Commercial Generation Rank': ['Low', 'Moderate', 'High'],
    'Total Generation Rank': ['Low', 'Moderate', 'High'],

}

nominal_cols = ['State', 'County', 'Region']

# Convert and encode ordinal categories
for col, categories in categorical_cols.items():
    min_features[col] = pd.Categorical(min_features[col], 
                                         categories=categories, 
                                         ordered=True)
    min_features[col] = min_features[col].cat.codes

# Encode nominal categories with LabelEncoder
le = LabelEncoder()
for col in nominal_cols:
    min_features[col] = le.fit_transform(min_features[col])

# Standardize numerical columns
numerical_cols = [
    'IXP_Count',
    'CAIDI w/ Major Events',
    'CAIDI w/o Major Events'
]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
min_features[numerical_cols] = scaler.fit_transform(min_features[numerical_cols])

In [13]:
min_features.head()

Unnamed: 0,State,County,IXP_Count,Region,Regional Electricity Demand,State Emissions Level,Utility Annual Energy Savings (MWh),SAIFI Major Events,CAIDI w/o Major Events,CAIDI w/ Major Events,Commercial Generation Rank,Total Generation Rank
60,0,112,-0.354132,9,0,1,0,1,0.201065,-0.479944,0,2
61,0,113,-0.354132,9,0,1,0,1,0.201065,-0.479944,0,2
62,0,120,-0.354132,9,0,1,0,1,0.201065,-0.479944,0,2
63,0,121,-0.354132,9,0,1,0,1,0.201065,-0.479944,0,2
64,0,132,-0.354132,9,0,1,0,1,0.201065,-0.479944,0,2


## Labeling the Data: Target Variable Creation


In [14]:
# Create suitability scores for each feature

suitability = pd.DataFrame()

# Features with lower values more suitable:
lower_better = {
    'Regional Electricity Demand':1,
    'State Emissions Level':1,
    'SAIFI Major Events':1,
    'CAIDI w/ Major Events':1,
    'CAIDI w/o Major Events':1,
    'Commercial Generation Rank':1
}

higher_better = {
    'IXP_Count':1,
    'Utility Annual Energy Savings (MWh)':1,
    'Commercial Generation Rank':1,
    'Total Generation Rank':1
}

# Calculate feature scores:
for col in lower_better:
    if col in numerical_cols:  # invert normalized values
        suitability[col] = 1 - (min_features[col] - min_features[col].min()) / (min_features[col].max() - min_features[col].min())
    else:
        # For encoded categorical variables, invert the order
        max_val = min_features[col].max()
        suitability[col] = max_val - min_features[col]

for col in higher_better:
    if col in numerical_cols:  # normalize values
        suitability[col] = (min_features[col] - min_features[col].min()) / (min_features[col].max() - min_features[col].min())
    else:
        # For encoded categorical variables, use values directly
        suitability[col] = min_features[col]

overall_score = suitability.mean(axis=1)

overall_score.head()


60    0.939449
61    0.939449
62    0.939449
63    0.939449
64    0.939449
dtype: float64

In [15]:
# Create a column that labels scores above .70 as 'Suitable' and below .70 as 'Unsuitable'
overall_score = pd.to_numeric(overall_score)
overall_score = pd.DataFrame(overall_score, columns=['Overall_Score'])
overall_score['Suitability'] = np.where(overall_score['Overall_Score'] > 0.70, 'Suitable', 'Unsuitable')
overall_score.head()


Unnamed: 0,Overall_Score,Suitability
60,0.939449,Suitable
61,0.939449,Suitable
62,0.939449,Suitable
63,0.939449,Suitable
64,0.939449,Suitable


In [16]:
# Add the suitability column to the min_features dataframe
min_features['Suitability'] = overall_score['Suitability']
min_features.head()


Unnamed: 0,State,County,IXP_Count,Region,Regional Electricity Demand,State Emissions Level,Utility Annual Energy Savings (MWh),SAIFI Major Events,CAIDI w/o Major Events,CAIDI w/ Major Events,Commercial Generation Rank,Total Generation Rank,Suitability
60,0,112,-0.354132,9,0,1,0,1,0.201065,-0.479944,0,2,Suitable
61,0,113,-0.354132,9,0,1,0,1,0.201065,-0.479944,0,2,Suitable
62,0,120,-0.354132,9,0,1,0,1,0.201065,-0.479944,0,2,Suitable
63,0,121,-0.354132,9,0,1,0,1,0.201065,-0.479944,0,2,Suitable
64,0,132,-0.354132,9,0,1,0,1,0.201065,-0.479944,0,2,Suitable


In [17]:
min_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6766 entries, 60 to 9556
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                6766 non-null   int64  
 1   County                               6766 non-null   int64  
 2   IXP_Count                            6766 non-null   float64
 3   Region                               6766 non-null   int64  
 4   Regional Electricity Demand          6766 non-null   int8   
 5   State Emissions Level                6766 non-null   int8   
 6   Utility Annual Energy Savings (MWh)  6766 non-null   int8   
 7   SAIFI Major Events                   6766 non-null   int8   
 8   CAIDI w/o Major Events               6766 non-null   float64
 9   CAIDI w/ Major Events                6766 non-null   float64
 10  Commercial Generation Rank           6766 non-null   int8   
 11  Total Generation Rank             

# Prepare data for CatBoost

In [18]:
X = min_features.drop(columns=['Suitability'])
y = min_features['Suitability']



# Split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [19]:
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    cat_features=[col for col in categorical_cols.keys()],
    random_state=42
)

model.fit(X_train, y_train)

0:	learn: 0.4106085	total: 95.9ms	remaining: 1m 35s
1:	learn: 0.2379697	total: 111ms	remaining: 55.4s
2:	learn: 0.1511901	total: 139ms	remaining: 46.1s
3:	learn: 0.0994435	total: 188ms	remaining: 46.8s
4:	learn: 0.0580771	total: 201ms	remaining: 40s
5:	learn: 0.0410094	total: 218ms	remaining: 36.1s
6:	learn: 0.0277700	total: 232ms	remaining: 32.9s
7:	learn: 0.0203132	total: 254ms	remaining: 31.5s
8:	learn: 0.0145334	total: 277ms	remaining: 30.5s
9:	learn: 0.0119190	total: 292ms	remaining: 28.9s


10:	learn: 0.0094026	total: 318ms	remaining: 28.6s
11:	learn: 0.0074242	total: 339ms	remaining: 27.9s
12:	learn: 0.0062874	total: 355ms	remaining: 26.9s
13:	learn: 0.0050192	total: 373ms	remaining: 26.3s
14:	learn: 0.0041216	total: 393ms	remaining: 25.8s
15:	learn: 0.0039060	total: 402ms	remaining: 24.7s
16:	learn: 0.0033177	total: 422ms	remaining: 24.4s
17:	learn: 0.0029157	total: 440ms	remaining: 24s
18:	learn: 0.0025640	total: 454ms	remaining: 23.4s
19:	learn: 0.0022536	total: 471ms	remaining: 23.1s
20:	learn: 0.0020332	total: 491ms	remaining: 22.9s
21:	learn: 0.0018902	total: 507ms	remaining: 22.5s
22:	learn: 0.0016482	total: 523ms	remaining: 22.2s
23:	learn: 0.0014935	total: 539ms	remaining: 21.9s
24:	learn: 0.0013515	total: 567ms	remaining: 22.1s
25:	learn: 0.0012138	total: 593ms	remaining: 22.2s
26:	learn: 0.0011663	total: 615ms	remaining: 22.2s
27:	learn: 0.0011189	total: 637ms	remaining: 22.1s
28:	learn: 0.0010389	total: 655ms	remaining: 21.9s
29:	learn: 0.0009579	total: 667ms

<catboost.core.CatBoostClassifier at 0x12cc6afd0>

In [20]:

from sklearn.metrics import classification_report

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1597
           1       0.99      1.00      0.99        95

    accuracy                           1.00      1692
   macro avg       0.99      1.00      1.00      1692
weighted avg       1.00      1.00      1.00      1692

