In [2]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [34]:
#read features
features = pd.read_csv('features_dataset_no_original_columns.csv')
features

Unnamed: 0,battleneturl,ratio_s,ratio_base,ratio_mineral,action_per_5_seconds,ratio_x0,ratio_x1,ratio_x2,Base,SingleMineral,...,hotkey52_f5,hotkey42_f5,hotkey22_f5,hotkey92_f5,hotkey02_f5,hotkey82_f5,hotkey72_f5,played_race_Protoss,played_race_Terran,played_race_Zerg
0,53,0.063965,0.006264,0.000475,6.990596,0.082717,0.000000,0.917283,0.039879,0.003021,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False
1,29,0.051058,0.001139,0.000000,6.384615,0.061559,0.001965,0.936477,0.007251,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False
2,53,0.040809,0.004460,0.000285,6.867347,0.049539,0.000000,0.950461,0.046535,0.002970,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False
3,29,0.037677,0.001993,0.000000,6.861386,0.059794,0.001031,0.939175,0.020896,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False
4,53,0.019740,0.000190,0.000000,5.820755,0.051345,0.000000,0.948655,0.003704,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3047,192,0.054285,0.000000,0.000380,12.346591,0.036273,0.000000,0.963727,0.000000,0.004545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True,False
3048,196,0.073550,0.000000,0.002752,14.739316,0.033245,0.000000,0.966755,0.000000,0.024681,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,False,True,False
3049,192,0.085413,0.000000,0.000475,12.995745,0.042306,0.000000,0.957694,0.000000,0.004274,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True,False
3050,196,0.042896,0.000000,0.003986,20.008547,0.019470,0.000000,0.980530,0.000000,0.072414,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,False,True,False


In [47]:
# Split the data into training and testing sets
X = features.drop(['battleneturl'], axis=1)
y = features['battleneturl']

### Remove features with low variance

In [35]:
# Normalizing all the features to the range [0, 1]
scaler = MinMaxScaler()
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

#removing features with low variance
selector = VarianceThreshold(threshold=0.01)
X_selected = selector.fit_transform(X_normalized)

In [None]:
# Get the original and selected columns
original_columns = X.columns
selected_columns = X.columns[selector.get_support()]

# Identify removed columns
removed_columns = [col for col in original_columns if col not in selected_columns]

In [50]:
print(f"{len(removed_columns)} columns removed")
print(f"removed columns: {removed_columns}")

39 columns removed
removed columns: ['ratio_mineral', 'ratio_x0', 'ratio_x2', 'Base', 'SingleMineral', 'hotkey00', 'hotkey01', 'hotkey02', 'hotkey20', 'hotkey21', 'hotkey30', 'hotkey31', 'hotkey40', 'hotkey41', 'hotkey50', 'hotkey51', 'hotkey60', 'hotkey61', 'hotkey70', 'hotkey71', 'hotkey72', 'hotkey80', 'hotkey81', 'hotkey82', 'hotkey90', 'hotkey91', 'hotkey92', 'hotkey32_f5', 'hotkey62_f5', 'SingleMineral_f5', 'hotkey11_f5', 'hotkey31_f5', 'hotkey51_f5', 'hotkey61_f5', 'hotkey71_f5', 'hotkey92_f5', 'hotkey02_f5', 'hotkey82_f5', 'hotkey72_f5']


### Remove highly correlated features

In [41]:
#create correlation matrix and remove features with correlation higher than 0.85
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.80)]
X_selected = X.drop(columns=to_drop)

In [None]:
corr_matrix

In [51]:
print(f"{len(to_drop)} columns removed")
print(f"removed columns: {to_drop}")

5 columns removed
removed columns: ['SingleMineral', 'hotkey80', 'hotkey90', 'hotkey31_f5', 'hotkey51_f5']


In [54]:
#return filtered dataframe
features_filtered = features.drop(columns=set(removed_columns + to_drop))
features_filtered.shape

(3052, 35)

In [55]:
features_filtered.to_csv('features_filtered.csv', index=False)