In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.impute import SimpleImputer

In [None]:
# Read the two raw training tables from the current working directory.
train_identity = pd.read_csv(filepath_or_buffer='train_identity.csv')
train_transaction = pd.read_csv(filepath_or_buffer='train_transaction.csv')
# For quick experiments, uncomment to keep only the first 500 rows of each table:
# train_identity = train_identity.head(n=500)
# train_transaction = train_transaction.head(n=500)

## Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# Absolute Pearson correlation above which a feature is treated as redundant.
CORR_THRESHOLD = 0.75


def _to_numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed.

    Replaces the deprecated ``pd.to_numeric(..., errors='ignore')`` behaviour.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# `DataFrame.join(on=...)` matches the caller's column against the *index* of
# the other frame, which here is just the default RangeIndex from `read_csv`,
# not the transaction id. Merge on the shared key column instead.
df = train_transaction.merge(
    train_identity, how='left', on='TransactionID', suffixes=('_left', '_right')
)
df = df.drop(columns=['TransactionID'])
# Free the raw tables; re-run the loading cell before re-running this one.
del train_identity
del train_transaction

# NOTE(review): assumes the target is the first remaining column and every
# other column is a feature - confirm against the dataset schema.
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

# Columns with no observed value at all cannot be imputed, so drop them.
X_train_na_cols = X.isna().all()
X = X.loc[:, ~X_train_na_cols]

# Number of columns with missing values before imputation.
X_train_na_cols = X.isna().any()
print(X_train_na_cols.sum())
# NOTE(review): the imputer (and the scaler below) are fitted on the full data
# set, so statistics leak across any later cross-validation folds.
imputer = SimpleImputer(strategy='most_frequent')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
# Number of columns with missing values after imputation (expected: 0).
X_train_na_cols = X.isna().any()
print(X_train_na_cols.sum())

# The imputer returns an object array, so restore numeric dtypes column by
# column. Doing this per row (axis=1) leaves every column as `object` as soon
# as a row holds a single string, and `get_dummies` would then one-hot encode
# numeric features as well.
X = X.apply(_to_numeric_if_possible)
X = X.drop(columns='TransactionDT')
X = pd.get_dummies(X, dtype=float)
print(X.shape)

# Drop redundant features. Only the strict upper triangle is inspected, so a
# column is removed when it correlates with an *earlier* column and the first
# member of each correlated group is kept (instead of dropping both members).
corr = X.corr()
upper = corr.abs().where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
m = (upper > CORR_THRESHOLD).any()
print(m.sum())
X = X.drop(columns=X.columns[m.to_numpy()])
print(X.shape)

# Rescale every feature to [0, 1] and persist the result.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X.to_csv('edited_train.csv', index=False)
X.head()

In [82]:
class Encoder:
    """Hold the feature table and target, and build the model input for a feature mask."""

    def __init__(self,X, y):
        """Store the data.

        Args:
            X: pandas DataFrame of candidate features (one column per feature).
            y: target values aligned with the rows of ``X``.
        """
        self.X = X
        self.y = y
        # Number of candidate features; GreyWolfOptimizer.run reads this
        # attribute to size its position vectors.
        self.n = X.shape[1]

    def encode(self, features_vector) -> pd.DataFrame:
        """Return the selected columns of ``X`` with categorical columns one-hot encoded.

        Args:
            features_vector: sequence with one entry per column of ``X``;
                entries that are truthy after conversion to bool select the
                corresponding column.

        Returns:
            DataFrame produced by ``pd.get_dummies(..., dtype=float)`` on the
            selected columns.
        """
        selected = np.array(features_vector).astype(bool)
        return pd.get_dummies(self.X.iloc[:, selected], dtype=float)
        
        
# Wrap the feature table and target so the optimizer can request feature subsets.
proc = Encoder(X=X, y=y)

In [83]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

class Estimator:
    """Score and fit freshly created models produced by ``model_generator``."""

    def __init__(self,cv, model_generator):
        """Store the evaluation settings.

        Args:
            cv: cross-validation setting forwarded to ``cross_val_score``.
            model_generator: zero-argument callable returning a new, unfitted model.
        """
        self.cv = cv
        self.model_generator = model_generator

    def get_score(self,X, y) -> float:
        """Return the mean cross-validated balanced accuracy of a new model.

        ``cross_val_score`` yields one score per fold; the folds are averaged
        so the result is a single float, as the annotation promises, and can
        be used directly as a sort key.
        """
        fold_scores = cross_val_score(
            self.model_generator(), X, y, cv=self.cv, scoring='balanced_accuracy'
        )
        return float(np.mean(fold_scores))

    def fit(self, X, y):
        """Create a new model, fit it on ``X`` and ``y`` and return what ``fit`` returns."""
        return self.model_generator().fit(X,y)

In [86]:
class GreyWolfOptimizer:
    """Feature selection with a Grey Wolf Optimizer.

    Each individual is a real-valued vector with one weight per column of
    ``encoder.X``; a feature is selected when its weight is ``>= threshold``.
    The fitness of an individual is the estimator's score on the selected
    features.

    Attributes set by ``run``:
        train_score_history_: best fitness per epoch, plus the final best.
        fitted_model_: result of ``estimator.fit`` on the final best subset.
        features_n_: number of features in the final best subset.
    """

    # Annotations are strings so the class can be defined before (or without)
    # the cells that define Estimator and Encoder.
    def __init__(self,estimator: 'Estimator', preprocessor: 'Encoder'):
        self.estimator = estimator
        self.encoder = preprocessor
        self.train_score_history_ = []
        self.fitted_model_ = None
        self.features_n_ = None

    def _init_population(self, features_n, population_size):
        """Return ``population_size`` random weight vectors of length ``features_n`` in [0, 1)."""
        return [np.random.random(size=features_n) for _ in range(population_size)]

    def _rank(self, population, threshold):
        """Evaluate each individual once; return (population sorted best-first, best score)."""
        scores = [self._calculate_cost(individual, threshold) for individual in population]
        # Stable descending sort on the precomputed scalar scores.
        order = sorted(range(len(population)), key=scores.__getitem__, reverse=True)
        return [population[k] for k in order], scores[order[0]]

    def run(self, iterations = 50, threshold = 0.5, population_size = 10):
        """Run the optimizer and fit a model on the best feature subset found.

        Args:
            iterations: number of epochs.
            threshold: weights greater than or equal to this select a feature.
            population_size: number of individuals (at least 1).

        Raises:
            ValueError: if ``population_size`` is below 1, or the best final
                individual selects no feature.
        """
        if population_size < 1:
            raise ValueError("population_size must be at least 1")

        # One weight per column of the encoder's feature table.
        n = self.encoder.X.shape[1]
        population = self._init_population(population_size=population_size,features_n=n)

        for i in tqdm(range(iterations)): #Epochs
            # `a` decreases linearly from 2 towards 0 over the epochs.
            a = 2 - i * (2 / iterations)
            population, best_score = self._rank(population, threshold)
            self.train_score_history_.append(best_score)
            leaders = population[:3] # alpha, beta, delta
            new_population = []
            for individual in population:
                # NOTE(review): the same A and C are reused for all leaders;
                # the published algorithm appears to draw them per leader -
                # confirm whether sharing is intended.
                r1 = np.random.random(size=n)
                r2 = np.random.random(size=n)
                A = 2 * a * r1 - a
                C = 2 * r2
                moved = np.zeros(n)
                for leader in leaders:
                    D = np.abs(C * leader - individual)
                    moved = moved + (leader - A * D)
                # Average over the leaders actually present (fewer than three
                # when the population is that small).
                new_population.append(moved / len(leaders))

            population = new_population

        # Best solution after n iterations
        population, best_score = self._rank(population, threshold)
        self.train_score_history_.append(best_score)
        best_mask = np.asarray(population[0]) >= threshold
        if not best_mask.any():
            raise ValueError("the best individual selects no features; lower the threshold")
        self.fitted_model_ = self.estimator.fit(self.encoder.encode(best_mask),self.encoder.y)
        self.features_n_ = int(np.count_nonzero(best_mask))


    def _calculate_cost(self, weights, threshold):
        """Return the scalar fitness (higher is better) of the subset selected by ``weights``."""
        mask = np.asarray(weights) >= threshold
        if not mask.any():
            # An empty feature set cannot be scored; rank it below everything.
            return float('-inf')
        X = self.encoder.encode(mask)
        y = self.encoder.y

        # Reduce to one float so individuals can be ordered even when the
        # estimator returns one score per fold.
        return float(np.mean(self.estimator.get_score(X,y)))
    