In [81]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.impute import SimpleImputer

In [None]:
# Load the two raw training tables from the current working directory.
train_identity = pd.read_csv(filepath_or_buffer='train_identity.csv')
train_transaction = pd.read_csv(filepath_or_buffer='train_transaction.csv')
# Debug aid: uncomment the two lines below to work on a 500-row sample.
# train_identity = train_identity.head(n=500)
# train_transaction = train_transaction.head(n=500)

In [82]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

class Preprocessor:
    """Merge the transaction and identity tables and clean the feature matrix.

    Attributes:
        X: feature columns (every column after the first one of the merged
            table, once ``TransactionID`` has been removed).
        y: the first remaining column of the merged table.
            NOTE(review): presumably the ``isFraud`` label -- confirm against
            the CSV column order.
    """

    def __init__(self, df_identity: pd.DataFrame, df_transactions: pd.DataFrame):
        """Left-join identity rows onto transactions by ``TransactionID``.

        Args:
            df_identity: table holding a ``TransactionID`` column plus
                identity features.
            df_transactions: table holding a ``TransactionID`` column, the
                target as its next column, then transaction features.
        """
        # ``DataFrame.join(..., on=key)`` matches the caller's key column
        # against the *index* of the other frame, which here is just a row
        # counter.  ``merge`` matches key column to key column as intended.
        df = df_transactions.merge(
            df_identity,
            how='left',
            on='TransactionID',
            suffixes=('_left', '_right'),
        )
        df = df.drop(columns=['TransactionID'])
        self.X = df.iloc[:, 1:]
        self.y = df.iloc[:, 0]

    @property
    def n(self) -> int:
        """Number of feature columns currently held in ``X``."""
        return self.X.shape[1]

    @staticmethod
    def _to_numeric_if_possible(column: pd.Series) -> pd.Series:
        """Return *column* converted to a numeric dtype, or unchanged if it cannot be."""
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    def preprocess(self, weights, threshold, *, corr_cutoff: float = 0.75) -> pd.DataFrame:
        """Impute, encode, de-correlate and scale ``X``.

        Steps, in order: drop all-NaN columns, fill remaining gaps with each
        column's most frequent value, restore numeric dtypes, drop
        ``TransactionDT``, one-hot encode the remaining non-numeric columns,
        drop one column of every pair whose absolute correlation exceeds
        ``corr_cutoff``, min-max scale, and write the result to
        ``edited_train.csv``.

        Args:
            weights: currently unused.
            threshold: currently unused.
                NOTE(review): these two look like the per-feature weights and
                selection cut-off of the optimizer -- confirm and implement.
            corr_cutoff: absolute correlation above which the later column of
                a pair is treated as redundant.

        Returns:
            The processed feature matrix, which is also stored in ``self.X``.
        """
        # A column without a single observed value cannot be imputed.
        features = self.X.dropna(axis=1, how='all')
        print(int(features.isna().any().sum()))

        imputer = SimpleImputer(strategy='most_frequent')
        features = pd.DataFrame(
            imputer.fit_transform(features), columns=features.columns
        )
        print(int(features.isna().any().sum()))

        # The imputer may hand back an object-typed array for mixed input, so
        # numeric dtypes are restored column by column.  (Row-wise conversion
        # is far slower and ``errors='ignore'`` is deprecated in pandas.)
        features = features.apply(self._to_numeric_if_possible)

        # ``errors='ignore'`` keeps a repeated call from failing once the
        # column has already been removed.
        features = features.drop(columns='TransactionDT', errors='ignore')
        features = pd.get_dummies(features, dtype=float)
        print(features.shape)

        # Only the strict upper triangle is inspected, so for each correlated
        # pair the first column is kept and the later one is dropped instead
        # of discarding both.
        corr = features.corr().abs()
        upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
        redundant = [
            name for name in upper.columns if (upper[name] > corr_cutoff).any()
        ]
        print(len(redundant))
        features = features.drop(columns=redundant)
        print(features.shape)

        scaler = MinMaxScaler()
        features = pd.DataFrame(
            scaler.fit_transform(features), columns=features.columns
        )

        features.to_csv('edited_train.csv', index=False)
        self.X = features
        return features
        
# Build the preprocessor from the two loaded tables and run the pipeline once.
proc = Preprocessor(df_identity=train_identity, df_transactions=train_transaction)
proc.preprocess(weights=1, threshold=2)
# train_transaction.shape

In [83]:
from sklearn.tree import DecisionTreeClassifier

class Estimator:
    """Scoring interface used by the optimizer; currently an unimplemented stub."""

    def __init__(self):
        pass

    @staticmethod
    def get_score(X, y) -> float:
        """Score the feature matrix ``X`` against the targets ``y``.

        Declared as a static method so that it can be called on an instance
        (``estimator.get_score(X, y)``) as well as on the class; without it
        the instance call fails because no ``self`` parameter is declared.

        NOTE: placeholder -- no scoring is implemented yet, so this returns
        ``None`` rather than the annotated ``float``.
        """
        pass

In [86]:
class GreyWolfOptimizer:
    """Grey-wolf style search over per-feature weight vectors.

    Each individual is a 1-D array holding one weight per feature.  On every
    epoch the population is ranked by ``_calculate_cost`` (highest first) and
    every individual is moved towards the mean of the positions proposed by
    the top three individuals.
    """

    def __init__(self, estimator: "Estimator", preprocessor: "Preprocessor"):
        """Store the scoring estimator and the data-providing preprocessor."""
        self.estimator = estimator
        self.preprocessor = preprocessor

    def _init_population(self, features_n, population_size):
        """Return ``population_size`` random weight vectors of length ``features_n``.

        Weights are drawn uniformly from ``[0, 1)``.
        """
        return [
            np.random.random(size=features_n) for _ in range(population_size)
        ]

    def _feature_count(self):
        """Return the number of features exposed by the preprocessor.

        Uses the preprocessor's ``n`` attribute when it has one and otherwise
        falls back to the column count of its ``X`` matrix.
        """
        count = getattr(self.preprocessor, 'n', None)
        if count is None:
            count = self.preprocessor.X.shape[1]
        return int(count)

    def run(self, iterations = 50, threshold = 0.5, population_size = 10):
        """Run the search and return the best weight vector found.

        Args:
            iterations: number of epochs.
            threshold: forwarded unchanged to ``_calculate_cost``.
            population_size: number of individuals; must be at least 1.

        Returns:
            The top-ranked individual after the last epoch, as a 1-D array
            with one entry per feature.

        Raises:
            ValueError: if ``population_size`` is smaller than 1.
        """
        if population_size < 1:
            raise ValueError("population_size must be at least 1")

        n = self._feature_count()
        population = self._init_population(
            population_size=population_size, features_n=n
        )

        for epoch in tqdm(range(iterations)):
            # ``a`` shrinks linearly from 2 towards 0 over the epochs.
            a = 2 - epoch * (2 / iterations)
            population = sorted(
                population,
                key=lambda w: self._calculate_cost(w, threshold),
                reverse=True,
            )
            leaders = population[:3]  # alpha, beta, delta

            new_population = []
            for individual in population:
                # One (A, C) pair is drawn per individual and shared by all
                # leaders.
                r1 = np.random.random(size=n)
                r2 = np.random.random(size=n)
                A = 2 * a * r1 - a
                C = 2 * r2

                moved = np.zeros(n)
                for leader in leaders:
                    D = np.abs(C * leader - individual)
                    moved = moved + (leader - A * D)
                # Average over the leaders that actually exist; a population
                # smaller than three has fewer than three of them.
                new_population.append(moved / len(leaders))

            population = new_population

        # Best solution after the final epoch.
        population = sorted(
            population,
            key=lambda w: self._calculate_cost(w, threshold),
            reverse=True,
        )
        return population[0]

    def _calculate_cost(self, weights, threshold):
        """Return the estimator's score for the given weights.

        The preprocessor is asked for its processed features; if its
        ``preprocess`` works in place and returns ``None``, the matrix stored
        in ``preprocessor.X`` is used instead.
        """
        X = self.preprocessor.preprocess(weights, threshold=threshold)
        if X is None:
            X = self.preprocessor.X
        y = self.preprocessor.y

        return self.estimator.get_score(X, y)
    