# In [1]:
import os
import yaml
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score, make_scorer

def filter_feature_selection(X_train, X_valid, y_train, y_valid, method, rand, model=None):
    """Score a classifier on the k best features for k = 1 .. n_features - 1.

    For every ``k`` the ``k`` best columns are picked with ``filtering``
    (fitted on the training split only), a classifier is fitted through
    ``train`` and its F1 scores on both splits are recorded.

    Args:
        X_train: Training features; must expose ``.shape`` and ``.columns``.
        X_valid: Validation features, addressed by the same column positions.
        y_train: Training labels.
        y_valid: Validation labels.
        method: Score function handed to ``filtering`` (e.g. ``f_classif``).
        rand: Random seed forwarded to ``train``.
        model: Classifier class forwarded to ``train``. When omitted, the
            module-level ``model`` variable is used if it exists (what callers
            relied on before this parameter was added); otherwise
            ``RandomForestClassifier`` is used, so the function also works
            when the module is imported rather than run as a script.

    Returns:
        Tuple ``(f1_train_all, f1_valid_all, column_names)`` of three lists
        with one entry per ``k``: training F1, validation F1 and the selected
        column labels of ``X_train``.
    """
    if model is None:
        model = globals().get("model", RandomForestClassifier)

    f1_train_all = []
    f1_valid_all = []
    column_names = []
    # NOTE(review): the range stops at n_features - 1, so the full feature set
    # is never scored -- confirm that this is intended.
    for k in range(1, X_train.shape[1]):
        selected_idx = filtering(X_train, y_train, method, k)
        train_columns = X_train.columns[selected_idx]
        # The validation split is addressed by position, exactly like the
        # training split, so both frames must share the same column order.
        valid_columns = X_valid.columns[selected_idx]
        f1_train, f1_valid = train(
            X_train[train_columns], y_train,
            X_valid[valid_columns], y_valid,
            model, rand,
        )
        f1_train_all.append(f1_train)
        f1_valid_all.append(f1_valid)
        column_names.append(train_columns)
    return f1_train_all, f1_valid_all, column_names

def filtering(X,y,method,nf):
    """Return the positional indices of the ``nf`` best-scoring columns of ``X``.

    Args:
        X: Feature matrix the selector is fitted on.
        y: Target labels.
        method: Univariate score function passed to ``SelectKBest``.
        nf: Number of features to keep.

    Returns:
        Integer indices of the selected columns, as produced by
        ``SelectKBest.get_support(indices=True)``.
    """
    selector = SelectKBest(score_func=method, k=nf)
    selector.fit(X, y)
    return selector.get_support(indices=True)
    
def mean_diff(f1_valid_all):
    """Return the average change between consecutive scores.

    Args:
        f1_valid_all: Sequence of scores, one per step.

    Returns:
        The mean of ``score[i + 1] - score[i]`` over all adjacent pairs.

    Raises:
        ZeroDivisionError: If fewer than two scores are supplied, because
            there is no step to average.
    """
    scores = np.array(f1_valid_all)
    steps = scores[1:] - scores[:-1]
    return sum(steps) / len(steps)

def train(X,y,X_t,y_t,model,rand):
    """Fit a classifier and return its F1 score on the training and test data.

    Args:
        X: Training features.
        y: Training labels.
        X_t: Features of the held-out (validation/test) set.
        y_t: Labels of the held-out set.
        model: Classifier class; it is instantiated with ``max_depth=10``,
            ``n_jobs=10`` and ``random_state=rand``.
        rand: Random seed for the classifier.

    Returns:
        Tuple ``(f1_train, f1_valid)``: F1 on ``(X, y)`` and on ``(X_t, y_t)``.
    """
    classifier = model(max_depth=10, n_jobs=10, random_state=rand)
    classifier.fit(X, y)
    y_pred_train = classifier.predict(X)
    y_pred = classifier.predict(X_t)
    # f1_score is documented as f1_score(y_true, y_pred); the ground truth goes
    # first. With the default binary averaging F1 is symmetric in its two
    # arguments, so the reported scores do not change.
    f1_train = f1_score(y, y_pred_train)
    f1_valid = f1_score(y_t, y_pred)
    return f1_train, f1_valid

def select_WT_fault_count_bigger_than_threshold(df,numbercolumn,labelcolumn,threshold):
    """Return the ``numbercolumn`` values whose summed ``labelcolumn`` exceeds ``threshold``.

    Rows are grouped by the value found in ``numbercolumn``; a value is kept
    when the sum of ``labelcolumn`` over its rows is strictly greater than
    ``threshold``. Only the sum is implemented; the original note suggests
    that mean, median or a user-supplied function could be accepted as well.

    Args:
        df: DataFrame holding both columns.
        numbercolumn: Name of the column that identifies each unit.
        labelcolumn: Name of the column that is summed per unit.
        threshold: Exclusive lower bound for the per-unit sum.

    Returns:
        List of qualifying ``numbercolumn`` values, in order of first
        appearance in ``df``.
    """
    unit_ids = df[numbercolumn]
    return [
        unit
        for unit in unit_ids.unique()
        if df.loc[unit_ids == unit, labelcolumn].sum() > threshold
    ]

def select_feature_performance_increase_bigger_than_threshold(f1_train_all,f1_valid_all,column_names,step_threshold):
    """Keep the features whose addition raised the score by more than a threshold.

    The result starts with the feature set of the first step. For every later
    step ``i`` whose score exceeds the previous step's score by more than
    ``step_threshold``, the labels that are in ``column_names[i]`` but not in
    ``column_names[i - 1]`` are appended.

    NOTE(review): the comparison uses the *training* scores; ``f1_valid_all``
    is accepted but not used -- confirm whether validation scores were meant.

    Args:
        f1_train_all: Training F1 score per step.
        f1_valid_all: Validation F1 score per step (currently unused).
        column_names: Sequence of ``pandas.Index`` objects, one per step,
            holding the features selected at that step.
        step_threshold: Minimum score increase between consecutive steps for
            the newly added features to be kept.

    Returns:
        ``pandas.Index`` with the labels of the retained features.
    """
    scores = np.asarray(f1_train_all)
    improved = scores[1:] > scores[:-1] + step_threshold

    selected = column_names[0]
    # +1 converts positions in the pairwise comparison back to step indices.
    for step in np.flatnonzero(improved) + 1:
        # errors="ignore": consecutive feature sets are not guaranteed to be
        # nested, and a label missing from the newer set must not raise.
        added = column_names[step].drop(column_names[step - 1], errors="ignore")
        # Skip labels that were already retained at an earlier step.
        added = added[~added.isin(selected)]
        selected = selected.append(added)
    return selected

def load_config(config_name):
    """Read a YAML configuration file located under ``CONFIG_PATH``.

    ``CONFIG_PATH`` is a module-level name that must be defined before this
    function is called; in this file it is assigned inside the ``__main__``
    block.

    Args:
        config_name: File name of the YAML file, relative to ``CONFIG_PATH``.

    Returns:
        The parsed document as returned by ``yaml.safe_load``.
    """
    config_file_path = os.path.join(CONFIG_PATH, config_name)
    with open(config_file_path, 'r', encoding='utf8') as stream:
        return yaml.safe_load(stream)

if __name__ == '__main__':

    CONFIG_PATH = "."
    # One seed for both the train/validation split and the classifier, so the
    # selected feature set is reproducible from run to run.
    RANDOM_SEED = 42

    config = load_config('importcsv.yaml')
    processed_path=config["Upload_list_all"]['Parent_path']+"data/03_processed/"

    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load artefacts that were produced by this project.
    datasets = {}
    for dataset_name in ('X_trains', 'y_trains', 'X_trains_list', 'y_trains_list'):
        with open(processed_path + dataset_name, 'rb') as dataset_file:
            datasets[dataset_name] = pickle.load(dataset_file)
    X_trains = datasets['X_trains']
    y_trains = datasets['y_trains']
    X_trains_list = datasets['X_trains_list']
    y_trains_list = datasets['y_trains_list']

    methods = [chi2, f_classif]
    method_names = ['chi2','f_classif']
    model = RandomForestClassifier

    # Without random_state the split differed on every run, which made the
    # selected features non-reproducible despite the seeded classifier.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_trains, y_trains, test_size=0.2, shuffle=True, random_state=RANDOM_SEED
    )

    # Split into train and validation sets, then score the classifier for the
    # chosen feature-selection method at every feature count.
    f1_train_all,f1_valid_all,column_names=filter_feature_selection(X_train, X_valid, y_train, y_valid,f_classif,RANDOM_SEED)
    # The step threshold below is fixed; mean_diff(f1_valid_all) can be used to
    # derive a data-driven threshold instead.

    # Based on the results, keep the features whose score gain exceeds the threshold.
    selectedfeatures=select_feature_performance_increase_bigger_than_threshold(f1_train_all,f1_valid_all,column_names,0.005)

    with open(processed_path + 'selectedfeatures', 'wb') as output_file:
        pickle.dump(selectedfeatures, output_file)

