# GDFS Measure

In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd

### 1. 함수 eigen(X) : data X의 공분산행렬의 eigenValue와 eigenVector return

In [8]:
def eigen(X):
    X_cen = X - X.mean(axis=0)  # 평균을 0으로
    X_cov = np.dot(X_cen.T, X_cen) / (len(X)-1)
    w, v = np.linalg.eig(X_cov)
    return w[0],v[0]

### 2. 함수 distance_min() : class I와 class J간의 최소거리를 구하는 함수

In [9]:
# I,J간의 최소 거리 구하는 distance_min 함수
def distance_min(I,J,total):
    
    # inter_dist
    ui =I.mean(axis=0)
    uj =J.mean(axis=0)
    inter_class_dist = ((len(I)+len(J))/total) * np.linalg.norm(ui-uj)
   
    # eigen value, vector each class
    valueI, vectorI = eigen(I)
    valueJ, vectorJ = eigen(J)
    
    # inter class center connect vector
    dij = uj - ui
    dji = ui - uj
    
    # cosine
    cosij = np.inner(dij,vectorI) / (np.linalg.norm(dij)*np.linalg.norm(vectorI))
    cosji = np.inner(dji,vectorJ) / (np.linalg.norm(dji)*np.linalg.norm(vectorJ))
    
    #intra-class variance
    intra_class_variance = 0.5*(math.sqrt(valueI)*abs(cosij) + math.sqrt(valueJ)*abs(cosji))
    intra_class_variance = intra_class_variance.item(0,0)
    
    return inter_class_dist - intra_class_variance


### 3. 함수 eveness() : 클래스 간 거리의 균등도 return 

In [12]:
#클래스간 거리의 균등도를 return
def eveness(dic,total):
    d = []
    for i in range(len(dic)-1):
        I=dic[i]
        J=dic[i+1]
        AVG_I =I.mean(axis=0)
        AVG_J =J.mean(axis=0)
        inter = ((len(I)+len(J))/total)*np.linalg.norm(AVG_I-AVG_J)
        d.append(inter)
    
    U=0
    avg = np.mean(d)
    for i in range(len(d)):
        U+=abs(d[i]-avg)
        
    c =len(dic)
    U = U/ (c*(c-1)/2)
    print(U)
    print(avg)
    return 2- (U/avg)
    

### 6. 함수 GDFS : dataframe과 feature subset, target 이름을 넣으면, GDFS값을 return

In [14]:
def gdfs(df,col,target):
    
    #feature subset + target list
    col.append(target)
    df = df[col]
    
    #DataFrame과 target column명을 전달하면 dic return
    dic = {}
    category = df[target].unique()
    for i in category:
        dic[i] = np.matrix(df[df[target]==i].drop(target,axis=1))
    
    # Evaluate 값 계산
    total = len(df)
    D =0
    for i in range(len(dic)-1):
        D+=(len(dic[i])+len(dic[i+1]))*(distance_min(dic[i],dic[i+1],total))
    
    D/=total
    E = eveness(dic,total)
    return D*E

### 실험

In [6]:
#if __name__ == "__main__":
data = load_iris()
df = pd.DataFrame(data.data)
result = pd.DataFrame(data.target)
result.rename(columns={0:'flower'},inplace=True)
df.rename(columns={0:'sepal_len',1:'sepal_wid',2:'petal_len',3:'petal_wid'},inplace=True)
total = pd.concat([df,result],axis=1)
total.head(3)

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,flower
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [7]:
total

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,flower
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [76]:
least(total,['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid'],'flower')

0.3526657523838271
1.5180228390422437
Distance= 1.5781776807335683
0.3405518420745204
1.5825590727225431
Distance= 1.4832379203629922
0.13148373160154805
0.8493060287057681
Distance= 0.7889706567810506
0.3465637714238859
1.4941793896023907
Distance= 1.719517358362173


''

In [62]:
SFS_GDFS(total,'flower')

['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid']
0.06177777777777808
0.5273333333333327
Distance= 0.02954058623924008
0.10088888888888885
0.28733333333333305
Distance= -0.05994144023080555
0.33466666666666645
1.3633333333333333
Distance= 1.2626508955697502
0.08444444444444432
0.5933333333333334
Distance= 0.5325974427691775
['sepal_len', 'sepal_wid', 'petal_wid']
0.33362569561502625
1.4652339736678437
Distance= 1.675567522899369
0.34807178287274115
1.3941117517715464
Distance= 1.330097252626444
0.3399460284661286
1.489547997400354
Distance= 1.5585928552497046
['sepal_wid', 'petal_wid']
0.3465637714238859
1.4941793896023907
Distance= 1.3307317393122298
0.3405518420745204
1.5825590727225431
Distance= 1.412966127200537
['sepal_wid']
0.35284274324600845
1.6095899916302796
Distance= 1.4227939293545044
  feature to add      gdfs
0      petal_len  2.215350
1      sepal_len  2.969618
2      petal_wid  2.521875
3      sepal_wid  2.533693


array([['petal_len'],
       ['sepal_len']], dtype=object)

# Sequential Forward Selection

In [63]:
def SFS_GDFS(df,target):
    
    # 단계별 선택되는 feature와 gdfs값 list []
    gdfs_by_feature = []
    chosen_features = []

    # feature list
    X = df.drop(target,axis=1).columns
    available_features = list(X)
    run = 0
    
    # Loop : feature가 모두 선택될 때 까지
    while len(available_features)> 0:

        run += 1
        print(available_features)
        # Reset best
        d = SFS(df,available_features,chosen_features,gdfs_by_feature,target)
        gdfs_by_feature = d['gdfs']
        chosen_features = d['chosen']
        available_features = d['available']
    

        
    # Put results in DataFrame
    results = pd.DataFrame()
    results['feature to add'] = chosen_features
    results['gdfs'] = gdfs_by_feature
    print(results)
    
    #  return max gdfs feature subset
    maxidx = results['gdfs'].idxmax()
    return np.array(results.loc[:maxidx,['feature to add']])
   

## 실험

In [64]:
import pandas as pd
total = pd.read_csv('creditcard.csv')

In [15]:
def sequentialFeatureSelection(df,available_features,chosen_features,gdfs_by_feature,target):
    
    # Reset best
    best_result = -1000000
    best_feature = ''

    # Loop : abailable_feature 하나씩 check
    for feature in available_features:

        # Create copy of already chosen features
        features_to_use = chosen_features.copy()
        features_to_use.append(feature)

        # gdfs restult of features_to_sure
        result = GDFS(df,features_to_use,target)

        # Update chosen feature and result if this feature is a new best
        if result > best_result:
            best_result = result
            best_feature = feature

    gdfs_by_feature.append(best_result)
    chosen_features.append(best_feature)
    available_features.remove(best_feature)
    
    d = {}
    
    d['chosen'] = chosen_features
    d['gdfs'] = gdfs_by_feature
    d['available'] = available_features
    
    return d


# Sequential Forward Floating Selection Algorithm

In [75]:
def least(df,chosen,target):
    # Reset best
    worst_result = 100000000
    worst_feature = ''

    # Loop : abailable_feature 하나씩 check
    for feature in chosen:

        # Create copy of already chosen features
        features_consider = chosen.copy()
        features_consider.remove(feature)

        # gdfs restult of features_to_sure
        result = GDFS(df,features_consider,target)

        # Update chosen feature and result if this feature is a new best
        if result > worst_result:
            worst_result = result
            worst_feature = feature

    return worst_feature
    

In [None]:
def SFFS(df,target):
    
    # 단계별 선택되는 feature와 gdfs값 list []
    gdfs_by_feature = []
    chosen_features = []

    # feature list
    X = df.drop(target,axis=1).columns
    available_features = list(X)
    run = 0
    
    # SFS로 3개 선택
    for c in rage(2):
        
        d = SFS(df,available_features,chosen_features,gdfs_by_feature,target)
        gdfs_by_feature = d['gdfs']
        chosen_features = d['chosen']
        available_features = d['available']
    
    #step1
    d = SFS(df,available_features,chosen_features,gdfs_by_feature,target)
        gdfs_by_feature = d['gdfs']
        chosen_features = d['chosen']
        available_features = d['available']
    
    #step2
    
    
    #step3
    
    
    
    
    
    # Put results in DataFrame
    results = pd.DataFrame()
    results['feature to add'] = chosen_features
    results['gdfs'] = gdfs_by_feature
    print(results)
    
    #  return max gdfs feature subset
    maxidx = results['gdfs'].idxmax()
    return np.array(results.loc[:maxidx,['feature to add']])
   

In [84]:
import seaborn as sns

ImportError: DLL load failed: 지정된 모듈을 찾을 수 없습니다.

In [78]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.datasets import make_moons

from sklearn.model_selection import train_test_split



In [86]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()



x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target,

                                                    stratify=cancer.target, random_state=0)

n_feature = cancer.data.shape[1]

index = np.arange(n_feature)



forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)

forest.fit(x_train, y_train)

forest.feature_importances_
plt.barh(index, forest.feature_importances_, align='center')

plt.yticks(index, cancer.feature_names)

plt.ylim(-1, n_feature)

plt.xlabel('feature importance', size=15)

plt.ylabel('feature', size=15)

plt.show()



NameError: name 'plt' is not defined

In [117]:
X = pd.DataFrame(cancer.data)

In [123]:
X 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [116]:
Y = pd.DataFrame(cancer.target)

In [125]:
X['target']=Y

In [126]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [127]:
r = SFS_GDFS(X,'target')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
0.0
5.316306379155437
Distance= 2.8240650056952337
0.0
3.6901437556154555
Distance= -0.19715350161067047
0.0
37.28997119602556
Distance= 20.458925760434333
0.0
515.5862190159085
Distance= 264.4736711390435
0.0
0.010420843507214214
Distance= -0.0026063129084520145
0.0
0.06510316117012842
Distance= 0.021234436347157992
0.0
0.11471709597272872
Distance= 0.055486356525069405
0.0
0.062272593837535015
Distance= 0.03713125021069687
0.0
0.018722967866391826
Distance= -0.007499457308519898
0.0
0.00018730061836055856
Distance= -0.006973028301006268
0.0
0.3250001942286348
Distance= 0.09619609970331988
0.0
0.009465489403308602
Distance= -0.526713412738965
0.0
2.3236079567676122
Distance= 0.6537505283855045
0.0
51.537257200993594
Distance= 16.437886974182163
0.0
0.0004158076211616714
Distance= -0.002559712168962028
0.0
0.010842918595740186
Distance= -0.0065264320449746685
0.0
0.01582727385

0.0
1009.0927476903877
Distance= 741.8899308555689
0.0
1009.0927476769812
Distance= 741.8899313274324
0.0
1009.092747677065
Distance= 741.8899314979934
0.0
1009.1225463725771
Distance= 809.829846881709
0.0
1009.1094340156242
Distance= 628.7837973853473
0.0
1009.0927478729146
Distance= 741.8899354268319
0.0
1009.0927659717352
Distance= 741.8899448777918
0.0
1009.0927877451793
Distance= 742.3031572916234
0.0
1009.0927534342869
Distance= 742.4608787615169
0.0
1009.0927490805029
Distance= 742.3032384751472
0.0
1009.092747749375
Distance= 741.889931633439
[0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 21, 24, 25, 26, 27, 28, 29]
0.0
1009.1365500817969
Distance= 808.6713770218041
0.0
1009.1292933803928
Distance= 628.8100729909465
0.0
1009.1225464263832
Distance= 808.6513176394494
0.0
1009.12254847263
Distance= 808.6513673697027
0.0
1009.1225528930993
Distance= 808.6513734802488
0.0
1009.1225482939868
Distance= 808.6513243836268
0.0
1009.1225465462672
Distance= 808.6513150348117

0.0
1009.1392878402005
Distance= 810.6256814679792
0.0
1009.1392832411643
Distance= 810.6313288066008
0.0
1009.1392814934737
Distance= 811.5830382905161
0.0
1009.1392813198037
Distance= 810.6312772725801
0.0
1009.1392813641784
Distance= 811.030281076863
0.0
1009.139281319872
Distance= 810.6271854637254
0.0
1009.1392813780384
Distance= 810.6272780058671
0.0
1009.1392814439033
Distance= 810.6215834936932
0.0
1009.1392813331985
Distance= 810.6313294037132
0.0
1009.1392813197925
Distance= 810.6313546908436
0.0
1009.1392813198764
Distance= 810.6271956493064
0.0
1009.1559668890043
Distance= 559.0721669501481
0.0
1009.139299613703
Distance= 810.2025965935081
0.0
1009.139321386143
Distance= 810.2292843115774
0.0
1009.1392870768328
Distance= 810.6257016954117
[1, 4, 6, 7, 9, 11, 14, 15, 16, 17, 18, 19, 21, 25, 26, 27]
0.0
1009.1460283894008
Distance= 807.415971564991
0.0
1009.139281547279
Distance= 811.5871261425993
0.0
1009.1392880138878
Distance= 810.6215977859024
0.0
1009.1392834148517
Dista

In [135]:
r.sort()

In [139]:
select_by_gdfs = []

In [140]:
for i in range(len(r)):
    select_by_gdfs.append(r[i][0])

In [142]:
select_by_gdfs.sort()

In [169]:
select_by_gdfs

[0, 2, 3, 5, 8, 10, 12, 13, 14, 17, 20, 22, 23, 24, 28, 29]

In [178]:
bool = select.get_support()


In [187]:
from sklearn.feature_selection import SelectFromModel
#from sklearn.ensemble import RandomForestClassifierfrom sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

n_feature = cancer.data.shape[1]

select = SelectFromModel(RandomForestClassifier(n_estimators=50,random_state=42),threshold = 'median')

select.fit(cancer.data, cancer.target)
bool = select.get_support()

select_by_forest = []
for i in range(len(bool)):
    if bool[i]:
        select_by_forest.append(i)
select_by_forest.sort()

select_by_forest

[0, 2, 3, 5, 6, 7, 10, 13, 20, 21, 22, 23, 25, 26, 27]

In [106]:
x_train

array([[1.231e+01, 1.652e+01, 7.919e+01, ..., 8.660e-02, 2.618e-01,
        7.609e-02],
       [1.754e+01, 1.932e+01, 1.151e+02, ..., 1.939e-01, 2.928e-01,
        7.867e-02],
       [1.049e+01, 1.861e+01, 6.686e+01, ..., 6.528e-02, 2.213e-01,
        7.842e-02],
       ...,
       [1.371e+01, 1.868e+01, 8.873e+01, ..., 1.284e-01, 2.849e-01,
        9.031e-02],
       [9.000e+00, 1.440e+01, 5.636e+01, ..., 1.389e-02, 2.991e-01,
        7.804e-02],
       [1.134e+01, 2.126e+01, 7.248e+01, ..., 8.278e-02, 2.829e-01,
        8.832e-02]])

In [193]:
rf = RandomForestClassifier(n_estimators=20, random_state=0)
rf.fit(cancer.data,cancer.target );
importance = rf.feature_importances_
len(importance)
import_by_random = pd.DataFrame(importance)
import_by_random = import_by_random.sort_values(by=0,ascending=False)


In [194]:
import_by_random

Unnamed: 0,0
27,0.219418
20,0.130657
7,0.112311
23,0.074015
13,0.069039
22,0.065756
3,0.044425
10,0.043831
6,0.04345
26,0.040815
