# Problem 1 
## Predicting Credit Approval 

This Problem has been solved in the following journal article using SOED: 

[Self-Organizing and Error Driven (SOED) artificial neural network for smarter](classifications[https://academic.oup.com/jcde/article/4/4/282/5729001)

We will be using ucimlrepo to import the sample data. Make sure to install it:

`pip install ucimlrepo`


In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
credit_approval = fetch_ucirepo(id=27) 
  
# data (as pandas dataframes) 
X = credit_approval.data.features 
y = credit_approval.data.targets 

print(credit_approval.variables) 


   name     role         type demographic description units missing_values
0   A16   Target  Categorical        None        None  None             no
1   A15  Feature   Continuous        None        None  None             no
2   A14  Feature   Continuous        None        None  None            yes
3   A13  Feature  Categorical        None        None  None             no
4   A12  Feature  Categorical        None        None  None             no
5   A11  Feature   Continuous        None        None  None             no
6   A10  Feature  Categorical        None        None  None             no
7    A9  Feature  Categorical        None        None  None             no
8    A8  Feature   Continuous        None        None  None             no
9    A7  Feature  Categorical        None        None  None            yes
10   A6  Feature  Categorical        None        None  None            yes
11   A5  Feature  Categorical        None        None  None            yes
12   A4  Feature  Categor

In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from soed import SOEDClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score

### Filling missing values

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A15     690 non-null    int64  
 1   A14     677 non-null    float64
 2   A13     690 non-null    object 
 3   A12     690 non-null    object 
 4   A11     690 non-null    int64  
 5   A10     690 non-null    object 
 6   A9      690 non-null    object 
 7   A8      690 non-null    float64
 8   A7      681 non-null    object 
 9   A6      681 non-null    object 
 10  A5      684 non-null    object 
 11  A4      684 non-null    object 
 12  A3      690 non-null    float64
 13  A2      678 non-null    float64
 14  A1      678 non-null    object 
dtypes: float64(4), int64(2), object(9)
memory usage: 81.0+ KB


In [4]:
# Filling Missing Values
variable_df = credit_approval.variables
for c,t in variable_df.set_index('name').drop(index=['A16']).type.to_dict().items():
    if t == 'Categorical':
        X.loc[:,c] = np.where(X[c].isna(),X[c].mode(),X[c])
    if t == 'Continuous':
        X.loc[:,c] = X[c].fillna(X[c].median())


In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A15     690 non-null    int64  
 1   A14     690 non-null    float64
 2   A13     690 non-null    object 
 3   A12     690 non-null    object 
 4   A11     690 non-null    int64  
 5   A10     690 non-null    object 
 6   A9      690 non-null    object 
 7   A8      690 non-null    float64
 8   A7      690 non-null    object 
 9   A6      690 non-null    object 
 10  A5      690 non-null    object 
 11  A4      690 non-null    object 
 12  A3      690 non-null    float64
 13  A2      690 non-null    float64
 14  A1      690 non-null    object 
dtypes: float64(4), int64(2), object(9)
memory usage: 81.0+ KB


### Binary Coding Categorical Attributes

In [6]:
X = pd.get_dummies(X)
dummy_vars = [c for c in X.columns if '_' in c]
X[dummy_vars] = X[dummy_vars].astype(int)

y = np.where(y=='+',1,0)[:,0]

In [7]:
X

Unnamed: 0,A15,A14,A11,A8,A3,A2,A13_g,A13_p,A13_s,A12_f,...,A6_w,A6_x,A5_g,A5_gg,A5_p,A4_l,A4_u,A4_y,A1_a,A1_b
0,0,202.0,1,1.25,0.000,30.83,1,0,0,1,...,1,0,1,0,0,0,1,0,0,1
1,560,43.0,6,3.04,4.460,58.67,1,0,0,1,...,0,0,1,0,0,0,1,0,1,0
2,824,280.0,0,1.50,0.500,24.50,1,0,0,1,...,0,0,1,0,0,0,1,0,1,0
3,3,100.0,5,3.75,1.540,27.83,1,0,0,0,...,1,0,1,0,0,0,1,0,0,1
4,0,120.0,0,1.71,5.625,20.17,0,0,1,1,...,1,0,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,0,260.0,0,1.25,10.085,21.08,1,0,0,1,...,0,0,0,0,1,0,0,1,0,1
686,394,200.0,2,2.00,0.750,22.67,1,0,0,0,...,0,0,1,0,0,0,1,0,1,0
687,1,200.0,1,2.00,13.500,25.25,1,0,0,0,...,0,0,0,0,1,0,0,1,1,0
688,750,280.0,0,0.04,0.205,17.92,1,0,0,1,...,0,0,1,0,0,0,1,0,0,1


### Standardizing Data

In [8]:
X  = (X - X.mean())/X.std()

In [9]:
X.describe()

Unnamed: 0,A15,A14,A11,A8,A3,A2,A13_g,A13_p,A13_s,A12_f,...,A6_w,A6_x,A5_g,A5_gg,A5_p,A4_l,A4_u,A4_y,A1_a,A1_b
count,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,...,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0
mean,1.0297720000000001e-17,4.8914170000000004e-17,1.0297720000000001e-17,1.029772e-16,1.0297720000000001e-17,2.059544e-16,-1.699124e-16,5.96946e-17,-2.9605950000000004e-17,-1.055516e-16,...,-7.465848e-17,4.633974e-17,-1.055516e-16,-1.5446580000000003e-17,-1.0297720000000001e-17,-1.5446580000000003e-17,-1.055516e-16,-1.0297720000000001e-17,-3.604202e-17,3.604202e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.1952717,-1.066043,-0.4935286,-0.6643947,-0.9559198,-1.497787,-3.098621,-0.1082276,-0.2998615,-1.08712,...,-0.3195125,-0.2412421,-1.782472,-0.0538773,-0.5557424,-0.0538773,-1.782472,-0.5557424,-0.6609584,-1.510762
25%,-0.1952717,-0.6014412,-0.4935286,-0.6150897,-0.7550425,-0.7456942,0.3222565,-0.1082276,-0.2998615,-1.08712,...,-0.3195125,-0.2412421,0.5602055,-0.0538773,-0.5557424,-0.0538773,0.5602055,-0.5557424,-0.6609584,-1.510762
50%,-0.194312,-0.1368388,-0.4935286,-0.3655762,-0.4035072,-0.2575087,0.3222565,-0.1082276,-0.2998615,0.9185289,...,-0.3195125,-0.2412421,0.5602055,-0.0538773,-0.5557424,-0.0538773,0.5602055,-0.5557424,-0.6609584,0.6609584
75%,-0.1193615,0.5136044,0.1233822,0.1200038,0.4919034,0.522197,0.3222565,-0.1082276,-0.2998615,0.9185289,...,-0.3195125,-0.2412421,0.5602055,-0.0538773,-0.5557424,-0.0538773,0.5602055,-0.5557424,1.510762,0.6609584
max,18.99821,10.54901,13.28414,7.851932,4.668645,4.10918,0.3222565,9.2264,3.33004,0.9185289,...,3.125232,4.139206,0.5602055,18.53379,1.796787,18.53379,0.5602055,1.796787,1.510762,0.6609584


### PCA Transformation

In [10]:
pca = PCA(n_components=23)  # Reduce to 23 dimensions
X_pca = pd.DataFrame(pca.fit_transform(X),columns = [f'PC{i}' for i in range(1,24)])

In [11]:
X_pca.describe()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23
count,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,...,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0
mean,1.647635e-16,-1.235726e-16,-1.029772e-16,-8.238177000000001e-17,4.1190880000000005e-17,9.911556000000001e-17,-2.0595440000000003e-17,7.208405e-17,1.647635e-16,-6.693519e-17,...,4.1190880000000005e-17,1.004028e-16,1.029772e-16,-9.267949e-17,-1.338704e-16,2.008056e-16,4.8914170000000004e-17,1.184238e-16,-1.415937e-17,3.346759e-17
std,2.196496,1.862298,1.748996,1.684505,1.504244,1.457849,1.37968,1.343024,1.304737,1.283267,...,1.137893,1.067338,1.048601,1.037334,1.016788,1.007276,0.994112,0.9486086,0.9289733,0.9104386
min,-5.046569,-6.308442,-1.120538,-7.846603,-6.167302,-3.338926,-4.867461,-5.576225,-5.247459,-3.535307,...,-3.139304,-3.359711,-3.191515,-2.783939,-2.785048,-4.337297,-2.880427,-5.648884,-2.666067,-7.059714
25%,-1.152886,-1.339565,-0.6039851,-1.193876,-0.8666296,-1.200434,-0.7433507,-0.7162327,-0.8883733,-0.7192713,...,-0.7549724,-0.8229073,-0.6985335,-0.3605853,-0.6224672,-0.5057239,-0.6895676,-0.5626868,-0.5076593,-0.5281457
50%,0.02907246,0.0618604,-0.2500572,-0.1836852,0.2253946,-0.03938136,-0.1528551,-0.09291286,-0.0297732,-0.06621645,...,-0.2140973,-0.02129418,-0.07182996,-0.008381566,-0.01217567,0.01393103,-0.0635097,-0.05657254,-0.03084555,-0.05557615
75%,1.932911,1.260647,0.2160807,0.8288204,0.963317,0.9405794,0.4895679,0.5040649,0.8234462,0.5412146,...,0.5806877,0.6599459,0.4136699,0.3884491,0.4169731,0.5345574,0.4835641,0.5030928,0.5233554,0.3885424
max,5.358202,6.930683,35.89772,5.150107,6.662834,6.712885,6.733803,4.452799,3.284663,8.377486,...,4.960387,3.183814,5.769187,2.854004,3.509009,6.549554,2.827021,3.757672,3.475318,5.441682


### Split Data

In [12]:
random_index = np.random.permutation(X.shape[0])

i = int(round(X.shape[0]*0.5))
train_index = random_index[:i]
test_index = random_index[i+1:]

X_train = X_pca.iloc[train_index]
X_test = X_pca.iloc[test_index]

y_train = y[train_index]
y_test = y[test_index]

In [13]:
X_train

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23
319,-3.818402,1.253278,-0.745384,-1.485147,0.403058,0.118361,-0.626777,-0.336934,-0.766951,0.123724,...,2.051077,0.336955,0.312030,0.057148,-0.116977,-0.406713,0.598810,-0.008667,-0.047658,-0.546951
68,2.305446,0.826142,-0.018667,-0.145838,0.273122,-1.297457,-0.323760,-0.277411,1.267905,0.887407,...,-0.950683,-0.434887,-1.231909,-1.791393,-1.529528,0.293403,-0.755722,-0.950488,-0.006728,0.741854
660,-0.675742,-1.912761,-0.890451,0.905831,0.534794,-1.539201,-0.089005,-1.385351,-0.457489,-0.063745,...,-1.599074,0.682600,0.938375,0.550767,0.847626,0.410895,-0.018804,-1.854228,-0.987449,1.085588
289,-4.367184,2.120010,0.253665,4.502956,2.737175,4.684014,6.419294,0.203836,-1.468523,7.212628,...,-0.759477,-0.233692,-0.176102,-0.053441,0.398543,-0.134862,0.560421,-0.062713,-0.492494,0.134955
629,-0.550341,-2.008806,-0.711960,1.774126,1.581353,-0.310105,-0.059132,-0.796550,-0.323565,-0.578901,...,-0.865268,-0.307157,-1.062103,-1.596323,-1.741382,-1.426797,-0.836375,-0.127458,1.039847,-0.257695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,-0.991791,-2.235608,-1.074706,0.489888,0.562132,-1.536512,-0.311659,-1.100688,0.479697,0.898466,...,1.267624,0.208981,0.439989,-0.033538,0.038056,0.115445,0.320360,-0.492689,-0.065562,-1.201527
109,1.999693,0.131123,-0.328796,0.191847,2.245578,0.729510,-0.305335,0.444804,-0.732215,-0.270720,...,1.976110,0.733730,0.578387,0.085900,-0.302815,0.291185,-0.715962,0.191674,0.231320,-1.127749
28,3.001116,1.650808,0.568177,0.001455,-2.900201,-1.431718,2.186589,-1.486112,-0.045664,-0.503765,...,0.145437,-0.086509,-0.151148,0.074154,-0.182555,0.085398,0.379197,-0.176012,-0.104479,0.614411
493,1.295472,-0.047184,0.249052,1.176115,0.354788,0.617684,1.149459,-0.615810,2.111838,-0.753865,...,1.205827,0.904645,0.316078,-0.364828,-0.666162,-0.023496,-0.055835,-0.488006,0.691453,-1.135268


### Model Trainig

In [14]:
soed = SOEDClassifier(mlp_max_iter=10000,som_input_len=X_train.shape[1])
out = soed.fit(X_train.values,y_train)
out

Model training complete.


<soed.SOEDClassifier at 0x114edcad0>

In [15]:
y_proba = soed.predict_proba(X_test.values)
y_pred = soed.predict(X_test.values)

recall = recall_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)
auc = roc_auc_score(y_test,y_proba[:,1])

performance = {'recall':recall,'precision':precision,'accuracy':accuracy,'auc':auc}

In [16]:
print(performance)

{'recall': 0.6772151898734177, 'precision': 0.8629032258064516, 'accuracy': 0.8023255813953488, 'auc': np.float64(0.8104498434735266)}


### Model Comparison

In [17]:
report_df = pd.DataFrame(index = [f'repeat{i}' for i in range(1,6)],
                         columns = ['DT','MLP','SOED'] )
report_df

Unnamed: 0,DT,MLP,SOED
repeat1,,,
repeat2,,,
repeat3,,,
repeat4,,,
repeat5,,,


In [18]:
for loop_i in range(1,6):
    print(loop_i)
    random_index = np.random.permutation(X.shape[0])

    i = int(round(X.shape[0]*0.7))
    train_index = random_index[:i]
    test_index = random_index[i+1:]
    
    X_train = X_pca.iloc[train_index]
    X_test = X_pca.iloc[test_index]
    
    y_train = y[train_index]
    y_test = y[test_index]

    #soed
    soed = SOEDClassifier(mlp_max_iter=10000,som_x=7,som_y=7,som_sigma=7,som_input_len=X_train.shape[1])
    soed.fit(X_train.values,y_train)
    y_proba = soed.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','SOED'] = auc

    #dt
    dt = DecisionTreeClassifier(max_depth=10)
    dt.fit(X_train.values,y_train)
    y_proba = dt.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','DT'] = auc

    #mlp
    mlp = MLPClassifier(max_iter=10000)
    mlp.fit(X_train.values,y_train)
    y_proba = mlp.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','MLP'] = auc

report_df.loc['Average'] = report_df.mean()

1
Model training complete.
2
Model training complete.
3
Model training complete.
4
Model training complete.
5
Model training complete.


In [19]:
report_df

Unnamed: 0,DT,MLP,SOED
repeat1,0.722929,0.894567,0.881791
repeat2,0.745813,0.875992,0.832403
repeat3,0.76714,0.908049,0.831155
repeat4,0.809021,0.883909,0.895613
repeat5,0.734783,0.924032,0.891018
Average,0.755937,0.89731,0.866396


We can see that SOED and MLP are performing very similarly. SOED will show its superiority in the decision-making problem. 

# Problem 2
## Deciding Credit Approval 

We must assume some data to use credit approval for a decision-making problem. We will assume that when we make a false negative, we will incur a 1000-dollar mistake cost; when we make a false positive, we will incur a 2-dollar cost of errors.


In [20]:
c0 = np.where(y==0,0,1000)
c1 = np.where(y==1,0,2)
c = np.column_stack((c0,c1))


random_index = np.random.permutation(X.shape[0])

i = int(round(X.shape[0]*0.5))
train_index = random_index[:i]
test_index = random_index[i+1:]

X_train = X_pca.iloc[train_index]
X_test = X_pca.iloc[test_index]

y_train = y[train_index]
y_test = y[test_index]

c_train = c[train_index,:]
c_test = c[test_index,:]

In [21]:
soed = SOEDClassifier(mlp_max_iter=10000,som_input_len=X_train.shape[1])
soed.fit(X_train.values,y_train,c_train)

Model training complete.


<soed.SOEDClassifier at 0x114edc2f0>

In [22]:
y_decide = soed.decide(X_test.values)

In [23]:
df = pd.DataFrame(np.column_stack((y_test,y_decide)),columns = ['Reality','Decision'])
df['FN_cost'] = np.where((df.Decision==0) & (df.Reality==1),1000,0)
df['FP_cost'] = np.where((df.Decision==1) & (df.Reality==0),2,0)
df['cost'] = df.FP_cost + df.FN_cost
df

Unnamed: 0,Reality,Decision,FN_cost,FP_cost,cost
0,0,0,0,0,0
1,1,1,0,0,0
2,0,0,0,0,0
3,0,1,0,2,2
4,0,0,0,0,0
...,...,...,...,...,...
339,1,1,0,0,0
340,1,1,0,0,0
341,1,1,0,0,0
342,1,1,0,0,0


In [24]:
total_cost = df.cost.sum()
print(f'total cost is {total_cost}.')

total cost is 19156.


In [25]:
df.cost.value_counts()

cost
0       247
2        78
1000     19
Name: count, dtype: int64

### Comparing Models

In [26]:
report_df = pd.DataFrame(index = [f'repeat{i}' for i in range(1,6)],
                         columns = ['DT','MLP','SOED'] )
report_df

Unnamed: 0,DT,MLP,SOED
repeat1,,,
repeat2,,,
repeat3,,,
repeat4,,,
repeat5,,,


In [27]:
def calc_cost(y_reality,y_decide):
    df = pd.DataFrame(np.column_stack((y_reality,y_decide)),columns = ['Reality','Decision'])
    df['FN_cost'] = np.where((df.Decision==0) & (df.Reality==1),1000,0)
    df['FP_cost'] = np.where((df.Decision==1) & (df.Reality==0),10,0)
    df['cost'] = df.FP_cost + df.FN_cost
    return df.cost.sum()

In [28]:
def get_cost_minimizing_threshold(y_reality,y_prob):
    candidate_df = pd.DataFrame(index=range(99),columns = ['thresh','cost'])
    candidate_df.thresh = np.linspace(0.01,0.99,99)
    candidate_df = candidate_df.set_index('thresh')
    
    for t in candidate_df.index.tolist():
        y_decide = np.where(y_prob>t,1,0)
        candidate_df.loc[t,'cost'] = calc_cost(y_reality,y_decide)
    return candidate_df[candidate_df.cost == candidate_df.cost.min()].index[0]

In [None]:
for loop_i in range(1,6):
    print(loop_i)
    random_index = np.random.permutation(X.shape[0])

    i = int(round(X.shape[0]*0.7))
    train_index = random_index[:i]
    test_index = random_index[i+1:]
    
    X_train = X_pca.iloc[train_index]
    X_test = X_pca.iloc[test_index]
    
    y_train = y[train_index]
    y_test = y[test_index]

    c_train = c[train_index,:]
    c_test = c[test_index,:]

    #soed
    soed = SOEDClassifier(mlp_max_iter=10000,som_input_len=X_train.shape[1])
    soed.fit(X_train.values,y_train,c_train)
    y_decide = soed.decide(X_test.values)
    total_cost= calc_cost(y_test,y_decide)
    report_df.loc[f'repeat{loop_i}','SOED'] = total_cost

    #dt
    dt = DecisionTreeClassifier()
    dt.fit(X_train.values,y_train)

    y_prob = dt.predict_proba(X_train.values)[:,1]
    thresh = get_cost_minimizing_threshold(y_train,y_prob)
    print(f'dt threshod= {thresh}')
    y_prob = dt.predict_proba(X_test.values)[:,1]
    y_decide = np.where(y_prob>thresh,1,0)
    total_cost= calc_cost(y_test,y_decide)
    report_df.loc[f'repeat{loop_i}','DT'] = total_cost

    #mlp
    mlp = MLPClassifier(max_iter=10000)
    mlp.fit(X_train.values,y_train)
    
    y_prob = mlp.predict_proba(X_train.values)[:,1]
    thresh = get_cost_minimizing_threshold(y_train,y_prob)
    print(f'mlp threshod= {thresh}')
    y_prob = mlp.predict_proba(X_test.values)[:,1]
    y_decide = np.where(y_prob>thresh,1,0)
    total_cost= calc_cost(y_test,y_decide)
    report_df.loc[f'repeat{loop_i}','MLP'] = total_cost


report_df.loc['Average'] = report_df.mean()

1
Model training complete.
dt threshod= 0.01
mlp threshod= 0.41000000000000003
2
Model training complete.
dt threshod= 0.01
mlp threshod= 0.35000000000000003
3
Model training complete.
dt threshod= 0.01
mlp threshod= 0.3
4
Model training complete.
dt threshod= 0.01
mlp threshod= 0.37
5


In [None]:
report_df

We can see that SOED will incur significantly less cost than MLP or DT.