# Problem 1 
## Predicting Credit Approval 

This Problem has been solved in the following journal article using SOED: 

[Self-Organizing and Error Driven (SOED) artificial neural network for smarter](classifications[https://academic.oup.com/jcde/article/4/4/282/5729001)

We will be using ucimlrepo to import the sample data. Make sure to install it:

`pip install ucimlrepo`


In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
credit_approval = fetch_ucirepo(id=27) 
  
# data (as pandas dataframes) 
X = credit_approval.data.features 
y = credit_approval.data.targets 

print(credit_approval.variables) 


   name     role         type demographic description units missing_values
0   A16   Target  Categorical        None        None  None             no
1   A15  Feature   Continuous        None        None  None             no
2   A14  Feature   Continuous        None        None  None            yes
3   A13  Feature  Categorical        None        None  None             no
4   A12  Feature  Categorical        None        None  None             no
5   A11  Feature   Continuous        None        None  None             no
6   A10  Feature  Categorical        None        None  None             no
7    A9  Feature  Categorical        None        None  None             no
8    A8  Feature   Continuous        None        None  None             no
9    A7  Feature  Categorical        None        None  None            yes
10   A6  Feature  Categorical        None        None  None            yes
11   A5  Feature  Categorical        None        None  None            yes
12   A4  Feature  Categor

In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from soed import SOEDClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score

### Filling missing values

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A15     690 non-null    int64  
 1   A14     677 non-null    float64
 2   A13     690 non-null    object 
 3   A12     690 non-null    object 
 4   A11     690 non-null    int64  
 5   A10     690 non-null    object 
 6   A9      690 non-null    object 
 7   A8      690 non-null    float64
 8   A7      681 non-null    object 
 9   A6      681 non-null    object 
 10  A5      684 non-null    object 
 11  A4      684 non-null    object 
 12  A3      690 non-null    float64
 13  A2      678 non-null    float64
 14  A1      678 non-null    object 
dtypes: float64(4), int64(2), object(9)
memory usage: 81.0+ KB


In [4]:
# Filling Missing Values
variable_df = credit_approval.variables
for c,t in variable_df.set_index('name').drop(index=['A16']).type.to_dict().items():
    if t == 'Categorical':
        X.loc[:,c] = np.where(X[c].isna(),X[c].mode(),X[c])
    if t == 'Continuous':
        X.loc[:,c] = X[c].fillna(X[c].median())


In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A15     690 non-null    int64  
 1   A14     690 non-null    float64
 2   A13     690 non-null    object 
 3   A12     690 non-null    object 
 4   A11     690 non-null    int64  
 5   A10     690 non-null    object 
 6   A9      690 non-null    object 
 7   A8      690 non-null    float64
 8   A7      690 non-null    object 
 9   A6      690 non-null    object 
 10  A5      690 non-null    object 
 11  A4      690 non-null    object 
 12  A3      690 non-null    float64
 13  A2      690 non-null    float64
 14  A1      690 non-null    object 
dtypes: float64(4), int64(2), object(9)
memory usage: 81.0+ KB


### Binary Coding Categorical Attributes

In [6]:
X = pd.get_dummies(X)
dummy_vars = [c for c in X.columns if '_' in c]
X[dummy_vars] = X[dummy_vars].astype(int)

y = np.where(y=='+',1,0)[:,0]

In [7]:
X

Unnamed: 0,A15,A14,A11,A8,A3,A2,A13_g,A13_p,A13_s,A12_f,...,A6_w,A6_x,A5_g,A5_gg,A5_p,A4_l,A4_u,A4_y,A1_a,A1_b
0,0,202.0,1,1.25,0.000,30.83,1,0,0,1,...,1,0,1,0,0,0,1,0,0,1
1,560,43.0,6,3.04,4.460,58.67,1,0,0,1,...,0,0,1,0,0,0,1,0,1,0
2,824,280.0,0,1.50,0.500,24.50,1,0,0,1,...,0,0,1,0,0,0,1,0,1,0
3,3,100.0,5,3.75,1.540,27.83,1,0,0,0,...,1,0,1,0,0,0,1,0,0,1
4,0,120.0,0,1.71,5.625,20.17,0,0,1,1,...,1,0,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,0,260.0,0,1.25,10.085,21.08,1,0,0,1,...,0,0,0,0,1,0,0,1,0,1
686,394,200.0,2,2.00,0.750,22.67,1,0,0,0,...,0,0,1,0,0,0,1,0,1,0
687,1,200.0,1,2.00,13.500,25.25,1,0,0,0,...,0,0,0,0,1,0,0,1,1,0
688,750,280.0,0,0.04,0.205,17.92,1,0,0,1,...,0,0,1,0,0,0,1,0,0,1


### Standardizing Data

In [8]:
X  = (X - X.mean())/X.std()

In [9]:
X.describe()

Unnamed: 0,A15,A14,A11,A8,A3,A2,A13_g,A13_p,A13_s,A12_f,...,A6_w,A6_x,A5_g,A5_gg,A5_p,A4_l,A4_u,A4_y,A1_a,A1_b
count,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,...,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0
mean,1.0297720000000001e-17,4.8914170000000004e-17,1.0297720000000001e-17,1.029772e-16,1.0297720000000001e-17,2.059544e-16,-1.699124e-16,5.96946e-17,-2.9605950000000004e-17,-1.055516e-16,...,-7.465848e-17,4.633974e-17,-1.055516e-16,-1.5446580000000003e-17,-1.0297720000000001e-17,-1.5446580000000003e-17,-1.055516e-16,-1.0297720000000001e-17,-3.604202e-17,3.604202e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.1952717,-1.066043,-0.4935286,-0.6643947,-0.9559198,-1.497787,-3.098621,-0.1082276,-0.2998615,-1.08712,...,-0.3195125,-0.2412421,-1.782472,-0.0538773,-0.5557424,-0.0538773,-1.782472,-0.5557424,-0.6609584,-1.510762
25%,-0.1952717,-0.6014412,-0.4935286,-0.6150897,-0.7550425,-0.7456942,0.3222565,-0.1082276,-0.2998615,-1.08712,...,-0.3195125,-0.2412421,0.5602055,-0.0538773,-0.5557424,-0.0538773,0.5602055,-0.5557424,-0.6609584,-1.510762
50%,-0.194312,-0.1368388,-0.4935286,-0.3655762,-0.4035072,-0.2575087,0.3222565,-0.1082276,-0.2998615,0.9185289,...,-0.3195125,-0.2412421,0.5602055,-0.0538773,-0.5557424,-0.0538773,0.5602055,-0.5557424,-0.6609584,0.6609584
75%,-0.1193615,0.5136044,0.1233822,0.1200038,0.4919034,0.522197,0.3222565,-0.1082276,-0.2998615,0.9185289,...,-0.3195125,-0.2412421,0.5602055,-0.0538773,-0.5557424,-0.0538773,0.5602055,-0.5557424,1.510762,0.6609584
max,18.99821,10.54901,13.28414,7.851932,4.668645,4.10918,0.3222565,9.2264,3.33004,0.9185289,...,3.125232,4.139206,0.5602055,18.53379,1.796787,18.53379,0.5602055,1.796787,1.510762,0.6609584


### Split Data

In [10]:
random_index = np.random.permutation(X.shape[0])

i = int(round(X.shape[0]*0.5))
train_index = random_index[:i]
test_index = random_index[i+1:]

X_train = X.iloc[train_index]
X_test = X.iloc[test_index]

y_train = y[train_index]
y_test = y[test_index]

In [11]:
X_train

Unnamed: 0,A15,A14,A11,A8,A3,A2,A13_g,A13_p,A13_s,A12_f,...,A6_w,A6_x,A5_g,A5_gg,A5_p,A4_l,A4_u,A4_y,A1_a,A1_b
63,-0.195272,-1.066043,-0.287892,-0.190767,-0.788187,-0.935404,0.322257,-0.108228,-0.299861,0.918529,...,-0.319513,-0.241242,0.560206,-0.053877,-0.555742,-0.053877,0.560206,-0.555742,1.510762,-1.510762
567,-0.195272,1.024667,-0.493529,-0.402929,-0.378398,-0.534906,0.322257,-0.108228,-0.299861,0.918529,...,-0.319513,4.139206,0.560206,-0.053877,-0.555742,-0.053877,0.560206,-0.555742,1.510762,-1.510762
7,0.063648,-0.601441,-0.493529,-0.652442,1.371244,-0.724615,0.322257,-0.108228,-0.299861,0.918529,...,-0.319513,-0.241242,0.560206,-0.053877,-0.555742,-0.053877,0.560206,-0.555742,1.510762,-1.510762
49,-0.195272,-0.485291,-0.493529,-0.615090,-0.822336,-0.640300,0.322257,-0.108228,-0.299861,0.918529,...,-0.319513,-0.241242,0.560206,-0.053877,-0.555742,-0.053877,0.560206,-0.555742,-0.660958,0.660958
178,-0.091627,-0.717592,0.329019,-0.302824,0.902195,-1.104034,0.322257,-0.108228,-0.299861,0.918529,...,-0.319513,-0.241242,0.560206,-0.053877,-0.555742,-0.053877,0.560206,-0.555742,1.510762,-1.510762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,-0.195272,-0.601441,-0.493529,0.530879,2.358556,0.314992,0.322257,-0.108228,-0.299861,0.918529,...,-0.319513,-0.241242,-1.782472,-0.053877,1.796787,-0.053877,-1.782472,1.796787,-0.660958,0.660958
115,-0.195272,0.095462,-0.082255,-0.278919,-0.729933,-0.513827,0.322257,-0.108228,-0.299861,0.918529,...,-0.319513,-0.241242,0.560206,-0.053877,-0.555742,-0.053877,0.560206,-0.555742,1.510762,-1.510762
69,-0.099304,1.924834,-0.287892,-0.178815,4.091122,0.308247,0.322257,-0.108228,-0.299861,-1.087120,...,-0.319513,4.139206,0.560206,-0.053877,-0.555742,-0.053877,0.560206,-0.555742,-0.660958,0.660958
297,-0.195272,3.115377,-0.082255,-0.141462,-0.453727,0.118538,0.322257,-0.108228,-0.299861,-1.087120,...,-0.319513,-0.241242,0.560206,-0.053877,-0.555742,-0.053877,0.560206,-0.555742,-0.660958,0.660958


### Model Trainig

In [17]:
soed = SOEDClassifier(mlp_max_iter=10000)
out = soed.fit(X_train.values,y_train)
out

version 1.0.8
Model training complete.


<soed.SOEDClassifier at 0x130171850>

In [18]:
y_proba = soed.predict_proba(X_test.values)
y_pred = soed.predict(X_test.values)

recall = recall_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)
auc = roc_auc_score(y_test,y_proba[:,1])

performance = {'recall':recall,'precision':precision,'accuracy':accuracy,'auc':auc}

In [19]:
print(performance)

{'recall': 0.8193548387096774, 'precision': 0.8758620689655172, 'accuracy': 0.8662790697674418, 'auc': np.float64(0.8812937361324459)}


### Model Comparison

In [20]:
report_df = pd.DataFrame(index = [f'repeat{i}' for i in range(1,6)],
                         columns = ['DT','MLP','SOED'] )
report_df

Unnamed: 0,DT,MLP,SOED
repeat1,,,
repeat2,,,
repeat3,,,
repeat4,,,
repeat5,,,


In [21]:
for loop_i in range(1,6):
    print(loop_i)
    random_index = np.random.permutation(X.shape[0])

    i = int(round(X.shape[0]*0.7))
    train_index = random_index[:i]
    test_index = random_index[i+1:]
    
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    
    y_train = y[train_index]
    y_test = y[test_index]

    #soed
    soed = SOEDClassifier(mlp_max_iter=10000,som_x=7,som_y=7,som_sigma=7)
    soed.fit(X_train.values,y_train)
    y_proba = soed.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','SOED'] = auc

    #dt
    dt = DecisionTreeClassifier(max_depth=10)
    dt.fit(X_train.values,y_train)
    y_proba = dt.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','DT'] = auc

    #mlp
    mlp = MLPClassifier(max_iter=10000)
    mlp.fit(X_train.values,y_train)
    y_proba = mlp.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','MLP'] = auc

report_df.loc['Average'] = report_df.mean()

1
version 1.0.8
Model training complete.
2
version 1.0.8
Model training complete.
3
version 1.0.8
Model training complete.
4
version 1.0.8
Model training complete.
5
version 1.0.8
Model training complete.


In [22]:
report_df

Unnamed: 0,DT,MLP,SOED
repeat1,0.833995,0.906935,0.880149
repeat2,0.828681,0.9043,0.92585
repeat3,0.880228,0.906401,0.918397
repeat4,0.830859,0.913217,0.918736
repeat5,0.80625,0.891098,0.905256
Average,0.836002,0.90439,0.909678


We can see that SOED and MLP are performing very similarly. SOED will show its superiority in the decision-making problem. 

# Problem 2
## Deciding Credit Approval 

We must assume some data to use credit approval for a decision-making problem. We will assume that when we make a false negative, we will incur a 1000-dollar mistake cost; when we make a false positive, we will incur a 2-dollar cost of errors.


In [23]:
c0 = np.where(y==0,0,1000)
c1 = np.where(y==1,0,2)
c = np.column_stack((c0,c1))


random_index = np.random.permutation(X.shape[0])

i = int(round(X.shape[0]*0.5))
train_index = random_index[:i]
test_index = random_index[i+1:]

X_train = X.iloc[train_index]
X_test = X.iloc[test_index]

y_train = y[train_index]
y_test = y[test_index]

c_train = c[train_index,:]
c_test = c[test_index,:]

In [24]:
soed = SOEDClassifier(mlp_max_iter=10000)
soed.fit(X_train.values,y_train,c_train)

version 1.0.8
Model training complete.


<soed.SOEDClassifier at 0x1177a2540>

In [25]:
y_decide = soed.decide(X_test.values)

In [26]:
df = pd.DataFrame(np.column_stack((y_test,y_decide)),columns = ['Reality','Decision'])
df['FN_cost'] = np.where((df.Decision==0) & (df.Reality==1),1000,0)
df['FP_cost'] = np.where((df.Decision==1) & (df.Reality==0),2,0)
df['cost'] = df.FP_cost + df.FN_cost
df

Unnamed: 0,Reality,Decision,FN_cost,FP_cost,cost
0,1,1,0,0,0
1,0,1,0,2,2
2,0,0,0,0,0
3,1,1,0,0,0
4,0,0,0,0,0
...,...,...,...,...,...
339,0,1,0,2,2
340,1,1,0,0,0
341,1,1,0,0,0
342,0,1,0,2,2


In [27]:
total_cost = df.cost.sum()
print(f'total cost is {total_cost}.')

total cost is 21140.


In [28]:
df.cost.value_counts()

cost
0       253
2        70
1000     21
Name: count, dtype: int64

### Comparing Models

In [29]:
report_df = pd.DataFrame(index = [f'repeat{i}' for i in range(1,6)],
                         columns = ['DT','MLP','SOED'] )
report_df

Unnamed: 0,DT,MLP,SOED
repeat1,,,
repeat2,,,
repeat3,,,
repeat4,,,
repeat5,,,


In [30]:
def calc_cost(y_reality,y_decide):
    df = pd.DataFrame(np.column_stack((y_reality,y_decide)),columns = ['Reality','Decision'])
    df['FN_cost'] = np.where((df.Decision==0) & (df.Reality==1),1000,0)
    df['FP_cost'] = np.where((df.Decision==1) & (df.Reality==0),10,0)
    df['cost'] = df.FP_cost + df.FN_cost
    return df.cost.sum()

In [31]:
def get_cost_minimizing_threshold(y_reality,y_prob):
    candidate_df = pd.DataFrame(index=range(99),columns = ['thresh','cost'])
    candidate_df.thresh = np.linspace(0.01,0.99,99)
    candidate_df = candidate_df.set_index('thresh')
    
    for t in candidate_df.index.tolist():
        y_decide = np.where(y_prob>t,1,0)
        candidate_df.loc[t,'cost'] = calc_cost(y_reality,y_decide)
    return candidate_df[candidate_df.cost == candidate_df.cost.min()].index[0]

In [32]:
for loop_i in range(1,6):
    print(loop_i)
    random_index = np.random.permutation(X.shape[0])

    i = int(round(X.shape[0]*0.7))
    train_index = random_index[:i]
    test_index = random_index[i+1:]
    
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    
    y_train = y[train_index]
    y_test = y[test_index]

    c_train = c[train_index,:]
    c_test = c[test_index,:]

    #soed
    soed = SOEDClassifier(mlp_max_iter=10000)
    soed.fit(X_train.values,y_train,c_train)
    y_decide = soed.decide(X_test.values)
    total_cost= calc_cost(y_test,y_decide)
    report_df.loc[f'repeat{loop_i}','SOED'] = total_cost

    #dt
    dt = DecisionTreeClassifier()
    dt.fit(X_train.values,y_train)

    y_prob = dt.predict_proba(X_train.values)[:,1]
    thresh = get_cost_minimizing_threshold(y_train,y_prob)
    print(f'dt threshod= {thresh}')
    y_prob = dt.predict_proba(X_test.values)[:,1]
    y_decide = np.where(y_prob>thresh,1,0)
    total_cost= calc_cost(y_test,y_decide)
    report_df.loc[f'repeat{loop_i}','DT'] = total_cost

    #mlp
    mlp = MLPClassifier(max_iter=10000)
    mlp.fit(X_train.values,y_train)
    
    y_prob = mlp.predict_proba(X_train.values)[:,1]
    thresh = get_cost_minimizing_threshold(y_train,y_prob)
    print(f'mlp threshod= {thresh}')
    y_prob = mlp.predict_proba(X_test.values)[:,1]
    y_decide = np.where(y_prob>thresh,1,0)
    total_cost= calc_cost(y_test,y_decide)
    report_df.loc[f'repeat{loop_i}','MLP'] = total_cost


report_df.loc['Average'] = report_df.mean()

1
version 1.0.8
Model training complete.
dt threshod= 0.01
mlp threshod= 0.29000000000000004
2
version 1.0.8
Model training complete.
dt threshod= 0.01
mlp threshod= 0.2
3
version 1.0.8
Model training complete.
dt threshod= 0.01
mlp threshod= 0.25
4
version 1.0.8
Model training complete.
dt threshod= 0.01
mlp threshod= 0.45
5
version 1.0.8
Model training complete.
dt threshod= 0.01
mlp threshod= 0.29000000000000004


In [34]:
report_df

Unnamed: 0,DT,MLP,SOED
repeat1,21220.0,12200.0,5500.0
repeat2,22210.0,11200.0,17380.0
repeat3,9250.0,11220.0,7480.0
repeat4,21250.0,14240.0,12470.0
repeat5,29150.0,11180.0,4370.0
Average,20616.0,12008.0,9440.0


We can see that SOED will incur significantly less cost than MLP or DT.