# Problem 1 
## Detecting Poisonous Mushrooms 

This Problem has been solved in the following journal article using SOED: 

[Self-Organizing and Error Driven (SOED) artificial neural network for smarter](classifications[https://academic.oup.com/jcde/article/4/4/282/5729001)

We will be using ucimlrepo to import the sample data. Make sure to install it:

`pip install ucimlrepo`


In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
mushroom = fetch_ucirepo(id=73) 
  
# data (as pandas dataframes) 
X = mushroom.data.features 
y = mushroom.data.targets 

  
# variable information 
print(mushroom.variables) 


                        name     role         type demographic  \
0                  poisonous   Target  Categorical        None   
1                  cap-shape  Feature  Categorical        None   
2                cap-surface  Feature  Categorical        None   
3                  cap-color  Feature       Binary        None   
4                    bruises  Feature  Categorical        None   
5                       odor  Feature  Categorical        None   
6            gill-attachment  Feature  Categorical        None   
7               gill-spacing  Feature  Categorical        None   
8                  gill-size  Feature  Categorical        None   
9                 gill-color  Feature  Categorical        None   
10               stalk-shape  Feature  Categorical        None   
11                stalk-root  Feature  Categorical        None   
12  stalk-surface-above-ring  Feature  Categorical        None   
13  stalk-surface-below-ring  Feature  Categorical        None   
14    stal

In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from soed import SOEDClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score

### Filling missing values

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   cap-shape                 8124 non-null   object
 1   cap-surface               8124 non-null   object
 2   cap-color                 8124 non-null   object
 3   bruises                   8124 non-null   object
 4   odor                      8124 non-null   object
 5   gill-attachment           8124 non-null   object
 6   gill-spacing              8124 non-null   object
 7   gill-size                 8124 non-null   object
 8   gill-color                8124 non-null   object
 9   stalk-shape               8124 non-null   object
 10  stalk-root                5644 non-null   object
 11  stalk-surface-above-ring  8124 non-null   object
 12  stalk-surface-below-ring  8124 non-null   object
 13  stalk-color-above-ring    8124 non-null   object
 14  stalk-color-below-ring  

In [4]:
# Filling Missing Values
variable_df = mushroom.variables
for c,t in variable_df.set_index('name').drop(index=['poisonous']).type.to_dict().items():
    if t == 'Categorical':
        X.loc[:,c] = np.where(X[c].isna(),X[c].mode(),X[c])
    if t == 'Continuous':
        X.loc[:,c] = X[c].fillna(X[c].median())


In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   cap-shape                 8124 non-null   object
 1   cap-surface               8124 non-null   object
 2   cap-color                 8124 non-null   object
 3   bruises                   8124 non-null   object
 4   odor                      8124 non-null   object
 5   gill-attachment           8124 non-null   object
 6   gill-spacing              8124 non-null   object
 7   gill-size                 8124 non-null   object
 8   gill-color                8124 non-null   object
 9   stalk-shape               8124 non-null   object
 10  stalk-root                8124 non-null   object
 11  stalk-surface-above-ring  8124 non-null   object
 12  stalk-surface-below-ring  8124 non-null   object
 13  stalk-color-above-ring    8124 non-null   object
 14  stalk-color-below-ring  

### Binary Coding Categorical Attributes

In [6]:
X = pd.get_dummies(X)
dummy_vars = [c for c in X.columns if '_' in c]
X[dummy_vars] = X[dummy_vars].astype(int)

y = np.where(y=='p',1,0)

In [7]:
dropping_columns = []
for c in X.columns:
    if X[c].std() == 0:
        print(c)
        dropping_columns.append(c)

veil-type_p


In [8]:
X = X.drop(columns = dropping_columns)

### Standardizing Data

In [9]:
X  = (X - X.mean())/X.std()

In [10]:
X

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,-0.242710,-0.022193,-0.796161,-0.336857,-0.062881,1.105419,-0.632199,-0.022193,1.475851,-0.815274,...,2.347113,-0.994537,-0.516688,-0.795336,-0.599494,-0.337763,-0.193076,-0.404817,4.590587,-0.155572
1,-0.242710,-0.022193,-0.796161,-0.336857,-0.062881,1.105419,-0.632199,-0.022193,1.475851,-0.815274,...,-0.426003,-0.994537,-0.516688,-0.795336,1.667867,-0.337763,-0.193076,-0.404817,-0.217810,-0.155572
2,4.119631,-0.022193,-0.796161,-0.336857,-0.062881,-0.904523,-0.632199,-0.022193,1.475851,-0.815274,...,-0.426003,-0.994537,-0.516688,-0.795336,-0.599494,-0.337763,5.178669,-0.404817,-0.217810,-0.155572
3,-0.242710,-0.022193,-0.796161,-0.336857,-0.062881,1.105419,-0.632199,-0.022193,-0.677492,1.226430,...,2.347113,-0.994537,-0.516688,-0.795336,-0.599494,-0.337763,-0.193076,-0.404817,4.590587,-0.155572
4,-0.242710,-0.022193,-0.796161,-0.336857,-0.062881,1.105419,-0.632199,-0.022193,1.475851,-0.815274,...,-0.426003,-0.994537,-0.516688,-0.795336,1.667867,-0.337763,-0.193076,-0.404817,-0.217810,-0.155572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,-0.242710,-0.022193,-0.796161,2.968250,-0.062881,-0.904523,-0.632199,-0.022193,1.475851,-0.815274,...,-0.426003,-0.994537,-0.516688,-0.795336,-0.599494,2.960295,-0.193076,-0.404817,-0.217810,-0.155572
8120,-0.242710,-0.022193,-0.796161,-0.336857,-0.062881,1.105419,-0.632199,-0.022193,1.475851,-0.815274,...,-0.426003,1.005369,-0.516688,-0.795336,-0.599494,2.960295,-0.193076,-0.404817,-0.217810,-0.155572
8121,-0.242710,-0.022193,1.255873,-0.336857,-0.062881,-0.904523,-0.632199,-0.022193,1.475851,-0.815274,...,-0.426003,-0.994537,-0.516688,-0.795336,-0.599494,2.960295,-0.193076,-0.404817,-0.217810,-0.155572
8122,-0.242710,-0.022193,-0.796161,2.968250,-0.062881,-0.904523,-0.632199,-0.022193,-0.677492,1.226430,...,-0.426003,1.005369,-0.516688,-0.795336,-0.599494,2.960295,-0.193076,-0.404817,-0.217810,-0.155572


### PCA Transformation

In [11]:
pca = PCA(n_components=50)  # Reduce to 50 dimensions
X_pca = pd.DataFrame(pca.fit_transform(X),columns = [f'PC{i}' for i in range(1,51)])

### Split Data

In [12]:
random_index = np.random.permutation(X.shape[0])

i = int(round(X.shape[0]*0.4))
train_index = random_index[:i]
test_index = random_index[i+1:]

X_train = X_pca.iloc[train_index]
X_test = X_pca.iloc[test_index]

y_train = y[train_index]
y_test = y[test_index]

In [13]:
X_train

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC41,PC42,PC43,PC44,PC45,PC46,PC47,PC48,PC49,PC50
6751,2.606467,1.497804,4.028829,-0.032513,0.703434,-0.158585,-0.733007,-0.081488,0.179695,-0.074203,...,0.164819,-1.121860,-1.203427,-1.245833,-1.547661,0.134364,0.065288,0.030775,0.426489,0.191567
2768,-2.281366,-0.837011,-0.289116,4.989484,-0.028097,-1.153761,1.796172,0.643339,0.591414,-0.501998,...,1.549101,-1.259713,0.186356,0.456818,1.095066,0.019508,-0.301799,-0.141780,-1.100952,0.268336
957,-1.232730,0.354430,0.840964,3.737157,-0.256842,-0.959626,0.374262,0.142047,-0.110514,-0.703924,...,-1.338959,-0.029122,0.257986,0.224691,0.157751,-0.368259,-0.434387,1.101054,0.066114,0.100408
3915,-2.246396,0.179908,0.752862,3.717833,-0.343515,-1.076762,0.706069,0.287210,-0.436958,-0.460100,...,-0.763574,-0.076786,0.536236,-2.165214,1.822539,-1.165735,-0.475819,0.783332,0.309122,0.643503
2504,-2.084704,-0.598434,0.308979,4.061920,-0.092619,-0.999649,1.060452,0.307304,0.212351,-0.401156,...,-0.075819,-0.101243,0.457471,-2.412923,1.615519,-1.191922,-0.021350,-0.316033,0.089202,0.071001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6241,2.918113,1.550859,3.453036,1.423217,0.458660,-0.281654,-0.415828,-0.085575,0.326285,-0.193408,...,-0.448648,-0.464842,-0.372671,0.029547,0.077422,-0.621400,0.264251,0.411770,-0.528318,0.220964
1738,-3.082770,-1.588741,1.308099,-4.955577,-2.065272,-1.590585,1.703493,-0.031141,-0.260748,-0.774614,...,-1.558017,-0.828044,-0.300636,-0.812468,-0.555030,1.284684,-1.077537,-1.145577,-1.265934,1.011906
4609,4.808048,-2.226429,-3.806952,-0.575758,-0.241446,0.690438,-0.948226,-0.304491,0.220372,-0.293323,...,-0.337799,-0.836817,0.063928,0.650347,0.747339,-0.829919,-0.890059,-0.416730,-0.311642,0.109738
4925,4.952901,-2.296068,-3.913659,0.274590,-0.181722,0.359079,-0.597444,0.295636,-0.843329,-0.563980,...,-1.637575,0.322619,0.168226,-0.091043,-0.056747,-0.800100,-0.433783,0.183306,0.152166,-0.814060


### Model Trainig

In [14]:
soed = SOEDClassifier(mlp_max_iter=10000,som_input_len=X_train.shape[1])
soed.fit(X_train.values,y_train)

Model training complete.


<soed.SOEDClassifier at 0x10fe56db0>

### Model Evaluation

In [15]:
y_proba = soed.predict_proba(X_test.values)
y_pred = soed.predict(X_test.values)

recall = recall_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)
auc = roc_auc_score(y_test,y_proba[:,1])

performance = {'recall':recall,'precision':precision,'accuracy':accuracy,'auc':auc}

In [16]:
print(performance)

{'recall': 0.9818965517241379, 'precision': 0.9973730297723292, 'accuracy': 0.9901498050482249, 'auc': np.float64(0.9962338087712901)}


### Model Comparison

In [17]:
report_df = pd.DataFrame(index = [f'repeat{i}' for i in range(1,6)],
                         columns = ['DT','SOED'] )
report_df

Unnamed: 0,DT,SOED
repeat1,,
repeat2,,
repeat3,,
repeat4,,
repeat5,,


In [18]:
for loop_i in range(1,6):
    print(loop_i)
    random_index = np.random.permutation(X.shape[0])

    i = int(round(X.shape[0]*0.4))
    train_index = random_index[:i]
    test_index = random_index[i+1:]
    
    X_train = X_pca.iloc[train_index]
    X_test = X_pca.iloc[test_index]
    
    y_train = y[train_index]
    y_test = y[test_index]

    #soed
    soed = SOEDClassifier(mlp_max_iter=10000,som_input_len=X_train.shape[1])
    soed.fit(X_train.values,y_train)
    y_proba = soed.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','SOED'] = auc

    #dt
    dt = DecisionTreeClassifier(max_depth=10)
    dt.fit(X_train.values,y_train)
    y_proba = dt.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','DT'] = auc


report_df.loc['Average'] = report_df.mean()

1
Model training complete.
2
Model training complete.
3
Model training complete.
4
Model training complete.
5
Model training complete.


In [19]:
report_df

Unnamed: 0,DT,SOED
repeat1,0.994959,0.996968
repeat2,0.995657,0.995719
repeat3,0.995107,0.997904
repeat4,0.99486,0.999675
repeat5,0.994894,0.997872
Average,0.995095,0.997628


We can see that their performance is comparable. Let's see how they will do in the next problem.

# Problem 2
## Deciding to consume mushroom 

We must assume some data to use the mushroom dataset for a decision-making problem. We will assume that when we make a false negative, we will incur a 200-dollar mistake cost; when we make a false positive, we will incur a 5-dollar cost of errors.


In [20]:
c0 = np.where(y==0,0,200)
c1 = np.where(y==1,0,5)
c = np.column_stack((c0,c1))


random_index = np.random.permutation(X.shape[0])

i = int(round(X.shape[0]*0.2))
train_index = random_index[:i]
test_index = random_index[i+1:]

X_train = X_pca.iloc[train_index]
X_test = X_pca.iloc[test_index]

y_train = y[train_index]
y_test = y[test_index]

c_train = c[train_index,:]
c_test = c[test_index,:]

In [21]:
soed = SOEDClassifier(mlp_max_iter=10000,som_input_len=X_train.shape[1])
soed.fit(X_train.values,y_train,c_train)

Model training complete.


<soed.SOEDClassifier at 0x10fe577a0>

In [22]:
y_decide = soed.decide(X_test.values)

In [23]:
df = pd.DataFrame(np.column_stack((y_test,y_decide)),columns = ['Reality','Decision'])
df['FN_cost'] = np.where((df.Decision==0) & (df.Reality==1),200,0)
df['FP_cost'] = np.where((df.Decision==1) & (df.Reality==0),5,0)
df['cost'] = df.FP_cost + df.FN_cost
df

Unnamed: 0,Reality,Decision,FN_cost,FP_cost,cost
0,1,1,0,0,0
1,1,1,0,0,0
2,0,0,0,0,0
3,1,1,0,0,0
4,0,0,0,0,0
...,...,...,...,...,...
6493,1,1,0,0,0
6494,1,1,0,0,0
6495,1,1,0,0,0
6496,1,1,0,0,0


In [24]:
total_cost = df.cost.sum()
print(f'total cost is {total_cost}.')

total cost is 2710.


In [25]:
df.cost.value_counts()

cost
0      6268
5       222
200       8
Name: count, dtype: int64

### Comparing Models

In [26]:
report_df = pd.DataFrame(index = [f'repeat{i}' for i in range(1,6)],
                         columns = ['DT','SOED'] )
report_df

Unnamed: 0,DT,SOED
repeat1,,
repeat2,,
repeat3,,
repeat4,,
repeat5,,


In [27]:
def calc_cost(y_reality,y_decide):
    df = pd.DataFrame(np.column_stack((y_reality,y_decide)),columns = ['Reality','Decision'])
    df['FN_cost'] = np.where((df.Decision==0) & (df.Reality==1),200,0)
    df['FP_cost'] = np.where((df.Decision==1) & (df.Reality==0),5,0)
    df['cost'] = df.FP_cost + df.FN_cost
    return df.cost.sum()

In [28]:
def get_cost_minimizing_threshold(y_reality,y_prob):
    candidate_df = pd.DataFrame(index=range(99),columns = ['thresh','cost'])
    candidate_df.thresh = np.linspace(0.01,0.99,99)
    candidate_df = candidate_df.set_index('thresh')
    
    for t in candidate_df.index.tolist():
        y_decide = np.where(y_prob>t,1,0)
        candidate_df.loc[t,'cost'] = calc_cost(y_reality,y_decide)
    return candidate_df[candidate_df.cost == candidate_df.cost.min()].index[0]

In [29]:
for loop_i in range(1,6):
    print(loop_i)
    random_index = np.random.permutation(X.shape[0])

    i = int(round(X.shape[0]*0.4))
    train_index = random_index[:i]
    test_index = random_index[i+1:]
    
    X_train = X_pca.iloc[train_index]
    X_test = X_pca.iloc[test_index]
    
    y_train = y[train_index]
    y_test = y[test_index]

    c_train = c[train_index,:]
    c_test = c[test_index,:]

    #soed
    soed = SOEDClassifier(mlp_max_iter=10000,som_input_len=X_train.shape[1])
    soed.fit(X_train.values,y_train,c_train)
    y_decide = soed.decide(X_test.values)
    total_cost= calc_cost(y_test,y_decide)
    report_df.loc[f'repeat{loop_i}','SOED'] = total_cost

    #dt
    dt = DecisionTreeClassifier()
    dt.fit(X_train.values,y_train)

    y_prob = dt.predict_proba(X_train.values)[:,1]
    thresh = get_cost_minimizing_threshold(y_train,y_prob)
    print(f'dt threshod= {thresh}')
    y_prob = dt.predict_proba(X_test.values)[:,1]
    y_decide = np.where(y_prob>thresh,1,0)
    total_cost= calc_cost(y_test,y_decide)
    report_df.loc[f'repeat{loop_i}','DT'] = total_cost


report_df.loc['Average'] = report_df.mean()

1
Model training complete.
dt threshod= 0.01
2
Model training complete.
dt threshod= 0.01
3
Model training complete.
dt threshod= 0.01
4
Model training complete.
dt threshod= 0.01
5
Model training complete.
dt threshod= 0.01


In [30]:
report_df

Unnamed: 0,DT,SOED
repeat1,1840.0,380.0
repeat2,3030.0,1170.0
repeat3,1685.0,2275.0
repeat4,3085.0,2705.0
repeat5,4665.0,1945.0
Average,2861.0,1695.0


We can see that SOED will incur significantly less cost than DT.