In [None]:
%%capture
# Fetch the mlxtend sources; a later cell installs the package from this clone.
# %%capture hides the command's output.
!git clone https://github.com/rasbt/mlxtend.git

In [None]:
%%capture
!cd mlxtend; pip install .; cd ..

In [None]:
import pandas as pd

# Load the vertebrate data set from its GitHub-hosted CSV.
vertebrate_csv_url = 'https://raw.githubusercontent.com/arundhaj/datamining_class/master/data/vertebrate.csv'
vert_df = pd.read_csv(vertebrate_csv_url)

In [None]:
# Display the frame exactly as read from the CSV, before any cleaning.
vert_df

Unnamed: 0,Name,Body Temperature,Skin Cover,Gives Birth,Aquatic Creature,Aerial Creature,Has Legs,Hibernates,Class Label
0,human,warm-blooded,hair,yes,no,no,yes,no,mammal
1,python,cold-blooded,scales,no,no,no,no,yes,reptile
2,salmon,cold-blooded,scales,no,yes,no,no,no,fish
3,whale,warm-blooded,hair,yes,yes,no,no,no,mammal
4,frog,cold-blooded,none,no,semi,no,yes,yes,amphibian
5,komodo dragon,cold-blooded,scales,no,no,no,yes,no,reptile
6,bat,warm-blooded,hair,yes,no,yes,yes,yes,mammal
7,pigeon,warm-blooded,feathers,no,no,yes,yes,no,bird
8,cat,warm-blooded,fur,yes,no,no,yes,no,mammal
9,leopard shark,cold-blooded,scales,yes,yes,no,no,no,fish


In [None]:
# Drop rows with missing values, then remove the `Name` column.
# Rebinding instead of `inplace=True`, together with `errors='ignore'`, keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because `Name` is already gone.
vert_df = vert_df.dropna().drop(columns=['Name'], errors='ignore')
# Independent copy that will receive the integer-encoded version of each column.
vert_df_cat = vert_df.copy()

In [None]:
# mapping[column] is {integer code: original category value} for that column.
mapping = dict()

# Read the categories from the string-valued frame `vert_df` rather than from
# `vert_df_cat`: this loop overwrites `vert_df_cat` with integer codes, so
# re-running the cell would otherwise encode the codes themselves and replace
# the readable mapping with {0: 0, 1: 1, ...}.
for col in vert_df.columns:
  categorical = vert_df[col].astype('category')  # cast once, reuse below
  mapping[col] = dict(enumerate(categorical.cat.categories))
  vert_df_cat[col] = categorical.cat.codes

In [None]:
import json

# Render the code -> category mapping as indented JSON under a short header.
mapping_json = json.dumps(mapping, indent=4)
print('Mappings\n=========')
print(mapping_json)

Mappings
{
    "Body Temperature": {
        "0": "cold-blooded",
        "1": "warm-blooded"
    },
    "Skin Cover": {
        "0": "feathers",
        "1": "fur",
        "2": "hair",
        "3": "none",
        "4": "quills",
        "5": "scales"
    },
    "Gives Birth": {
        "0": "no",
        "1": "yes"
    },
    "Aquatic Creature": {
        "0": "no",
        "1": "semi",
        "2": "yes"
    },
    "Aerial Creature": {
        "0": "no",
        "1": "yes"
    },
    "Has Legs": {
        "0": "no",
        "1": "yes"
    },
    "Hibernates": {
        "0": "no",
        "1": "yes"
    },
    "Class Label": {
        "0": "amphibian",
        "1": "bird",
        "2": "fish",
        "3": "mammal",
        "4": "reptile"
    }
}


In [None]:
# Feature matrix: every encoded column except the target.
X = vert_df_cat.drop(columns=['Class Label'])
X

Unnamed: 0,Body Temperature,Skin Cover,Gives Birth,Aquatic Creature,Aerial Creature,Has Legs,Hibernates
0,1,2,1,0,0,1,0
1,0,5,0,0,0,0,1
2,0,5,0,2,0,0,0
3,1,2,1,2,0,0,0
4,0,3,0,1,0,1,1
5,0,5,0,0,0,1,0
6,1,2,1,0,1,1,1
7,1,0,0,0,1,1,0
8,1,1,1,0,0,1,0
9,0,5,1,2,0,0,0


In [None]:
# Target vector: the integer-encoded `Class Label` column.
y = vert_df_cat.loc[:, 'Class Label']
y

0     3
1     4
2     2
3     3
4     0
5     4
6     3
7     1
8     3
9     2
10    4
11    1
12    3
13    2
14    0
Name: Class Label, dtype: int8

In [None]:
from mlxtend.classifier.oner import OneRClassifier

# Fit OneR on the encoded features, then report the attribute it selected
# together with its fitted prediction dictionary.
oner = OneRClassifier()
oner.fit(X.to_numpy(), y.to_numpy())
best_feature = X.columns[oner.feature_idx_]
print(best_feature, oner.prediction_dict_)

Skin Cover {'total error': 3, 'rules (value: class)': {0: 1, 1: 3, 2: 3, 3: 0, 4: 3, 5: 2}}


In [None]:
# Translate each fitted rule back to readable names and score it on the data.
labels = mapping['Class Label']
sel_feat = X.columns[oner.feature_idx_]
total_rows = vert_df.shape[0]

for feat_code, class_code in oner.prediction_dict_['rules (value: class)'].items():
  feat_value = mapping[sel_feat][feat_code]
  # Rows matched by the rule's condition (selected feature == this value).
  covered = vert_df[vert_df[sel_feat] == feat_value]
  # True where the rule's predicted class equals the actual class.
  hits = covered['Class Label'] == labels[class_code]
  coverage = covered.shape[0] / total_rows  # share of all rows the rule matches
  accuracy = hits.sum() / hits.count()      # share of matched rows it gets right
  print(f'Rule: ({sel_feat} = {feat_value}) => {labels[class_code]}\t\tCoverage: {coverage:.2f}  Accuracy: {accuracy:.2f}')

Rule: (Skin Cover = feathers) => bird		Coverage: 0.13  Accuracy: 1.00
Rule: (Skin Cover = fur) => mammal		Coverage: 0.07  Accuracy: 1.00
Rule: (Skin Cover = hair) => mammal		Coverage: 0.20  Accuracy: 1.00
Rule: (Skin Cover = none) => amphibian		Coverage: 0.13  Accuracy: 1.00
Rule: (Skin Cover = quills) => mammal		Coverage: 0.07  Accuracy: 1.00
Rule: (Skin Cover = scales) => fish		Coverage: 0.40  Accuracy: 0.50


The rule `(Skin Cover = scales) => fish` misclassifies some examples (its accuracy is only 0.50). We therefore set aside the examples that the other rules already classify with 100% accuracy, keep only the examples with `Skin Cover = scales`, and drop the `Skin Cover` attribute from further consideration.

In [None]:
# Select the encoded rows whose original (string) `Skin Cover` is 'scales'.
# The mask comes from `vert_df`, whose index lines up with `vert_df_cat`.
scales_mask = vert_df['Skin Cover'] == 'scales'
vert_df_iter_2 = vert_df_cat[scales_mask]
vert_df_iter_2

Unnamed: 0,Body Temperature,Skin Cover,Gives Birth,Aquatic Creature,Aerial Creature,Has Legs,Hibernates,Class Label
1,0,5,0,0,0,0,1,4
2,0,5,0,2,0,0,0,2
5,0,5,0,0,0,1,0,4
9,0,5,1,2,0,0,0,2
10,0,5,0,1,0,1,0,4
13,0,5,0,2,0,0,0,2


In [None]:
# Second-round inputs: `Skin Cover` is the same for every row of this subset,
# so it is dropped together with the target column.
X_1 = vert_df_iter_2.drop(columns=['Skin Cover', 'Class Label'])
y_1 = vert_df_iter_2.loc[:, 'Class Label']

In [None]:
# Re-encode the two remaining class codes as 0/1 before fitting
# (2 = fish -> 0, 4 = reptile -> 1).
y_1_binary = y_1.replace({4: 1, 2: 0})
oner_1 = OneRClassifier()
oner_1.fit(X_1.to_numpy(), y_1_binary.to_numpy())
best_feature_1 = X_1.columns[oner_1.feature_idx_]
print(best_feature_1, oner_1.prediction_dict_)

Aquatic Creature {'total error': 0, 'rules (value: class)': {0: 1, 1: 1, 2: 0}}


For this iteration the class labels were re-encoded (fish → 0, reptile → 1), so the mappings needed to read the new rules are as follows:

In [None]:
# Second-round lookup tables: 0/1 class code -> original class code, plus the
# unchanged code -> value table for `Aquatic Creature`.
mappings_1 = {
    "Class Label": {0: 2, 1: 4},
    'Aquatic Creature': mapping['Aquatic Creature'],
}
print(json.dumps(mappings_1, indent=4))

{
    "Class Label": {
        "0": 2,
        "1": 4
    },
    "Aquatic Creature": {
        "0": "no",
        "1": "semi",
        "2": "yes"
    }
}


In [None]:
# Report the second-round rules with readable names, scored on the subset.
sel_feat_1 = X_1.columns[oner_1.feature_idx_]
labels_1 = mappings_1['Class Label']
total_rows = vert_df.shape[0]  # coverage stays relative to the full data set

for feat_code, binary_class in oner_1.prediction_dict_['rules (value: class)'].items():
  class_code = labels_1[binary_class]  # 0/1 label -> original class code
  # Rows of the subset matched by the rule's condition.
  covered = vert_df_iter_2[vert_df_iter_2[sel_feat_1] == feat_code]
  # True where the rule's predicted class equals the actual class.
  hits = covered['Class Label'] == class_code
  coverage = covered.shape[0] / total_rows
  accuracy = hits.sum() / hits.count()
  print(f'Rule: ({sel_feat_1} = {mappings_1[sel_feat_1][feat_code]}) => {labels[class_code]}  \t\t',
        f'Coverage: {coverage:.2f} Accuracy: {accuracy:.2f}')

Rule: (Aquatic Creature = no) => reptile  		 Coverage: 0.13 Accuracy: 1.00
Rule: (Aquatic Creature = semi) => reptile  		 Coverage: 0.07 Accuracy: 1.00
Rule: (Aquatic Creature = yes) => fish  		 Coverage: 0.20 Accuracy: 1.00


After integrating the new rules into the original ruleset, the following rules have been inferred from the data:

```
R1: (Skin Cover = feathers) => bird
R2: (Skin Cover = fur) => mammal
R3: (Skin Cover = hair) => mammal
R4: (Skin Cover = none) => amphibian
R5: (Skin Cover = quills) => mammal
R6a: (Skin Cover = scales) ^ (Aquatic Creature = yes) => fish
R6b: (Skin Cover = scales) ^ (Aquatic Creature = no) => reptile
R6c: (Skin Cover = scales) ^ (Aquatic Creature = semi) => reptile
```

Based on the above rules, the following tuple, which has no class label in the data set, can now be classified:

In [None]:
# Re-read the raw CSV, since `vert_df` no longer has the `Name` column or the
# rows with missing values, then pick out the gila monster row.
raw_csv_url = 'https://raw.githubusercontent.com/arundhaj/datamining_class/master/data/vertebrate.csv'
test_df = pd.read_csv(raw_csv_url)
test_df.loc[test_df['Name'] == 'gila monster']

Unnamed: 0,Name,Body Temperature,Skin Cover,Gives Birth,Aquatic Creature,Aerial Creature,Has Legs,Hibernates,Class Label
15,gila monster,cold-blooded,scales,no,no,no,yes,yes,


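The gila monster has scales and is not an aquatic creature, so the rule `(Skin Cover = scales) ^ (Aquatic Creature = no) => reptile` applies and classifies this tuple as __reptile__, which is [correct](https://en.wikipedia.org/wiki/Gila_monster).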