#### Usage example on adult (census income) dataset

These examples use the RIPPER algorithm. IREP usage is similar, with only slight hyperparameter differences.

In [1]:
import wittgenstein as lw
import pandas as pd

Load our dataset:

In [2]:
# Read the adult CSV into a dataframe and preview its first record.
df = pd.read_csv(filepath_or_buffer='../../datasets/adult.csv')
df.head(n=1)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K


Split our data into train-test sets:

In [3]:
from sklearn.model_selection import train_test_split

# Hold out a test partition. test_size=0.25 is sklearn's default, spelled out
# here; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.25, random_state=42)

#### Training

Create a ruleset classifier:

In [4]:
# Build the RIPPER classifier; verbosity=1 makes training print its steps.
clf = lw.RIPPER(verbosity=1, random_state=42)
clf

<RIPPER(prune_size=0.33, max_rule_conds=None, verbosity=1, max_total_conds=None, k=2, random_state=42, max_rules=None, n_discretize_bins=10, dl_allowance=64)>

Train the ruleset classifier on the trainset:

In [5]:
# Learn rules that identify the '>50K' class of the 'income' column.
clf.fit(train, pos_class='>50K', class_feat='income')
# The trained model is exposed as the ruleset_ attribute.
clf.ruleset_

discretizing 6 features

GREW INITIAL RULESET:
[[marital.status=Married-civ-spouse ^ education.num=>13.0 ^ occupation=Exec-managerial ^ hours.per.week=40.0-50.0 ^ native.country=United-States] V
[marital.status=Married-civ-spouse ^ education.num=11.0-13.0 ^ occupation=Exec-managerial ^ hours.per.week=40.0-50.0 ^ capital.gain=9999.9-19999.8] V
[marital.status=Married-civ-spouse ^ education=Bachelors ^ occupation=Exec-managerial ^ hours.per.week=40.0-50.0 ^ workclass=Private ^ age=50.0-58.0] V
[marital.status=Married-civ-spouse ^ education=Bachelors ^ occupation=Exec-managerial ^ age=45.0-50.0 ^ hours.per.week=35.0-40.0 ^ workclass=State-gov] V
[marital.status=Married-civ-spouse ^ education=Bachelors ^ occupation=Exec-managerial ^ age=45.0-50.0] V
[marital.status=Married-civ-spouse ^ education.num=>13.0 ^ occupation=Prof-specialty ^ education=Prof-school ^ workclass=Self-emp-inc ^ hours.per.week=>55.0] V
[marital.status=Married-civ-spouse ^ education.num=>13.0 ^ occupation=Prof-specialty


OPTIMIZED RULESET:
[[marital.status=Married-civ-spouse ^ education.num=>13.0 ^ occupation=Exec-managerial ^ hours.per.week=40.0-50.0 ^ native.country=United-States] V
[marital.status=Married-civ-spouse ^ education.num=11.0-13.0 ^ occupation=Exec-managerial ^ hours.per.week=40.0-50.0 ^ capital.gain=9999.9-19999.8] V
[marital.status=Married-civ-spouse ^ education=Bachelors ^ occupation=Exec-managerial ^ hours.per.week=40.0-50.0 ^ workclass=Private ^ age=50.0-58.0] V
[marital.status=Married-civ-spouse ^ education=Bachelors ^ occupation=Exec-managerial ^ age=45.0-50.0 ^ hours.per.week=35.0-40.0 ^ workclass=State-gov] V
[marital.status=Married-civ-spouse ^ education=Bachelors ^ occupation=Exec-managerial ^ age=45.0-50.0] V
[marital.status=Married-civ-spouse ^ education.num=>13.0 ^ occupation=Prof-specialty ^ education=Prof-school ^ workclass=Self-emp-inc ^ hours.per.week=>55.0] V
[marital.status=Married-civ-spouse ^ education.num=>13.0 ^ occupation=Prof-specialty ^ capital.loss=1742.4-2178

GREW FINAL RULES
[[marital.status=Married-civ-spouse ^ education.num=>13.0 ^ occupation=Exec-managerial ^ hours.per.week=40.0-50.0 ^ native.country=United-States] V
[marital.status=Married-civ-spouse ^ education.num=11.0-13.0 ^ occupation=Exec-managerial ^ hours.per.week=40.0-50.0 ^ capital.gain=9999.9-19999.8] V
[marital.status=Married-civ-spouse ^ education=Bachelors ^ occupation=Exec-managerial ^ hours.per.week=40.0-50.0 ^ workclass=Private ^ age=50.0-58.0] V
[marital.status=Married-civ-spouse ^ education=Bachelors ^ occupation=Exec-managerial ^ age=45.0-50.0 ^ hours.per.week=35.0-40.0 ^ workclass=State-gov] V
[marital.status=Married-civ-spouse ^ education=Bachelors ^ occupation=Exec-managerial ^ age=45.0-50.0] V
[marital.status=Married-civ-spouse ^ education.num=>13.0 ^ occupation=Prof-specialty ^ education=Prof-school ^ workclass=Self-emp-inc ^ hours.per.week=>55.0] V
[marital.status=Married-civ-spouse ^ education.num=>13.0 ^ occupation=Prof-specialty ^ capital.loss=1742.4-2178.0]

FINAL RULESET:
[[marital.status=Married-civ-spouse ^ education.num=>13.0 ^ occupation=Exec-managerial ^ hours.per.week=40.0-50.0 ^ native.country=United-States] V
[marital.status=Married-civ-spouse ^ education.num=11.0-13.0 ^ occupation=Exec-managerial ^ hours.per.week=40.0-50.0 ^ capital.gain=9999.9-19999.8] V
[marital.status=Married-civ-spouse ^ education=Bachelors ^ occupation=Exec-managerial ^ hours.per.week=40.0-50.0 ^ workclass=Private ^ age=50.0-58.0] V
[marital.status=Married-civ-spouse ^ education=Bachelors ^ occupation=Exec-managerial ^ age=45.0-50.0] V
[marital.status=Married-civ-spouse ^ education.num=>13.0 ^ occupation=Prof-specialty ^ education=Prof-school ^ workclass=Self-emp-inc ^ hours.per.week=>55.0] V
[marital.status=Married-civ-spouse ^ education.num=>13.0 ^ occupation=Prof-specialty ^ capital.loss=1742.4-2178.0] V
[marital.status=Married-civ-spouse ^ education.num=11.0-13.0 ^ hours.per.week=40.0-50.0 ^ age=37.0-41.0 ^ workclass=Private ^ education=Bachelors ^ race=

  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.

  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))
  covered = covered.append(rule.covers(df))


<Ruleset [[marital.status=Married-civ-spouse^education.num=>13.0^occupation=Exec-managerial^hours.per.week=40.0-50.0^native.country=United-States] V [marital.status=Married-civ-spouse^education.num=11.0-13.0^occupation=Exec-managerial^hours.per.week=40.0-50.0^capital.gain=9999.9-19999.8] V [marital.status=Married-civ-spouse^education=Bachelors^occupation=Exec-managerial^hours.per.week=40.0-50.0^workclass=Private^age=50.0-58.0] V [marital.status=Married-civ-spouse^education=Bachelors^occupation=Exec-managerial^age=45.0-50.0] V [marital.status=Married-civ-spouse^education.num=>13.0^occupation=Prof-specialty^education=Prof-school^workclass=Self-emp-inc^hours.per.week=>55.0] V [marital.status=Married-civ-spouse^education.num=>13.0^occupation=Prof-specialty^capital.loss=1742.4-2178.0] V [marital.status=Married-civ-spouse^education.num=11.0-13.0^hours.per.week=40.0-50.0^age=37.0-41.0^workclass=Private^education=Bachelors^race=White^occupation=Exec-managerial] V [marital.status=Married-civ-sp

In [6]:
# Intentionally left without executable code: a stray `end` statement here
# raised NameError and stopped "Restart & Run All" before the examples below ran.

The fit method is flexible and can be called in various ways, including with X_train and y_train, or with numpy arrays.  

Unlike dataframes, arrays don't have feature names...

In [None]:
# Convert the training frame to a plain array. The class column ('income') is
# the last column, so everything before it is a feature.
arr = train.values
X_arr, y_arr = arr[:, :-1], arr[:, -1]

clf = lw.RIPPER(random_state=42)
# The positive label must match the data exactly: the classes are '>50K' and
# '<=50K' (there is no '>=50K' label).
clf.fit(X_arr, y_arr, pos_class='>50K', class_feat='income')
clf.ruleset_

But we can pass them in:

In [None]:
# Separate features from the class column by name so the feature names we pass
# line up one-to-one with the array columns.
named_features = train.drop('income', axis=1)
X_array, y_array = named_features.values, train['income'].values

# Use a fresh classifier so this cell does not depend on one created later on.
ripper_clf = lw.RIPPER(random_state=42)
ripper_clf.fit(X_array, y_array,
               pos_class='>50K', class_feat='income',
               feature_names=named_features.columns)
ripper_clf.ruleset_

We can force a simpler ruleset using max_rules, max_total_conds, or max_rule_conds.

In [None]:
# Cap the ruleset at two rules to force a simpler model.
ripper_clf = lw.RIPPER(max_rules=2, random_state=1)
# Class column and positive label match the loaded adult data.
ripper_clf.fit(train, class_feat='income', pos_class='>50K')
ripper_clf.ruleset_

Verbosity allows us to view training steps...

In [None]:
ripper_clf = lw.RIPPER(random_state=42, verbosity=1) # Scale of 0-5
# Class column and positive label match the loaded adult data.
ripper_clf.fit(train, class_feat='income', pos_class='>50K')
ripper_clf.ruleset_

#### Model selection

Some sklearn methods are supported. Cross-validation:

In [None]:
from sklearn.model_selection import cross_val_score

# Derive features and target from the training split so this cell works on a
# fresh kernel and gives the same result when re-run.
train_features = train.drop('income', axis=1)

# Dummify our data to make sklearn happy
X_train = pd.get_dummies(
    train_features, columns=train_features.select_dtypes('object').columns
)
# Encode the target as 1 for the positive class ('>50K') and 0 otherwise.
y_train = train['income'].map(lambda x: 1 if x == '>50K' else 0)

ripper_clf = lw.RIPPER(random_state=42)
cross_val_score(ripper_clf, X_train, y_train)


Grid-search:

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to search over.
param_grid = {
    "prune_size": [0.1, 0.25, 0.33, 0.5],
    "k": [1, 2],
}
grid = GridSearchCV(param_grid=param_grid, estimator=ripper_clf)
grid.fit(X_train, y_train)
# Show the best combination found by the search.
grid.best_params_

Ensemble:

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Base learners stacked alongside the RIPPER classifier.
nb = GaussianNB()
tree = DecisionTreeClassifier(random_state=42)
estimators = [
    ("rip", ripper_clf),
    ("tree", tree),
    ("nb", nb),
]

# A logistic regression combines the base learners' outputs.
ensemble_clf = StackingClassifier(
    final_estimator=LogisticRegression(),
    estimators=estimators,
)


#### Testing

How good is our model?

In [None]:
# Separate the held-out features from the 'income' class column.
X_test = test.drop('income', axis=1)
y_test = test['income']

# Refit on the training split using the class column and positive label that
# exist in the loaded adult data.
ripper_clf = lw.RIPPER(random_state=42)
ripper_clf.fit(train, class_feat='income', pos_class='>50K')
ripper_clf.score(X_test, y_test) # Default metric is accuracy

We can also score it on custom metrics, including sklearn's:

In [None]:
from sklearn.metrics import precision_score, recall_score

# Pass sklearn metric functions to score() in place of the default metric.
precision = ripper_clf.score(X_test, y_test, precision_score)
recall = ripper_clf.score(X_test, y_test, recall_score)

for metric_name, metric_value in (('precision', precision), ('recall', recall)):
    print(f'{metric_name}: {metric_value}')

#### Prediction

To make predictions, use the predict method.

In [None]:
# Predictions for the last ten rows of the test features.
ripper_clf.predict(X_test.tail(n=10))

For predicted probabilities, use predict_proba.

In [None]:
# Predicted probabilities for the last ten rows of the test features.
ripper_clf.predict_proba(X_test.tail(n=10))

We can also ask our model to give us the reasons for its predictions.

In [None]:
# Predictions plus reasons for the last five test rows (tail()'s default count).
ripper_clf.predict(X_test.tail(n=5), give_reasons=True)