In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import wittgenstein as lw  # RIPPER implementation

In [4]:
# Step 1: Data Generation
# -------------------------
np.random.seed(42)
n_samples = 100
n_features = 5

# Generate random features and a binary target
X = np.random.rand(n_samples, n_features)
y = np.random.choice([0, 1], size=n_samples)

# Create a DataFrame for ease of use
feature_names = [f'feature_{i}' for i in range(1, n_features+1)]
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

print("=== Initial Data ===")
print(df.head())

=== Initial Data ===
   feature_1  feature_2  feature_3  feature_4  feature_5  target
0   0.374540   0.950714   0.731994   0.598658   0.156019       1
1   0.155995   0.058084   0.866176   0.601115   0.708073       0
2   0.020584   0.969910   0.832443   0.212339   0.181825       0
3   0.183405   0.304242   0.524756   0.431945   0.291229       0
4   0.611853   0.139494   0.292145   0.366362   0.456070       0


In [5]:
# Step 2: Preprocessing
# -------------------------
# For demonstration, we simply standardize the features.
scaler = StandardScaler()
df[feature_names] = scaler.fit_transform(df[feature_names])

print("\n=== Preprocessed Data ===")
print(df.head())


=== Preprocessed Data ===
   feature_1  feature_2  feature_3  feature_4  feature_5  target
0  -0.439808   1.425870   0.832798   0.265310  -0.971652       1
1  -1.178208  -1.565503   1.290222   0.273730   0.828999       0
2  -1.635718   1.490198   1.175225  -1.058834  -0.887479       0
3  -1.085598  -0.740579   0.126331  -0.306115  -0.530632       0
4   0.362002  -1.292682  -0.666636  -0.530907   0.007034       0


In [6]:
# Step 3: Train-Test Split
# -------------------------
X = df[feature_names].values
y = df['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\n=== Train-Test Split ===")
print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")


=== Train-Test Split ===
Training samples: 80, Testing samples: 20


In [12]:
# Step 4: Train RIPPER Model
# -------------------------
# Convert training data back to DataFrame for RIPPER implementation
train_df = pd.DataFrame(X_train, columns=feature_names)
train_df['target'] = y_train  # Add the target column to the DataFrame

ripper = lw.RIPPER()
ripper.fit(train_df, class_feat='target')  # Just pass the class_feat

print("\n=== Trained RIPPER Model Rules ===")
print(ripper.ruleset_)


=== Trained RIPPER Model Rules ===
[[feature_3=-0.71--0.36] V [feature_2=0.36-0.65]]


In [14]:
# Step 5: Optimize Rule Set
# -------------------------
# Dummy optimization: Remove rules that are too "simple" (for demonstration purposes)
def optimize_ruleset(ruleset):
    # Convert each rule to string and check if it has more than 5 tokens (words)
    optimized_rules = [str(rule) for rule in ruleset if len(str(rule).split()) > 5]
    return optimized_rules

optimized_ruleset = optimize_ruleset(ripper.ruleset_)
print("\n=== Optimized Rule Set ===")
print(optimized_ruleset)


=== Optimized Rule Set ===
[]


In [15]:
# Step 6: Compute Predictions
# -------------------------
# Predict on the test set using the original (unoptimized) model for now
y_pred = ripper.predict(pd.DataFrame(X_test, columns=feature_names))
print("\n=== Predictions on Test Data ===")
print(y_pred)


=== Predictions on Test Data ===
[True, True, False, False, False, False, False, False, False, False, True, False, False, False, False, True, True, False, False, True]


In [16]:
# Step 7: Evaluate the Model
# -------------------------
accuracy = accuracy_score(y_test, y_pred)
print("\n=== Evaluation: Accuracy Score ===")
print(f"Accuracy: {accuracy:.2f}")


=== Evaluation: Accuracy Score ===
Accuracy: 0.60


In [18]:
# Step 8: Predict New Data
# -------------------------
# Generate new random data (make sure to use the same number of features)
new_data = np.random.rand(5, n_features)  # Random values for new data

# Preprocess new data with the same scaler, ensuring it has feature names
new_data_df = pd.DataFrame(new_data, columns=feature_names)  # Add feature names to new data
new_data_scaled = scaler.transform(new_data_df)  # Scale the new data using the same scaler

# Predict with the trained RIPPER model
new_predictions = ripper.predict(pd.DataFrame(new_data_scaled, columns=feature_names))

print("\n=== Predictions on New Data ===")
print(new_predictions)


=== Predictions on New Data ===
[False, False, False, True, True]
