In the [last chapter](../03_oner/oner.ipynb) we implemented the OneR [@Holte1993] algorithm. In this chapter we'll improve the model without improving the model. Sounds strange, but bear with me.

In [2]:
import numpy as np
# `a.T` is the transpose of `a`, so iterating over it visits the columns
# of `a` one at a time.
a = np.array([[1, 2], [3, 4]])
for x in a.T:
    print(x)

[1 3]
[2 4]


In [38]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.dummy import DummyClassifier

#class BagOfLetters(TransformerMixin, BaseEstimator):
    

class OneR(ClassifierMixin, BaseEstimator):
    """One-rule classifier that predicts from a single feature column.

    For every column of the training data, `fit` trains one
    `DummyClassifier` per distinct value of that column and keeps the
    column whose per-value predictors score best on the training data.

    Attributes set by `fit`:

    - `fallback_`: `DummyClassifier` fitted on all of the training data;
      used for values that do not appear in `predictors_`.
    - `predictors_`: dict mapping each value of the chosen column to the
      `DummyClassifier` fitted on the rows that hold that value.
    - `i_`: index of the chosen column.
    """

    def fit(self, X, y):
        """Train the model with inputs `X` on labels `y`.

        Each column of `X` is tried in turn and the one with the highest
        `self.score(X, y)` is kept. Returns `self`.

        Raises `ValueError` when no column could be selected, for example
        when `X` has no feature columns.
        """
        best_predictors = None
        best_i = None
        best_score = float('-inf')
        # Fallback for categories that are missing from `predictors_`.
        self.fallback_ = DummyClassifier().fit(X, y)
        X, y = np.array(X), np.array(y)
        for i, x in enumerate(X.T):
            # One predictor per distinct value of this column, trained only
            # on the rows that hold that value.
            predictors = {}
            for x_val in np.unique(x):
                is_x = x == x_val
                predictors[x_val] = DummyClassifier().fit(x[is_x], y[is_x])

            # `score` goes through `predict`, which reads `predictors_` and
            # `i_`, so install this candidate before scoring it.
            self.predictors_ = predictors
            self.i_ = i
            score = self.score(X, y)
            if score > best_score:
                best_predictors = predictors
                best_i = i
                best_score = score

        if best_i is None:
            # Without this check the estimator would be left with
            # `predictors_ = None` and only fail later, inside `predict`.
            raise ValueError(
                "OneR could not select a feature: X needs at least one "
                "sample and one feature column."
            )

        self.predictors_ = best_predictors
        self.i_ = best_i

        return self

    def predict(self, X):
        """Predict the labels for inputs `X`.

        Only column `self.i_` of `X` is consulted. Returns a numpy array
        with one predicted label per row of `X`.
        """
        X = np.array(X)
        # Each predictor is a default `DummyClassifier`, whose prediction
        # does not depend on its input, so the label is computed once per
        # distinct value instead of once per row.
        label_for = {}
        rv = []
        for x in X[:, self.i_]:
            if x not in label_for:
                # Use the fallback when the category isn't
                # in `self.predictors_`.
                model = self.predictors_.get(x, self.fallback_)
                label_for[x] = model.predict([x])[0]
            rv.append(label_for[x])
        return np.array(rv)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus of four short sentences for trying out the vectorizer.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

from functools import partial

class BagOfLetters(CountVectorizer):
    """`CountVectorizer` variant whose tokenizer is the built-in `list`.

    NOTE(review): the cells visible here never instantiate this class; they
    build a plain `CountVectorizer(analyzer="char")` instead. Confirm whether
    it is still needed.
    """

    def build_tokenizer(self):
        """Return `list`, so tokenizing a string yields one token per character."""
        return list
# Count individual characters; `lowercase=False` keeps upper- and lower-case
# letters as separate features.
vectorizer = CountVectorizer(analyzer="char", lowercase=False)
X = vectorizer.fit_transform(corpus)
# First the distinct characters (one per column), then one row of counts
# per document.
print(vectorizer.get_feature_names_out())
print(X.toarray())
#vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
#X2 = vectorizer2.fit_transform(corpus)
#vectorizer2.get_feature_names_out()

[' ' '.' '?' 'A' 'I' 'T' 'c' 'd' 'e' 'f' 'h' 'i' 'm' 'n' 'o' 'r' 's' 't'
 'u']
[[4 1 0 0 0 1 1 1 2 1 2 3 1 1 1 1 3 3 1]
 [5 1 0 0 0 1 3 3 4 0 2 2 2 3 3 0 3 3 2]
 [5 1 0 1 0 0 0 2 2 0 3 3 0 2 1 1 2 3 0]
 [4 0 1 0 1 0 1 1 2 1 2 2 1 1 1 1 3 4 1]]


In [17]:
from nlpbook import get_train_test_data

# NOTE(review): `get_train_test_data` comes from the project's `nlpbook`
# package; how it loads and splits the data is not visible here.
train_df, test_df = get_train_test_data()
# Show which columns the training frame has.
train_df.columns

Index(['id', 'movie_id', 'rating', 'review', 'label'], dtype='object')

In [27]:
# Character-level counts (case preserved) for the training reviews.
vectorizer = CountVectorizer(analyzer="char", lowercase=False)
X_train = vectorizer.fit_transform(train_df["review"])
# Displaying the matrix shows its sparse format and shape.
X_train

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1180459 stored elements and shape (24904, 178)>

In [28]:
# Dense view of the character-count matrix.
X_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [30]:
# The characters that make up the columns of `X_train`.
vectorizer.get_feature_names_out()

array(['\x08', '\t', '\x10', ' ', '!', '"', '#', '$', '%', '&', "'", '(',
       ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5',
       '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B',
       'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
       'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\',
       ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
       'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
       'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x80', '\x84', '\x85',
       '\x8d', '\x8e', '\x91', '\x95', '\x96', '\x97', '\x9a', '\x9e',
       '\xa0', '¡', '¢', '£', '¤', '¦', '§', '¨', '«', '\xad', '®', '°',
       '³', '´', '·', 'º', '»', '½', '¾', '¿', 'À', 'Á', 'Ã', 'Ä', 'Å',
       'È', 'É', 'Ê', 'Õ', 'Ø', 'Ü', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å',
       'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò',
       'ó', 'ô', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ō', '–', 

In [39]:
# Fit OneR on the dense character counts and the training labels.
oner = OneR().fit(X_train.toarray(), train_df["label"])

In [41]:
# Score on the test reviews, vectorized with the already-fitted vectorizer
# (`transform`, not `fit_transform`).
oner.score(vectorizer.transform(test_df["review"]).toarray(), test_df["label"])

0.5812817904374364

In [42]:
# Index of the feature column that OneR selected.
oner.i_

34

In [43]:
# The character behind the selected column.
vectorizer.get_feature_names_out()[oner.i_]

'?'