In [226]:
import pandas as pd
import numpy as np

In [227]:
df = pd.read_csv('resources/dataset_train.csv', index_col=0)
df.head()

Unnamed: 0_level_0,Hogwarts House,First Name,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,Ravenclaw,Tamara,Hsu,2000-03-30,Left,58384.0,-487.886086,5.72718,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89
1,Slytherin,Erich,Paredes,1999-10-14,Right,67239.0,-552.060507,-5.987446,5.520605,-5.612,-487.340557,367.760303,4.10717,1058.944592,7.248742,0.091674,-252.18425,-113.45
2,Ravenclaw,Stephany,Braun,1999-11-03,Left,23702.0,-366.076117,7.725017,3.660761,6.14,664.893521,602.585284,3.555579,1088.088348,8.728531,-0.515327,-227.34265,30.42
3,Gryffindor,Vesta,Mcmichael,2000-08-19,Left,32667.0,697.742809,-6.497214,-6.977428,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-0.01404,-256.84675,200.64
4,Gryffindor,Gaston,Gibbs,1998-09-27,Left,60158.0,436.775204,-7.820623,,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-0.26407,-256.3873,157.98


In [228]:
target = 'Hogwarts House'

courses = ['Arithmancy',
            'Astronomy',
            'Herbology',
            'Defense Against the Dark Arts',
            'Divination',
            'Muggle Studies',
            'Ancient Runes',
            'History of Magic',
            'Transfiguration',
            'Potions',
            'Care of Magical Creatures',
            'Charms',
            'Flying']

# courses = ['Herbology', 'Defense Against the Dark Arts',
#            'Ancient Runes', 'Charms']

X = df[courses].values
y = df[target]
X.shape, y.shape

((1600, 13), (1600,))

In [229]:
def impute(X, strategy='mean'):
    imp_vec = None
    if strategy == 'mean':
        imp_vec = np.nanmean(X, axis=0)
    else:
        raise ValueError(f'Unrecognized strategy: {strategy}')
    
    
    for i in range(X.shape[1]):
        Xi = X[:, i]
        Xi = np.where(np.isnan(Xi), imp_vec[i], Xi)
        X[:, i] = Xi

    return X, imp_vec

In [237]:
X, imp = impute(X)

In [238]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [239]:
def log_loss(y_true, y_hat):
    return -(y_true * np.log(y_hat) + (1 - y_true) * np.log(1 - y_hat)).mean()

In [240]:
def normalize(X):
    X_mean = X.mean(axis=0)
    X_std = X.std(axis=0)
    return (X - X_mean) / X_std, X_mean, X_std

In [241]:
Xv, X_mean, X_std = normalize(X)

In [242]:
def softmax(X):
    return np.exp(X) / np.exp(X).sum(axis=1).reshape(-1, 1)

In [243]:
class BinaryLR:
    def __init__(self, w, b):
        self.w = w
        self.b = b
    
    def predict(self, X):
        return sigmoid(X @ self.w + self.b) > 0.5

    def predict_proba(self, X):
        return sigmoid(X @ self.w + self.b)

class LogisticRegression:
    def __init__(self, lr=.05, verbose=False, seed=None):
        self.lr = lr
        self.verbose = verbose
        self.seed = seed
        self.classifiers = []
    
    def fit(self, X, y, n_epochs=3, batch_size=-1):
        self.unique_targets = np.unique(y)
        
        self.classifiers = []
        if self.verbose:
            print(f"# of targets: {len(self.unique_targets)}")
        if 'values' in dir(X):
            X = X.values
        n, m = X.shape
        batch_size = n if batch_size == -1 else batch_size
        n_batches = int(n // batch_size)
        if batch_size < n:
            n_batches += 1
        if self.verbose:
            print("n_batches:", n_batches)
        for i_clf in range(len(self.unique_targets)):
            if self.verbose:
                print(f"Training classifier #{i_clf+1}")
            target = np.where(y == self.unique_targets[i_clf], 1, 0)
            w, b = self._initializer(m)
            for epoch in range(n_epochs):
                losses = []
                for it in range(n_batches):
                    if it < n_batches - 1:
                        Xb = X[it * batch_size:(it + 1) * batch_size, :]
                        yb = target[it * batch_size:(it + 1) * batch_size]
                    else:
                        Xb = X[it * batch_size:]
                        yb = target[it * batch_size:]
                    if len(Xb) == 0:
                        continue
                    yh = sigmoid(Xb @ w + b)
                    if len(yh.shape) > 1:
                        yh.ravel()
                    losses.append(log_loss(yb, yh))
                    err = (yh - yb).reshape(-1, 1)
                    dw = (err * Xb).mean(axis=0)
                    db = err.mean(axis=0)
                    
                    w -= self.lr * dw
                    b -= self.lr * db
                if self.verbose:
                    print(f"[{epoch}/{n_epochs}]: mean loss: {np.mean(losses)}")
            self.classifiers.append(BinaryLR(w, b))
    
    def predict_proba(self, X):
        preds = np.zeros((len(X), len(self.classifiers)))
        for i, clf in enumerate(self.classifiers):
            y_pred = clf.predict_proba(X)
            preds[:, i] = y_pred
        return softmax(preds)
    
    def predict(self, X):
        return np.array([self.unique_targets[c] for c in np.argmax(self.predict_proba(X), axis=1)])
            
    def _initializer(self, n_features, scale=.01):
        if self.seed is not None:
            np.random.seed(self.seed)

        w = np.random.randn(n_features) * scale
        b = np.zeros(1)
        return w, b

In [244]:
lr = LogisticRegression(verbose=True, seed=0)
lr.fit(Xv, y, batch_size=32, n_epochs=10)
y_pred = lr.predict(Xv)

# of targets: 4
n_batches: 51
Training classifier #1
[0/10]: mean loss: 0.3608074486531433
[1/10]: mean loss: 0.17527003332580687
[2/10]: mean loss: 0.12643213288725108
[3/10]: mean loss: 0.10350772283925629
[4/10]: mean loss: 0.09028343305125486
[5/10]: mean loss: 0.0817180271380016
[6/10]: mean loss: 0.07573678587924175
[7/10]: mean loss: 0.0713321921668521
[8/10]: mean loss: 0.06795735536060182
[9/10]: mean loss: 0.0652905105266782
Training classifier #2
[0/10]: mean loss: 0.40321804835024816
[1/10]: mean loss: 0.1972439982971828
[2/10]: mean loss: 0.14130869185481992
[3/10]: mean loss: 0.11595111453037756
[4/10]: mean loss: 0.10166330328396944
[5/10]: mean loss: 0.09257815346785848
[6/10]: mean loss: 0.08633614849262491
[7/10]: mean loss: 0.0818095466743199
[8/10]: mean loss: 0.07839325232626707
[9/10]: mean loss: 0.07573439241329587
Training classifier #3
[0/10]: mean loss: 0.35629401500579455
[1/10]: mean loss: 0.17436280857234324
[2/10]: mean loss: 0.130849898362342
[3/10]: mean

In [245]:
def accuracy(y_true, y_pred):
    return ((y_true.values == y_pred)).sum() / len(y_true)

In [246]:
accuracy(y, y_pred)

0.981875

In [156]:
test_df = pd.read_csv('resources/dataset_test.csv', index_col=0)
Xv_test = (test_df[courses] - X_mean) / X_std

In [163]:
np.isnan(Xv_test).sum()

Herbology                        11
Defense Against the Dark Arts     8
Ancient Runes                     8
Charms                            0
dtype: int64

In [161]:
lr.predict(Xv_test)

array(['Hufflepuff', 'Ravenclaw', 'Gryffindor', 'Gryffindor',
       'Hufflepuff', 'Slytherin', 'Ravenclaw', 'Hufflepuff', 'Ravenclaw',
       'Hufflepuff', 'Hufflepuff', 'Slytherin', 'Slytherin', 'Slytherin',
       'Slytherin', 'Gryffindor', 'Ravenclaw', 'Slytherin', 'Ravenclaw',
       'Hufflepuff', 'Hufflepuff', 'Slytherin', 'Ravenclaw', 'Ravenclaw',
       'Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Gryffindor',
       'Gryffindor', 'Ravenclaw', 'Hufflepuff', 'Hufflepuff',
       'Gryffindor', 'Hufflepuff', 'Hufflepuff', 'Ravenclaw',
       'Gryffindor', 'Ravenclaw', 'Slytherin', 'Ravenclaw', 'Ravenclaw',
       'Slytherin', 'Gryffindor', 'Slytherin', 'Hufflepuff', 'Ravenclaw',
       'Slytherin', 'Gryffindor', 'Ravenclaw', 'Slytherin', 'Ravenclaw',
       'Hufflepuff', 'Ravenclaw', 'Ravenclaw', 'Hufflepuff', 'Slytherin',
       'Slytherin', 'Ravenclaw', 'Hufflepuff', 'Gryffindor', 'Gryffindor',
       'Gryffindor', 'Hufflepuff', 'Hufflepuff', 'Ravenclaw',
       'Hufflepuff', 'Huff