In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.datasets import load_digits
from sklearn.datasets import load_boston

In [None]:
def to_label(data, target, percentile):
    frac = percentile / 100.0
    part_val = data[target].quantile(frac)
    data[target] = [1 if d > part_val else 0 for d in data[target]]
    return data

In [35]:
boston = load_boston()
data = pd.DataFrame(boston.data, columns=boston.feature_names)
label = 'HomeVal50'
data[label] = boston.target
to_label(data, label, 50)
data.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,HomeVal50
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,1
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,1
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,1
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,1
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,1


In [None]:
def group_classes(df):
    grouped = df.groupby(df.loc[:,label])
    classes = [k for k in grouped.groups.keys()]

    return classes

In [36]:
classes = group_classes(data)

## Load in data

In [None]:
np.unique(data[label])

In [None]:
t = np.array(data[label])
X = data.drop(label, axis=1)
N = X.shape[0]
D = X.shape[1]
print(N,D)

In [37]:
def train_test_split(data, label, test_ratio=0.2):

    test_idx = []
    indices = [i for i in range(data.shape[0])]

    test_size = test_ratio * len(data)
    while len(test_idx) < test_size:
        test_idx.append(random.randrange(len(indices)))

    train_idx = [i for i in indices if i not in test_idx]

    test = data.iloc[test_idx]
    train = data.iloc[train_idx]
    
    y_train = train[label]
    X_train = train.drop(label,axis=1)

    y_test = test[label]
    X_test = test.drop(label,axis=1)

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = train_test_split(data, label=label)

In [38]:
N = X_train.shape[0]
D = X_train.shape[1]
print(N,D)

415 13


In [39]:
classes = np.unique(y_train)

In [42]:
t_one_hot = np.zeros((N,len(classes)))
t_one_hot[np.arange(N), y_train] = 1
t_one_hot.shape

(415, 2)

## Logistic Regression
### 2-class

In [None]:
w = 0.001* np.random.randn(D+1)
a = np.dot(phi,w)

In [None]:
a.shape

In [None]:
def sigmoid(a):
    return 1/(1 + np.exp(-a))

In [None]:
sigmoid(a)

### Multi-class

In [43]:
def softmax(a):
    e = np.exp(a - np.max(a, axis=1).reshape((-1,1)))
    e_total = np.sum(e, axis=1).reshape((-1,1))
    return e / e_total

In [50]:
W = 0.001 * np.random.random((D, len(classes)))
def IRLS(X, W, t_one_hot):
    for i in range(50):
        a = np.dot(X, W)
        p = softmax(a)
        E = - np.sum(t_one_hot * np.log(p + 1e-6))
        grad = np.dot(X.T, (p-t_one_hot))
        R = np.diag(p[:,0]*p[:,1])
        z = a - np.dot(np.linalg.pinv(R),(p-t_one_hot))
        H = np.dot(X.T,R).dot(X)
        W = np.linalg.pinv(H).dot(X.T).dot(R).dot(z)
        new_p = softmax(np.dot(X, W))
        new_E = - np.sum(t_one_hot * np.log(new_p + 1e-6))
        
        if E - new_E < 1e-10:
            return W
            break

In [52]:
new_W = IRLS(X_train, W, t_one_hot)

In [78]:
def predict(new_X, new_W):
    pred = softmax(np.dot(new_X, new_W))
    return np.argmax(pred, axis=1)
        
y_pred = predict(X_test, new_W)

In [83]:
len(list(y_pred))

102

In [96]:
train_idx = 10/100 * X_train.shape[0]
train_idx

41.5

In [98]:
X_new = X_train.iloc[:int(np.floor(train_idx)),:]

In [99]:
X_new

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1
11,0.11747,12.5,7.87,0.0,0.524,6.009,82.9,6.2267,5.0,311.0,15.2,396.9,13.27
