In [1]:
import numpy as np
import urllib.request
import os
import tarfile
import pickle
from sklearn.datasets import fetch_openml

In [2]:
def get_mnist():
    mnist = fetch_openml('mnist_784', data_home=".")

    x = mnist.data
    y = mnist.target
    # reshape to (#data, #channel, width, height)
    x = np.reshape(x, (x.shape[0], 1, 28, 28)) / 255.
    x_tr = np.asarray(x[:60000], dtype=np.float32)
    y_tr = np.asarray(y[:60000], dtype=np.int32)
    x_te = np.asarray(x[60000:], dtype=np.float32)
    y_te = np.asarray(y[60000:], dtype=np.int32)
    return (x_tr, y_tr), (x_te, y_te)

In [3]:
mnist = fetch_openml('mnist_784', data_home=".")

x = mnist.data
y = mnist.target

In [4]:
x.shape

(70000, 784)

In [5]:
x.shape

(70000, 784)

In [6]:
x = x/255.

In [7]:
x_tr = np.asarray(x[:60000], dtype=np.float32)
y_tr = np.asarray(y[:60000], dtype=np.int32)
x_te = np.asarray(x[60000:], dtype=np.float32)
y_te = np.asarray(y[60000:], dtype=np.int32)

In [8]:
x_tr.shape, y_tr.shape, x_te.shape, y_te.shape

((60000, 784), (60000,), (10000, 784), (10000,))

In [9]:
def binarize_mnist_class(y_train, y_test):
    y_train_bin = np.ones(len(y_train), dtype=np.int32)
    y_train_bin[y_train % 2 == 1] = -1
    y_test_bin = np.ones(len(y_test), dtype=np.int32)
    y_test_bin[y_test % 2 == 1] = -1
    return y_train_bin, y_test_bin

In [10]:
y_train, y_test = binarize_mnist_class(y_tr, y_te)

In [11]:
x_train, x_test = x_tr.copy(), x_te.copy()

In [73]:
x_train, x_test

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32))

In [74]:
y_train, y_test

(array([-1,  1,  1, ..., -1,  1,  1], dtype=int32),
 array([-1,  1, -1, ...,  1, -1,  1], dtype=int32))

In [12]:
np.unique(y_tr)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)

In [13]:
np.unique(y_train, return_counts=True)

(array([-1,  1], dtype=int32), array([30508, 29492]))

In [14]:
x = x_train.copy()
y = y_train.copy()

In [15]:
labels = np.unique(y)
labels

array([-1,  1], dtype=int32)

In [16]:
positive, negative = labels[1], labels[0]

In [17]:
positive, negative

(1, -1)

In [18]:
x, y = np.asarray(x, dtype=np.float32), np.asarray(y, dtype=np.int32)
assert(len(x) == len(y))

In [19]:
x.shape, y.shape

((60000, 784), (60000,))

In [20]:
# Random permutation of np.arange(len(y))
perm = np.random.permutation(len(y))

# Randomly shuffle x and y
x, y = x[perm], y[perm]

In [21]:
n_p = (y == positive).sum()
n_n = (y == negative).sum()

n_p, n_n

(29492, 30508)

In [22]:
labeled = 100
unlabeled = 59900

n_lp = labeled
n_u = unlabeled

In [23]:
if labeled + unlabeled == len(x):
    n_up = n_p - n_lp
elif unlabeled == len(x):
    n_up = n_p
else:
    raise ValueError("Only support |P|+|U|=|X| or |U|=|X|.")

In [24]:
n_up, n_p

(29392, 29492)

In [25]:
_prior = float(n_up) / float(n_u)

In [26]:
_prior

0.4906844741235392

In [27]:
xlp = x[y == positive][:n_lp]

In [28]:
xup = np.concatenate((x[y == positive][n_lp:], xlp), axis=0)[:n_up]

In [29]:
xun = x[y == negative]
xun.shape

(30508, 784)

In [30]:
x = np.asarray(np.concatenate((xlp, xup, xun), axis=0), dtype=np.float32)
x.shape

(60000, 784)

In [31]:
y = np.asarray(np.concatenate((np.ones(n_lp), -np.ones(n_u))), dtype=np.int32)
perm = np.random.permutation(len(y))
x, y = x[perm], y[perm]

In [32]:
x.shape, y.shape

((60000, 784), (60000,))

In [33]:
np.unique(y, return_counts=True)

(array([-1,  1], dtype=int32), array([59900,   100]))

In [34]:
x.max()

1.0

In [71]:
n_labeled = 100
n_unlabeled = 59900

In [72]:
def make_pu_dataset_from_binary_dataset(x, y, labeled=n_labeled, unlabeled=n_unlabeled):
    labels = np.unique(y)
    positive, negative = labels[1], labels[0]
    
    x, y = np.asarray(x, dtype=np.float32), np.asarray(y, dtype=np.int32)
    assert(len(x) == len(y))
    
    # Random permutation of np.arange(len(y))
    perm = np.random.permutation(len(y))
    
    # Randomly shuffle x and y
    x, y = x[perm], y[perm]
    
    n_p = (y == positive).sum()
    n_lp = labeled
    n_n = (y == negative).sum()
    n_u = unlabeled
    
    if labeled + unlabeled == len(x):
        n_up = n_p - n_lp
    elif unlabeled == len(x):
        n_up = n_p
    else:
        raise ValueError("Only support |P|+|U|=|X| or |U|=|X|.")
        
    _prior = float(n_up) / float(n_u)
    
    xlp = x[y == positive][:n_lp]
    xup = np.concatenate((x[y == positive][n_lp:], xlp), axis=0)[:n_up]
    xun = x[y == negative]
    x = np.asarray(np.concatenate((xlp, xup, xun), axis=0), dtype=np.float32)
    print(x.shape)
    y = np.asarray(np.concatenate((np.ones(n_lp), -np.ones(n_u))), dtype=np.int32)
    perm = np.random.permutation(len(y))
    x, y = x[perm], y[perm]
    return x, y, _prior


In [75]:
x_train, y_train, prior = make_pu_dataset_from_binary_dataset(x_train, y_train)

(60000, 784)


In [77]:
np.unique(y_train, return_counts=True)

(array([-1,  1], dtype=int32), array([59900,   100]))

In [None]:
def make_pn_dataset_from_binary_dataset(x, y):
        labels = np.unique(y)
        positive, negative = labels[1], labels[0]
        X, Y = np.asarray(x, dtype=np.float32), np.asarray(y, dtype=np.int32)
        n_p = (Y == positive).sum()
        n_n = (Y == negative).sum()
        Xp = X[Y == positive][:n_p]
        Xn = X[Y == negative][:n_n]
        X = np.asarray(np.concatenate((Xp, Xn)), dtype=np.float32)
        Y = np.asarray(np.concatenate((np.ones(n_p), -np.ones(n_n))), dtype=np.int32)
        perm = np.random.permutation(len(Y))
        X, Y = X[perm], Y[perm]
        return X, Y

In [None]:
x_test, y_test = make_pn_dataset_from_binary_dataset(x_test, y_test)

In [None]:
x_test.shape

In [None]:
np.unique(y_test, return_counts=True)

In [36]:
import pandas as pd

In [106]:
def read_riboseq_data(prior=0.05):
    df = pd.read_csv("../data/final_ckpt_with_labels.csv")
    
    df['is_start_position'] = df['is_start_position'].replace({True: 1, False: -1})
    
    position = df['position'].values
    gene = df['gene'].values
    
    y_orig = df['is_start_position'].values
    
    n_fake_pos = 100
    fake_pos = np.random.permutation(df[df['is_start_position'] == 1]['position'])[:n_fake_pos]
    
    df['fake_label'] = df['is_start_position'].copy()
    df.loc[df['position'].isin(fake_pos), 'fake_label'] = -1
    
    X_train = df.drop(['Unnamed: 0', 'position', 'gene', 'is_start_position', 'fake_label'], axis=1).values
    y_train = df['fake_label'].values
    
    X_test = df[df['fake_label'] == -1].drop(
        ['Unnamed: 0', 'position', 'gene', 'is_start_position', 'fake_label'], axis=1
    ).values
    
    y_test = df[df['fake_label'] == -1]['is_start_position'].values
    
    X_train, y_train = np.asarray(X_train, dtype=np.float32), np.asarray(y_train, dtype=np.int32)
    X_test, y_test = np.asarray(X_test, dtype=np.float32), np.asarray(y_test, dtype=np.int32)
    
    X_train = X_train.reshape((-1, 1, 31, 21))
    X_test = X_test.reshape((-1, 1, 31, 21))
    
    return X_train, X_test, y_train, y_test, prior, fake_pos, gene

In [107]:
X_train, X_test, y_train, y_test, prior, fake_pos, gene = read_riboseq_data()

In [109]:
X_train.shape

(115313, 651)

In [110]:
31*21

651

In [114]:
np.unique(y_test, return_counts=True)

(array([-1,  1], dtype=int32), array([113965,    101]))

In [112]:
.shape

(115313, 1, 31, 21)