In [2]:
import numpy as np
import urllib.request
import os
import tarfile
import pickle
from sklearn.datasets import fetch_openml

In [9]:
def get_mnist():
    """Download MNIST from OpenML and return a positional train/test split.

    Pixel values are divided by 255 and every sample is reshaped to
    ``(1, 28, 28)``. The first 60000 rows form the training set and the
    remaining rows the test set.

    Returns:
        tuple: ``((x_tr, y_tr), (x_te, y_te))`` where the ``x_*`` arrays are
        float32 with shape ``(n, 1, 28, 28)`` and the ``y_*`` arrays are int32
        with shape ``(n,)``.
    """
    mnist = fetch_openml('mnist_784', data_home=".")

    # Depending on the scikit-learn version, fetch_openml may hand back pandas
    # objects rather than ndarrays. Convert explicitly so the 4-D reshape and
    # the positional slices below behave the same either way.
    x = np.asarray(mnist.data)
    y = np.asarray(mnist.target)

    # reshape to (#data, #channel, width, height)
    x = np.reshape(x, (x.shape[0], 1, 28, 28)) / 255.
    x_tr = np.asarray(x[:60000], dtype=np.float32)
    y_tr = np.asarray(y[:60000], dtype=np.int32)
    x_te = np.asarray(x[60000:], dtype=np.float32)
    y_te = np.asarray(y[60000:], dtype=np.int32)
    return (x_tr, y_tr), (x_te, y_te)

In [11]:
# Fetch the 'mnist_784' dataset from OpenML; data_home="." points
# scikit-learn's dataset cache at the current working directory.
mnist = fetch_openml('mnist_784', data_home=".")

# Feature matrix (one flattened image per row) and the digit labels.
x = mnist.data
y = mnist.target

In [14]:
x.shape

(70000, 784)

In [16]:
x.shape

(70000, 784)

In [18]:
x = x/255.

In [20]:
# Positional split: the first 60000 rows become the training set and the
# remaining rows the test set. Features are cast to float32, labels to int32.
x_tr = np.asarray(x[:60000], dtype=np.float32)
y_tr = np.asarray(y[:60000], dtype=np.int32)
x_te = np.asarray(x[60000:], dtype=np.float32)
y_te = np.asarray(y[60000:], dtype=np.int32)

In [21]:
x_tr.shape, y_tr.shape, x_te.shape, y_te.shape

((60000, 784), (60000,), (10000, 784), (10000,))

In [22]:
def binarize_mnist_class(y_train, y_test):
    """Collapse labels into two classes according to their parity.

    Labels whose remainder modulo 2 equals 1 are mapped to -1; every other
    label is mapped to +1.

    Args:
        y_train: 1-D integer array of training labels.
        y_test: 1-D integer array of test labels.

    Returns:
        tuple: ``(y_train_bin, y_test_bin)``, two int32 arrays holding one
        +1/-1 entry per input label.
    """
    def _parity_sign(labels):
        # Start from all +1 and flip the odd entries to -1.
        signs = np.full(len(labels), 1, dtype=np.int32)
        signs[labels % 2 == 1] = -1
        return signs

    return _parity_sign(y_train), _parity_sign(y_test)

In [23]:
y_train, y_test = binarize_mnist_class(y_tr, y_te)

In [28]:
x_train, x_test = x_tr.copy(), x_te.copy()

In [26]:
np.unique(y_tr)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)

In [30]:
np.unique(y_train, return_counts=True)

(array([-1,  1], dtype=int32), array([30508, 29492]))

In [31]:
x = x_train.copy()
y = y_train.copy()

In [32]:
labels = np.unique(y)
labels

array([-1,  1], dtype=int32)

In [33]:
positive, negative = labels[1], labels[0]

In [34]:
positive, negative

(1, -1)

In [35]:
x, y = np.asarray(x, dtype=np.float32), np.asarray(y, dtype=np.int32)
assert(len(x) == len(y))

In [37]:
x.shape, y.shape

((60000, 784), (60000,))

In [39]:
# Random permutation of np.arange(len(y))
# NOTE(review): no seed is set in the visible cells, so the resulting order
# (and everything derived from it) presumably differs from run to run.
perm = np.random.permutation(len(y))

# Randomly shuffle x and y with the same permutation so rows stay aligned
x, y = x[perm], y[perm]

In [42]:
n_p = (y == positive).sum()
n_n = (y == negative).sum()

n_p, n_n

(29492, 30508)

In [43]:
# Sizes of the two PU subsets: how many positives keep their label, and how
# many samples are treated as unlabeled.
labeled = 100
unlabeled = 59900

# Short aliases used by the cells below: n_lp = #labelled positives,
# n_u = #unlabeled samples.
n_lp = labeled
n_u = unlabeled

In [44]:
# Decide how many positives end up inside the unlabeled set (n_up).
if labeled + unlabeled == len(x):
    # P and U partition the data: the labelled positives are removed from U.
    n_up = n_p - n_lp
elif unlabeled == len(x):
    # U is the whole dataset: every positive is also part of U.
    n_up = n_p
else:
    # Any other combination of sizes is rejected.
    raise ValueError("Only support |P|+|U|=|X| or |U|=|X|.")

In [46]:
n_up, n_p

(29392, 29492)

In [47]:
# Fraction of the unlabeled set that is actually positive.
_prior = float(n_up) / float(n_u)

In [48]:
_prior

0.4906844741235392

In [49]:
# The first n_lp positive rows (x was shuffled above) become the labelled positives.
xlp = x[y == positive][:n_lp]

In [50]:
# Positives for the unlabeled set: the positives not chosen as labelled come
# first, followed by the labelled ones, and the result is cut to n_up rows.
# When n_up == n_p - n_lp the cut drops the labelled rows again; when
# n_up == n_p they are kept, so they appear in both P and U.
xup = np.concatenate((x[y == positive][n_lp:], xlp), axis=0)[:n_up]

In [52]:
xun = x[y == negative]
xun.shape

(30508, 784)

In [53]:
# Stack the three groups in a fixed order: labelled positives, unlabeled
# positives, unlabeled negatives. The label vector built afterwards relies on
# the labelled positives occupying the first n_lp rows.
x = np.asarray(np.concatenate((xlp, xup, xun), axis=0), dtype=np.float32)
x.shape

(60000, 784)

In [55]:
# PU labels: +1 for the n_lp labelled positives, -1 for all n_u unlabeled
# samples (regardless of their true class).
y = np.asarray(np.concatenate((np.ones(n_lp), -np.ones(n_u))), dtype=np.int32)
# Shuffle x and y with one shared permutation so the groups are interleaved
# while rows and labels stay aligned.
perm = np.random.permutation(len(y))
x, y = x[perm], y[perm]

In [56]:
x.shape, y.shape

((60000, 784), (60000,))

In [59]:
np.unique(y, return_counts=True)

(array([-1,  1], dtype=int32), array([59900,   100]))

In [69]:
x.max()

1.0

In [None]:
def make_pu_dataset_from_binary_dataset(x, y, labeled=100, unlabeled=59900):
    """Turn a binary-labelled dataset into a positive-unlabeled (PU) dataset.

    The larger of the two label values in ``y`` is treated as the positive
    class. After a random shuffle, ``labeled`` positive samples keep a +1
    label and ``unlabeled`` samples receive a -1 ("unlabeled") label.

    Two configurations are supported:

    * ``labeled + unlabeled == len(x)``: the labelled positives are removed
      from the unlabeled set, so the output has ``len(x)`` rows.
    * ``unlabeled == len(x)``: the unlabeled set is the whole dataset and the
      labelled positives are added on top, so the output has
      ``labeled + len(x)`` rows.

    The defaults (100 / 59900) are the sizes this notebook uses for the
    60000-sample MNIST training set.

    Args:
        x: Feature array with one sample per row.
        y: Binary label array with exactly two distinct values.
        labeled: Number of positive samples that keep their label.
        unlabeled: Number of samples placed in the unlabeled set.

    Returns:
        tuple: ``(x_pu, y_pu, prior)`` where ``x_pu`` is float32, ``y_pu`` is
        int32 with values +1 (labelled positive) / -1 (unlabeled), and
        ``prior`` is the fraction of positives inside the unlabeled set.

    Raises:
        ValueError: If ``y`` does not contain exactly two labels, ``x`` and
            ``y`` differ in length, ``labeled`` exceeds the number of
            positives, or the sizes match neither supported configuration.
    """
    labels = np.unique(y)
    if len(labels) != 2:
        raise ValueError(
            "y must contain exactly two distinct labels, got %d." % len(labels))
    positive, negative = labels[1], labels[0]

    x, y = np.asarray(x, dtype=np.float32), np.asarray(y, dtype=np.int32)
    if len(x) != len(y):
        raise ValueError("x and y must have the same number of samples.")

    # Shuffle x and y with one shared permutation so rows stay aligned.
    perm = np.random.permutation(len(y))
    x, y = x[perm], y[perm]

    n_p = int((y == positive).sum())
    n_lp = labeled
    n_u = unlabeled

    # With more labelled positives requested than exist, the feature and
    # label arrays built below would end up with different lengths.
    if n_lp < 0 or n_lp > n_p:
        raise ValueError(
            "labeled must be between 0 and the number of positives (%d)." % n_p)

    if labeled + unlabeled == len(x):
        n_up = n_p - n_lp
    elif unlabeled == len(x):
        n_up = n_p
    else:
        raise ValueError("Only support |P|+|U|=|X| or |U|=|X|.")

    _prior = float(n_up) / float(n_u)

    x_pos = x[y == positive]
    xlp = x_pos[:n_lp]
    # Remaining positives first, then the labelled ones; cutting to n_up drops
    # the labelled rows in the |P|+|U|=|X| case and keeps them when |U|=|X|.
    xup = np.concatenate((x_pos[n_lp:], xlp), axis=0)[:n_up]
    xun = x[y == negative]

    # Fixed order (labelled P, unlabeled P, unlabeled N) so that the first
    # n_lp rows line up with the +1 labels created below.
    x = np.asarray(np.concatenate((xlp, xup, xun), axis=0), dtype=np.float32)
    y = np.asarray(np.concatenate((np.ones(n_lp), -np.ones(n_u))), dtype=np.int32)

    perm = np.random.permutation(len(y))
    x, y = x[perm], y[perm]
    return x, y, _prior


In [70]:
def make_pn_dataset_from_binary_dataset(x, y):
    """Build a shuffled positive/negative dataset with +1 / -1 labels.

    The larger of the two label values found in ``y`` is treated as the
    positive class. All positive rows are placed before all negative rows and
    the result is then shuffled with a single random permutation.

    Args:
        x: Feature array with one sample per row.
        y: Label array containing two distinct values.

    Returns:
        tuple: ``(X, Y)`` with ``X`` as float32 features and ``Y`` as int32
        labels (+1 for positive rows, -1 for negative rows).
    """
    classes = np.unique(y)
    pos_label, neg_label = classes[1], classes[0]

    features = np.asarray(x, dtype=np.float32)
    targets = np.asarray(y, dtype=np.int32)

    pos_rows = features[targets == pos_label]
    neg_rows = features[targets == neg_label]

    stacked = np.asarray(np.concatenate((pos_rows, neg_rows)), dtype=np.float32)
    signs = np.asarray(
        np.concatenate((np.ones(len(pos_rows)), -np.ones(len(neg_rows)))),
        dtype=np.int32,
    )

    # One permutation applied to both arrays keeps rows and labels aligned.
    order = np.random.permutation(len(signs))
    return stacked[order], signs[order]

In [71]:
x_test, y_test = make_pn_dataset_from_binary_dataset(x_test, y_test)

In [73]:
x_test.shape

(10000, 784)

In [77]:
np.unique(y_test, return_counts=True)

(array([-1,  1], dtype=int32), array([5074, 4926]))

In [62]:
import pandas as pd

In [60]:
def read_riboseq_data(path="../data/final_ckpt_with_labels.csv"):
    """Load the labelled ribo-seq table from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to the checkpoint file this
            notebook uses, so calling the function without arguments behaves
            as before.

    Returns:
        pandas.DataFrame: The parsed CSV contents.
    """
    return pd.read_csv(path)

In [63]:
df = read_riboseq_data()

In [65]:
# Recode the start-position flag from True/False to +1/-1 (presumably the
# column is boolean as loaded -- confirm against the CSV).
df['is_start_position'] = df['is_start_position'].replace({True: 1, False: -1})

In [67]:
df['is_start_position'].value_counts()

-1    113965
 1      1348
Name: is_start_position, dtype: int64

In [78]:
df

Unnamed: 0.1,Unnamed: 0,rl_15-15,rl_16-15,rl_17-15,rl_18-15,rl_19-15,rl_20-15,rl_21-15,rl_22-15,rl_23-15,...,rl_29+15,rl_30+15,rl_31+15,A+15,C+15,G+15,T+15,position,gene,is_start_position
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,48,,-1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,49,,-1
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,50,,-1
3,3,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,51,,-1
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,52,,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115308,115308,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,4640944,,-1
115309,115309,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,4640945,,-1
115310,115310,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,4640946,,-1
115311,115311,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,4640947,,-1


In [83]:
WINDOW_LEN = 15

def generate_colnames(df, window_len, cols=None):
    """Build the windowed feature column names ``<base><signed offset>``.

    For every offset from ``-window_len`` to ``+window_len`` (inclusive) and
    every base name, one column name is produced, e.g. ``'rl_15-15'`` or
    ``'A+0'``. Offsets vary slowest: all base names for the first offset come
    first, then all base names for the next offset, and so on.

    Args:
        df: DataFrame whose columns follow the ``<base><signed offset>``
            pattern. Only consulted when ``cols`` is None.
        window_len: Non-negative integer half-width of the window.
        cols: Optional iterable of base names. When omitted, the base names
            are inferred from ``df.columns`` by stripping a trailing
            ``+<digits>`` / ``-<digits>`` suffix; columns without such a
            suffix are ignored, and first-seen order is kept.

    Returns:
        list[str]: ``(2 * window_len + 1) * len(cols)`` column names.
    """
    if cols is None:
        # dict keys give de-duplication while preserving first-seen order.
        bases = {}
        for name in map(str, df.columns):
            stem = name.rstrip('0123456789')
            # A windowed column ends in digits that are preceded by a sign.
            if stem != name and stem.endswith(('+', '-')):
                bases.setdefault(stem[:-1], None)
        cols = list(bases)
    else:
        cols = list(cols)

    colnames = []
    for loc in range(-window_len, window_len + 1):
        # Non-negative offsets carry an explicit '+' (zero becomes '+0').
        loc_str = str(loc) if loc < 0 else ('+' + str(loc))
        colnames.extend(col + loc_str for col in cols)

    return colnames
        
colnames = generate_colnames(df, WINDOW_LEN)
print(len(colnames))

20305
