In [16]:
import numpy as np
import pandas
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.model_selection import cross_val_score, ShuffleSplit
import pickle

# Constants

In [21]:
# Directory holding the T1D/T2D imbalance and peak matrices.
file_path = "/nfs/lab/varpred/Results/biobank_imbal_and_peaks"

# Type 1 diabetes matrices (binary and actual-valued variants).
t1d_mat_imbalance_binary = f"{file_path}/T1D_mat_imbalance_binary_val.txt"
t1d_mat_peaks_binary = f"{file_path}/T1D_mat_peaks_binary_val.txt"
t1d_mat_imbalance_actual = f"{file_path}/T1D_mat_imbalance_actual_val.txt"
t1d_mat_peaks_actual = f"{file_path}/T1D_mat_peaks_actual_val.txt"

# Type 2 diabetes matrices (same layout of file names as the T1D set).
t2d_mat_imbalance_binary = f"{file_path}/T2D_mat_imbalance_binary_val.txt"
t2d_mat_peaks_binary = f"{file_path}/T2D_mat_peaks_binary_val.txt"
t2d_mat_imbalance_actual = f"{file_path}/T2D_mat_imbalance_actual_val.txt"
t2d_mat_peaks_actual = f"{file_path}/T2D_mat_peaks_actual_val.txt"

# Probabilities above this value are labelled 1, those below (1 - value) are
# labelled 0, and everything in between is treated as unlabelled (-1).
proba_threshold = 0.7


# Data Ingestion

In [57]:
# Load the four space-delimited T1D matrices, in the same order as the path
# constants are listed in the constants cell.
(
    t1d_imbalance_binary,
    t1d_peaks_binary,
    t1d_imbalance_actual,
    t1d_peaks_actual,
) = (
    pandas.read_csv(matrix_path, sep=' ')
    for matrix_path in (
        t1d_mat_imbalance_binary,
        t1d_mat_peaks_binary,
        t1d_mat_imbalance_actual,
        t1d_mat_peaks_actual,
    )
)


In [58]:
# t1d_peaks_actual = t1d_peaks_actual.rename(columns={"endo": "endocrine"})
# t1d_peaks_binary = t1d_peaks_binary.rename(columns={"endo": "endocrine"})

# Every column after the first two is kept as a cell-type name
# (presumably the leading two are 'variant_id' and 'probability' -- TODO confirm).
cell_types = list(t1d_imbalance_actual.columns[2:])

# Raw matrices keyed by "<measurement>_<encoding>".
raw_data_frames = dict(
    imbalance_binary=t1d_imbalance_binary,
    peaks_binary=t1d_peaks_binary,
    imbalance_actual=t1d_imbalance_actual,
    peaks_actual=t1d_peaks_actual,
)

# Column-wise concatenation order of the four matrices.
concat_order = ('imbalance_actual', 'imbalance_binary', 'peaks_actual', 'peaks_binary')
data = pandas.concat([raw_data_frames[frame_key] for frame_key in concat_order], axis=1)

# 'probability' occurs once per concatenated frame, so selecting it yields a
# DataFrame; keep only the first occurrence as the label source.
labels = data['probability'].iloc[:, 0]

# Remove the identifier and probability columns so only features remain.
non_feature_columns = ['variant_id', 'probability']
data = data.drop(non_feature_columns, axis=1)

In [59]:
# Take the raw probabilities from the binary imbalance matrix; they are turned
# into categorical labels further down in this cell.
labels = t1d_imbalance_binary.loc[:, 'probability']

# Categorical labelling
def label_prob(prob, threshold=None):
    """Convert a probability into a categorical label.

    Parameters
    ----------
    prob : float
        Probability to discretise.
    threshold : float, optional
        Confidence cut-off. When omitted, the notebook-level
        ``proba_threshold`` is looked up at call time, which is the
        behaviour of the original single-argument function.

    Returns
    -------
    int
        ``1`` when ``prob > threshold``, ``0`` when ``prob < 1 - threshold``,
        and ``-1`` otherwise. ``-1`` is the value the later cells filter on
        (``labels != -1``) to separate labelled from unlabelled rows.
        NaN compares false on both sides and therefore also maps to ``-1``.
    """
    if threshold is None:
        threshold = proba_threshold
    if prob > threshold:
        return 1
    if prob < 1 - threshold:
        return 0
    return -1

# Discretise every probability into 1 / 0 / -1 via label_prob.
labels = labels.apply(label_prob)

# Label Propagation

In [64]:
# Label-propagation estimator created with the library's default
# hyperparameters (no arguments are passed).
label_prop_model = LabelPropagation()
# The fit call below is disabled, so the estimator is left unfitted by this cell.
# NOTE(review): presumably disabled because fitting on the full matrix is
# expensive -- confirm before relying on the pickled model.
# label_prop_model.fit(data, labels)

In [None]:
# Persist the label-propagation model, with the threshold encoded in the file name.
# A context manager guarantees the file is flushed and closed even if
# pickling fails; the original bare open() leaked the handle.
# NOTE(review): the fit call in the preceding cell is commented out, so as
# written this serialises an unfitted estimator -- confirm that is intended.
with open(f"./models/label_prop_model_thresh_{proba_threshold}", 'wb') as model_file:
    pickle.dump(label_prop_model, model_file)

# Cross Validation

In [None]:
# Filtering out unlabelled data.
# Work on a copy so that `data` stays a pure feature matrix. Inserting the
# 'label' / 'known' columns into `data` itself made this cell fail on a second
# run (DataFrame.insert raises ValueError for an existing column), broke the
# identical inserts in the "Experimenting" cell, and would leak the label
# column into any later fit on `data`.
known_data_bool = labels != -1
all_data = data.copy(deep=True)
all_data.insert(len(all_data.columns), 'label', labels)
all_data.insert(len(all_data.columns), 'known', known_data_bool)

# Keep only the rows that carry a definite 0/1 label.
all_known_data = all_data[all_data.known == True]
known_data = all_known_data.drop(['label', 'known'], axis=1)
known_labels = all_known_data['label']

In [None]:
# Cross-validate the label-propagation model on the labelled rows only,
# using the library's default fold strategy and four worker processes.
cv_score = cross_val_score(
    estimator=label_prop_model,
    X=known_data,
    y=known_labels,
    n_jobs=4,
    verbose=1,
)

# Label Spreading

In [None]:
# Label-spreading estimator whose alpha is tied to the labelling threshold:
# alpha = 1 - proba_threshold.
spreading_alpha = 1 - proba_threshold
label_spread_model = LabelSpreading(alpha=spreading_alpha)
# Fitting is currently disabled, so the estimator stays unfitted in this cell.
# label_spread_model.fit(data, labels)

In [None]:
# Persist the label-spreading model, with the threshold encoded in the file name.
# A context manager guarantees the file is flushed and closed even if
# pickling fails; the original bare open() leaked the handle.
# NOTE(review): the fit call in the preceding cell is commented out, so as
# written this serialises an unfitted estimator -- confirm that is intended.
with open(f"./models/label_spread_model_thresh_{proba_threshold}", 'wb') as model_file:
    pickle.dump(label_spread_model, model_file)

# Experimenting

In [67]:
# Rows with a definite 0/1 label are "known"; -1 marks unlabelled rows.
known_data_bool = labels != -1

# Annotate a deep copy of the feature matrix so `data` itself is untouched.
all_data = data.copy(deep=True)
for column_name, column_values in (('label', labels), ('known', known_data_bool)):
    all_data.insert(len(all_data.columns), column_name, column_values)

# Partition the annotated frame by the 'known' flag.
all_known_data = all_data[all_data['known'] == True]
all_unknown_data = all_data[all_data['known'] == False]

# Separate features from labels again for each partition.
bookkeeping_columns = ['label', 'known']
known_data, known_labels = (
    all_known_data.drop(bookkeeping_columns, axis=1),
    all_known_data['label'],
)
unknown_data, unknown_labels = (
    all_unknown_data.drop(bookkeeping_columns, axis=1),
    all_unknown_data['label'],
)

In [68]:
# Report how many rows fall into each label category.
negative_labels = known_labels[known_labels == 0]
positive_labels = known_labels[known_labels == 1]
print('Num unknown:', len(unknown_labels))
print('Num negative:', len(negative_labels))
print('Num positive:', len(positive_labels))

Num unknown: 61
Num negative: 89104
Num positive: 21


In [69]:
# Five seeded random 75/25 train/test partitions over the labelled rows.
ss = ShuffleSplit(
    random_state=0,
    test_size=0.25,
    n_splits=5,
)
# split() returns a lazy, single-use generator of (train, test) positions
# that are relative to `all_known_data`.
splits = ss.split(X=all_known_data)

def custom_splitter(splits, known_positions=None, unknown_positions=None):
    """Turn folds over the labelled subset into folds over the full dataset.

    ``splits`` yields ``(train, test)`` arrays of positions *within the
    labelled subset*. The cross-validation call, however, indexes the full
    feature matrix positionally. The original implementation forwarded the
    subset positions unchanged and appended full-dataset indices of the
    unlabelled rows, mixing two index spaces: test folds could contain
    unlabelled rows and train/test could overlap. Here every subset position
    is first mapped to its position in the full dataset.

    Parameters
    ----------
    splits : iterable of (array, array)
        Train/test positions relative to the labelled subset.
    known_positions : array-like of int, optional
        Full-dataset row position of each labelled row, in subset order.
        Defaults to the positions where the notebook-level
        ``known_data_bool`` mask is true.
    unknown_positions : array-like of int, optional
        Full-dataset row positions of the unlabelled rows; appended to every
        training fold. Defaults to the positions where ``known_data_bool``
        is false.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        Train and test positions valid for the full dataset.
    """
    if known_positions is None or unknown_positions is None:
        # Assumes the mask is row-aligned with the matrix handed to
        # cross-validation (the notebook builds both from the same rows).
        known_mask = np.asarray(known_data_bool, dtype=bool)
        if known_positions is None:
            known_positions = np.flatnonzero(known_mask)
        if unknown_positions is None:
            unknown_positions = np.flatnonzero(~known_mask)
    # Force integer dtype so an empty unlabelled set cannot turn the
    # concatenated index array into floats.
    known_positions = np.asarray(known_positions, dtype=int)
    unknown_positions = np.asarray(unknown_positions, dtype=int)

    print(type(splits))
    print(len(unknown_positions))
    for train_index, test_index in splits:
        print("%s %s" % (len(train_index), len(test_index)))
        # Translate subset positions to full-dataset positions, then add the
        # unlabelled rows to the training side only.
        train_index = np.append(known_positions[train_index], unknown_positions)
        test_index = known_positions[test_index]
        print("%s %s" % (len(train_index), len(test_index)))
        yield train_index, test_index



In [70]:
# Build and materialise the folds inside this cell. `splits` from the earlier
# cell is a single-use generator, so re-running this cell (e.g. after an
# interrupted run) would otherwise hand cross_val_score an exhausted iterator.
# ShuffleSplit is seeded, so regenerating yields the same partitions.
cv_folds = list(custom_splitter(ss.split(all_known_data)))
cv_score = cross_val_score(label_prop_model, data, y=labels, cv=cv_folds, verbose=1, n_jobs=4)

<class 'generator'>
61
66843 22282
66904 22282
66843 22282
66904 22282
66843 22282
66904 22282
66843 22282
66904 22282
66843 22282
66904 22282
Process ForkPoolWorker-5:


KeyboardInterrupt: 