In [1]:
import numpy as np
import pandas
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.svm import SVR
import pickle
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

## Constants

In [2]:
# file_path = "/nfs/lab/varpred/Final_Results/final_matrices/"
file_path = "./"
# t1d_mat = file_path + "T1D_final_short.bed"
t1d_mat = file_path + "t1d_final_short_after_loci_filtering.bed"
# t2d_mat = file_path + "T2D_final_short.bed"
t2d_mat = file_path + "t2d_final_short_after_loci_filtering.bed"
topmed_mat = file_path + "topmed_final.bed"
proba_threshold_high = 0.8
proba_threshold_low = 0.000005

# Data Ingestion

In [3]:
# Raw data from csv
t1d_data_raw = pandas.read_csv(t1d_mat, sep='\s+')

# Splitting labels and data
t1d_labels = t1d_data_raw["Probability"]
t1d_data_raw = t1d_data_raw.drop(['VarID', 'Probability'], axis=1)

# Removing imbalance columns
t1d_data_raw = t1d_data_raw.drop(columns=[col for col in t1d_data_raw.columns if 'imbal' in col])

In [4]:
# Categorical labelling
def label_prob(prob, prob_threshold_high, prob_threshold_low):
    if prob > prob_threshold_high:
        return 1
    elif prob < prob_threshold_low:
        return 0
    else:
        return -1
    
# Split unknown and known data
def sort_data(data, labels):
    known_data_bool = labels != -1
    all_data = data.copy(deep=True)
    all_data.insert(len(all_data.columns), 'label', labels)
    all_data.insert(len(all_data.columns), 'known', known_data_bool)
    all_known_data = all_data[all_data.known == True]
    all_unknown_data = all_data[all_data.known == False]
    known_data = all_known_data.drop(['label', 'known'], axis=1)
    known_labels = all_known_data['label']
    unknown_data = all_unknown_data.drop(['label', 'known'], axis=1)
    unknown_labels = all_unknown_data['label']
    
    return all_known_data, unknown_data, known_data, known_labels, unknown_data, unknown_labels


def create_splitter(known_data, unknown_data):
    ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
    splits = ss.split(known_data)

    def custom_splitter(splits):
        for train_index, test_index in splits:
            train_index = (known_data.iloc[train_index].append(unknown_data)).index.values
#             train_index = np.append(train_index, list(unknown_data.index.values))
            test_index = (known_data.iloc[test_index]).index.values
            yield train_index, test_index
            
    return splits, custom_splitter


def perform_cv(model, data, labels):
    all_known_data, unknown_data, known_data, known_labels, unknown_data, unknown_labels = sort_data(data, labels)
    splits, custom_splitter = create_splitter(known_data, unknown_data)
    cv_score = cross_val_score(model, data, y=labels, cv=custom_splitter(splits), verbose=1, n_jobs=1)
    return np.mean(cv_score)

def cv(model, data, labels):
    all_known_data, unknown_data, known_data, known_labels, unknown_data, unknown_labels = sort_data(data, labels)
    ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
    splits = ss.split(known_data)
    
    total_acc_score = 0
    total_prec_score = 0
    total_rec_score = 0
    num = 0
    for train_idx, test_idx in splits:
        train_data = known_data.iloc[train_idx].append(unknown_data)
        test_data = known_data.iloc[test_idx]
        train_labels = known_labels.iloc[train_idx].append(unknown_labels)
        test_labels = known_labels.iloc[test_idx]

        model.fit(train_data, train_labels)

        # Predict on test
        pred_labels = model.predict(test_data)

        # Measure accuracy
        acc = accuracy_score(pred_labels, test_labels)
        prec = precision_score(pred_labels, test_labels)
        rec = recall_score(pred_labels, test_labels)
        total_acc_score += acc
        total_prec_score += prec
        total_rec_score += rec
        num += 1

    print("Avg accuracy, precision, recall:", total_acc_score / num, total_prec_score / num, total_rec_score / num)
    return total_acc_score / num, total_prec_score / num, total_rec_score / num

In [32]:
t1d_data_raw = pandas.read_csv(t1d_mat, sep='\s+')
t1d_var_data = t1d_data_raw['VarID']
t1d_loc_data = t1d_data_raw['Locus']
t1d_prob = t1d_data_raw['Probability']
t1d_data = t1d_data_raw.drop(columns=['VarID', 'Locus', 'Probability'])

hthresh = 0.0001
lthresh = 0.000005

# Label points based on prob thresholds
print("high: ", hthresh, "low", lthresh, end=':')
print("Positive", len(t1d_prob[t1d_prob > hthresh]), end='; ')
print("Negative", len(t1d_prob[t1d_prob < lthresh]), end='; ')
print("Unlabelled", len(t1d_prob[(t1d_prob <= hthresh) & (t1d_prob >= lthresh)]), end='; ')
print("")
t1d_labels = t1d_prob.apply(lambda row: label_prob(row, hthresh, lthresh))

# Data stats
num_pos = len(t1d_labels[t1d_labels == 1])
num_neg = len(t1d_labels[t1d_labels == 0])
num_unlabelled = len(t1d_labels[t1d_labels == -1])
print('% positive', num_pos / (num_neg + num_unlabelled + num_pos))
print('% negative', num_neg / (num_neg + num_unlabelled + num_pos))
print('positive to negative ratio', num_pos / num_neg)
print('labelled to unlabelled ratio', (num_pos + num_neg) / num_unlabelled)

model = LabelSpreading(kernel='rbf', alpha=0.2, gamma=100)
# model = make_pipeline(MinMaxScaler(), model)

# cv(model, t1d_data, t1d_labels)

high:  0.0001 low 5e-06:Positive 3542; Negative 246; Unlabelled 8592; 
% positive 0.28610662358642974
% negative 0.01987075928917609
positive to negative ratio 14.398373983739837
labelled to unlabelled ratio 0.4408752327746741


In [37]:
all_known_data, unknown_data, known_data, known_labels, unknown_data, unknown_labels = sort_data(t1d_data, t1d_labels)
ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
splits = ss.split(known_data)

total_acc_score = 0
total_prec_score = 0
total_rec_score = 0
num = 0
train_idx, test_idx = next(splits)

train_data = known_data.iloc[train_idx].append(unknown_data)
test_data = known_data.iloc[test_idx]
train_labels = known_labels.iloc[train_idx].append(unknown_labels)
test_labels = known_labels.iloc[test_idx]

model.fit(train_data, train_labels)

# Predict on test
pred_labels = model.predict(test_data)


  self.label_distributions_ /= normalizer


In [34]:
print(accuracy_score(pred_labels, test_labels))
print(precision_score(pred_labels, test_labels))
print(recall_score(pred_labels, test_labels))
print(confusion_matrix(pred_labels, test_labels))

0.05174234424498416
0.0
0.0
[[ 49 898]
 [  0   0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
transduction = model.transduction_
print('Original labelling')
print('Num neg', len(t1d_labels[t1d_labels == 0]))
print('Num pos', len(t1d_labels[t1d_labels == 1]))
print('Num unlabelled', len(t1d_labels[t1d_labels == -1]))
print('total', len(t1d_labels))

print("Transduction labelling")
print('Num neg', len(transduction[transduction == 0]))
print('Num pos', len(transduction[transduction == 1]))
print('Num unlabelled', len(transduction[transduction == -1]))
print('total', len(transduction))

print("test labels pos", len(test_labels[test_labels == 1]))
print("test labels neg", len(test_labels[test_labels == 0]))

print("pred labels pos", len(pred_labels[pred_labels == 1]))
print("pred labels neg", len(pred_labels[pred_labels == 0]))

Original labelling
Num neg 246
Num pos 3542
Num unlabelled 8592
total 12380
Transduction labelling
Num neg 6083
Num pos 5350
Num unlabelled 0
total 11433
test labels pos 898
test labels neg 49
pred labels pos 0
pred labels neg 947


In [36]:
model.label_distributions_

array([[0.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 0.00000000e+00],
       [1.76960054e-10, 1.00000000e+00],
       ...,
       [           nan,            nan],
       [2.00000000e-01, 8.00000000e-01],
       [           nan,            nan]])

In [11]:
pred_labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,