In [1]:
import numpy as np
import pandas
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.svm import SVR
import pickle
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler

## Constants

In [2]:
# file_path = "/nfs/lab/varpred/Final_Results/final_matrices/"
file_path = "./"
# t1d_mat = file_path + "T1D_final_short.bed"
t1d_mat = file_path + "t1d_final_short_after_loci_filtering.bed"
# t2d_mat = file_path + "T2D_final_short.bed"
t2d_mat = file_path + "t2d_final_short_after_loci_filtering.bed"
topmed_mat = file_path + "topmed_final.bed"
proba_threshold_high = 0.8
proba_threshold_low = 0.000005

# Data Ingestion

In [3]:
# Raw data from csv
t1d_data_raw = pandas.read_csv(t1d_mat, sep='\s+')

# Splitting labels and data
t1d_labels = t1d_data_raw["Probability"]
t1d_data_raw = t1d_data_raw.drop(['VarID', 'Probability'], axis=1)

# Removing imbalance columns
t1d_data_raw = t1d_data_raw.drop(columns=[col for col in t1d_data_raw.columns if 'imbal' in col])

In [4]:
# Categorical labelling
def label_prob(prob, prob_threshold_high, prob_threshold_low):
    if prob > prob_threshold_high:
        return 1
    elif prob < prob_threshold_low:
        return 0
    else:
        return -1
    
# Split unknown and known data
def sort_data(data, labels):
    known_data_bool = labels != -1
    all_data = data.copy(deep=True)
    all_data.insert(len(all_data.columns), 'label', labels)
    all_data.insert(len(all_data.columns), 'known', known_data_bool)
    all_known_data = all_data[all_data.known == True]
    all_unknown_data = all_data[all_data.known == False]
    known_data = all_known_data.drop(['label', 'known'], axis=1)
    known_labels = all_known_data['label']
    unknown_data = all_unknown_data.drop(['label', 'known'], axis=1)
    unknown_labels = all_unknown_data['label']
    
    return all_known_data, unknown_data, known_data, known_labels, unknown_data, unknown_labels

def cv(model, data, labels):
    all_known_data, unknown_data, known_data, known_labels, unknown_data, unknown_labels = sort_data(data, labels)
    ss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
    splits = ss.split(known_data)
    
    total_acc_score = 0
    total_prec_score = 0
    total_rec_score = 0
    num = 0
    for train_idx, test_idx in splits:
        train_data = known_data.iloc[train_idx].append(unknown_data)
        test_data = known_data.iloc[test_idx]
        train_labels = known_labels.iloc[train_idx].append(unknown_labels)
        test_labels = known_labels.iloc[test_idx]

        model.fit(train_data, train_labels)

        # Predict on test
        pred_labels = model.predict(test_data)

        # Measure accuracy
        acc = accuracy_score(pred_labels, test_labels)
        prec = precision_score(pred_labels, test_labels)
        rec = recall_score(pred_labels, test_labels)
        total_acc_score += acc
        total_prec_score += prec
        total_rec_score += rec
        num += 1

    print("Avg accuracy, precision, recall:", total_acc_score / num, total_prec_score / num, total_rec_score / num)
    return total_acc_score / num, total_prec_score / num, total_rec_score / num

In [5]:
t1d_data_raw = pandas.read_csv(t1d_mat, sep='\s+')
t1d_var_data = t1d_data_raw['VarID']
t1d_loc_data = t1d_data_raw['Locus']
t1d_prob = t1d_data_raw['Probability']
t1d_data = t1d_data_raw.drop(columns=['VarID', 'Locus', 'Probability'])

hthresh = 0.001
lthresh = 0.000005

# Label points based on prob thresholds
print("high: ", hthresh, "low", lthresh, end=':')
print("Positive", len(t1d_prob[t1d_prob > hthresh]), end='; ')
print("Negative", len(t1d_prob[t1d_prob < lthresh]), end='; ')
print("Unlabelled", len(t1d_prob[(t1d_prob <= hthresh) & (t1d_prob >= lthresh)]), end='; ')
print("")
t1d_labels = t1d_prob.apply(lambda row: label_prob(row, hthresh, lthresh))

# Data stats
num_pos = len(t1d_labels[t1d_labels == 1])
num_neg = len(t1d_labels[t1d_labels == 0])
num_unlabelled = len(t1d_labels[t1d_labels == -1])
print('% positive', num_pos / (num_neg + num_unlabelled + num_pos))
print('% negative', num_neg / (num_neg + num_unlabelled + num_pos))
print('positive to negative ratio', num_pos / num_neg)
print('labelled to unlabelled ratio', (num_pos + num_neg) / num_unlabelled)

# model = LabelSpreading(kernel='rbf', alpha=0.2, gamma=100)
# model = make_pipeline(MinMaxScaler(), model)

# cv(model, t1d_data, t1d_labels)

high:  0.001 low 5e-06:Positive 1930; Negative 246; Unlabelled 10204; 
% positive 0.15589660743134087
% negative 0.01987075928917609
positive to negative ratio 7.845528455284553
labelled to unlabelled ratio 0.21324970599764798


In [6]:
pca = PCA(n_components=20)
model = LabelSpreading(kernel='rbf', alpha=0.2, gamma=20)
norm_model = make_pipeline(MinMaxScaler(), pca, model)

all_known_data, unknown_data, known_data, known_labels, unknown_data, unknown_labels = sort_data(t1d_data, t1d_labels)
ss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
splits = ss.split(known_data, known_labels)

total_acc_score = 0
total_prec_score = 0
total_rec_score = 0
num = 0
train_idx, test_idx = next(splits)

train_data = known_data.iloc[train_idx].append(unknown_data)
test_data = known_data.iloc[test_idx]
train_labels = known_labels.iloc[train_idx].append(unknown_labels)
test_labels = known_labels.iloc[test_idx]

rus = RandomUnderSampler(random_state=42)
test_data, test_labels = rus.fit_resample(test_data, test_labels)

model.fit(train_data, train_labels)

# Predict on test
pred_labels = model.predict(test_data)


  probabilities /= normalizer


In [21]:
print(accuracy_score(pred_labels, test_labels))
print(precision_score(pred_labels, test_labels))
print(recall_score(pred_labels, test_labels))
print(confusion_matrix(pred_labels, test_labels))
tn, fp, fn, tp = confusion_matrix(pred_labels, test_labels).ravel()
print("True negatives:", tn)
print("False positives:", fp)
print("False negatives:", fn)
print("True positives:", tp)

0.6209677419354839
0.5
0.6595744680851063
[[46 31]
 [16 31]]
True negatives: 46
False positives: 31
False negatives: 16
True positives: 31


In [20]:
transduction = model.transduction_
print('Original labelling')
print('Num neg', len(t1d_labels[t1d_labels == 0]))
print('Num pos', len(t1d_labels[t1d_labels == 1]))
print('Num unlabelled', len(t1d_labels[t1d_labels == -1]))
print('total', len(t1d_labels))

print("Transduction labelling")
print('Num neg', len(transduction[transduction == 0]))
print('Num pos', len(transduction[transduction == 1]))
print('Num unlabelled', len(transduction[transduction == -1]))
print('total', len(transduction))

print('known labelling')
print('Num neg', len(known_labels[known_labels == 0]))
print('Num pos', len(known_labels[known_labels == 1]))
print('total', len(known_labels))

print("test labels pos", len(test_labels[test_labels == 1]))
print("test labels neg", len(test_labels[test_labels == 0]))

print("pred labels pos", len(pred_labels[pred_labels == 1]))
print("pred labels neg", len(pred_labels[pred_labels == 0]))

Original labelling
Num neg 246
Num pos 1930
Num unlabelled 10204
total 12380
Transduction labelling
Num neg 7158
Num pos 4678
Num unlabelled 0
total 11836
known labelling
Num neg 246
Num pos 1930
total 2176
test labels pos 62
test labels neg 62
pred labels pos 47
pred labels neg 77


In [13]:
known_data

Unnamed: 0,Acinar_peaks_bin,Alpha_peaks_bin,Beta_peaks_bin,Delta_peaks_bin,Ductal_peaks_bin,Endo_peaks_bin,Gamma_peaks_bin,Immune_peaks_bin,Stellate_peaks_bin,Acinar_peaks,...,TFA,THA,TP5,TP6,USF,YY1.like,ZEB,ZFX.ZFY,ZNF,ZNF76.like
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12375,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12376,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12377,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12378,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
df = pandas.DataFrame({"pred" : pred_labels, "test": test_labels})

In [9]:
df

Unnamed: 0,pred,test
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0
...,...,...
119,0,1
120,0,1
121,1,1
122,1,1


In [16]:
print("True positives", len(df[df.pred == 1][df.test == 1]))
print("False positives", len(df[df.pred == 1][df.test == 0]))
print("False negatives", len(df[df.pred == 0][df.test == 1]))
print("True negatives", len(df[df.pred == 0][df.test == 0]))

True positives 31
False positives 16
False negatives 31
True negatives 46


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [28]:
y_pred = [1, 0, 0, 0, 0, 1, 0, 1, 0, 0]
y_true = [0, 0, 1, 0, 1, 1, 0, 0, 1, 0]
confusion_matrix(y_pred, y_true)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print("True negatives:", tn)
print("False positives:", fp)
print("False negatives:", fn)
print("True positives:", tp)

True negatives: 4
False positives: 2
False negatives: 3
True positives: 1
