In [8]:
import numpy as np
import pandas
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.svm import SVR
import pickle
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Constants

In [2]:
# Directory prefix for the input matrices. An alternative location that was
# previously used: "/nfs/lab/varpred/Final_Results/final_matrices/"
file_path = "./"

# Input matrix files, one per dataset (prefix + file name).
t1d_mat = f"{file_path}T1D_final_short.bed"
t2d_mat = f"{file_path}T2D_final_short.bed"
topmed_mat = f"{file_path}topmed_final.bed"

# Probability thresholds. NOTE(review): not referenced by the cells visible
# here (the evaluation cell sets its own hthresh/lthresh) -- confirm usage.
proba_threshold_high = 0.8
proba_threshold_low = 0.000005

# Data Ingestion

In [6]:
# Raw data from the whitespace-delimited matrix file.
# A raw string is used so that "\s" is passed to the regex engine verbatim
# instead of being treated as an (invalid) Python string escape.
t1d_data_raw = pandas.read_csv(t1d_mat, sep=r'\s+')

# Splitting labels and data: "Probability" is kept as the label column,
# and both "VarID" and "Probability" are removed from the feature frame.
t1d_labels_raw = t1d_data_raw["Probability"]
t1d_data_raw = t1d_data_raw.drop(['VarID', 'Probability'], axis=1)

# Removing imbalance columns (every column whose name contains 'imbal').
# The result is derived from t1d_data_raw so the cell runs on a fresh kernel;
# previously it referenced t1d_data before that name had been defined.
t1d_data = t1d_data_raw.drop(columns=[col for col in t1d_data_raw.columns if 'imbal' in col])

In [61]:
# Categorical labelling
def label_prob(prob, prob_threshold_high, prob_threshold_low):
    """Map a probability onto a categorical label.

    Args:
        prob: Value to classify.
        prob_threshold_high: Values strictly above this are labelled 1.
        prob_threshold_low: Values strictly below this are labelled 0.

    Returns:
        1 if ``prob > prob_threshold_high``; otherwise 0 if
        ``prob < prob_threshold_low``; otherwise -1 (this includes values
        equal to either threshold). The high threshold is checked first.
    """
    if prob > prob_threshold_high:
        return 1
    return 0 if prob < prob_threshold_low else -1
    
# Split unknown and known data
def sort_data(data, labels):
    """Partition ``data`` into rows with a known label and rows labelled -1.

    Args:
        data: DataFrame of features. It is deep-copied, so the caller's
            frame is left untouched.
        labels: Series of labels; -1 marks an unknown row. It is attached to
            the copy as a column via ``DataFrame.insert``.

    Returns:
        A 6-tuple, in this order:
          0. known rows including the helper columns 'label' and 'known'
          1. unknown rows, features only
          2. known rows, features only
          3. labels of the known rows
          4. unknown rows, features only (same object as slot 1)
          5. labels of the unknown rows
    """
    is_known = labels != -1

    # Attach label and known-flag as trailing columns on a private copy.
    annotated = data.copy(deep=True)
    annotated.insert(annotated.shape[1], 'label', labels)
    annotated.insert(annotated.shape[1], 'known', is_known)

    helper_cols = ['label', 'known']
    all_known_data = annotated[annotated['known'].eq(True)]
    unknown_rows = annotated[annotated['known'].eq(False)]

    known_data = all_known_data.drop(helper_cols, axis=1)
    unknown_data = unknown_rows.drop(helper_cols, axis=1)

    # NOTE(review): slot 1 repeats the features-only unknown frame rather than
    # the unknown rows with helper columns -- confirm this is intended.
    return (
        all_known_data,
        unknown_data,
        known_data,
        all_known_data['label'],
        unknown_data,
        unknown_rows['label'],
    )


def create_splitter(known_data, unknown_data):
    """Build CV splits that train on known + unknown rows and test on known rows.

    A 5-fold ``ShuffleSplit`` (25% test, ``random_state=0``) is drawn over
    ``known_data`` only. The returned generator function converts each fold
    into index values: the training side is the fold's known training rows
    followed by every row of ``unknown_data``; the test side is the fold's
    known test rows.

    Args:
        known_data: DataFrame of rows with a known label.
        unknown_data: DataFrame of unlabelled rows, added to every training fold.

    Returns:
        ``(splits, custom_splitter)`` where ``splits`` is the one-shot
        generator from ``ShuffleSplit.split`` and ``custom_splitter(splits)``
        yields ``(train_index, test_index)`` arrays of index values.

    NOTE(review): the yielded arrays hold index *labels* of the input frames.
    scikit-learn treats ``cv`` index arrays as positions, so this presumably
    relies on the full dataset having a default 0..n-1 index -- confirm.
    """
    ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
    splits = ss.split(known_data)

    known_index = known_data.index
    unknown_index = unknown_data.index

    def custom_splitter(splits):
        for train_index, test_index in splits:
            # Only the index is needed, so combine the indexes directly.
            # Index.append is used because DataFrame.append was removed in
            # pandas 2.0.
            train_index = known_index[train_index].append(unknown_index).values
            test_index = known_index[test_index].values
            yield train_index, test_index

    return splits, custom_splitter


def perform_cv(model, data, labels):
    """Cross-validate ``model`` with ``cross_val_score`` and return the mean score.

    The folds come from ``create_splitter``, built from the known/unknown
    partition that ``sort_data`` produces for ``data`` and ``labels``.

    Args:
        model: Estimator passed to ``cross_val_score``.
        data: Full feature frame (known and unknown rows).
        labels: Labels aligned with ``data``.

    Returns:
        The mean of the per-fold scores.
    """
    # Only the features-only known/unknown frames (slots 2 and 4) are needed.
    _, _, known_data, _, unknown_data, _ = sort_data(data, labels)
    splits, custom_splitter = create_splitter(known_data, unknown_data)
    fold_scores = cross_val_score(
        model,
        data,
        y=labels,
        cv=custom_splitter(splits),
        verbose=1,
        n_jobs=1,
    )
    return np.mean(fold_scores)

def cv(model, data, labels):
    """Run a manual 5-fold shuffle-split evaluation and return the mean accuracy.

    Folds are drawn over the known rows only (25% test, ``random_state=0``).
    For each fold the model is fitted on the fold's known training rows plus
    all unknown rows (label -1), then scored on the fold's known test rows.
    Each fold's accuracy and the final average are printed.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``. It is
            re-fitted on every fold.
        data: Full feature frame (known and unknown rows).
        labels: Labels aligned with ``data``; -1 marks an unknown row.

    Returns:
        The average accuracy across the folds.
    """
    _, _, known_data, known_labels, unknown_data, unknown_labels = sort_data(data, labels)
    ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)

    scores = []
    for train_idx, test_idx in ss.split(known_data):
        # pandas.concat replaces DataFrame.append / Series.append, which were
        # removed in pandas 2.0. Row order matches the old append calls.
        train_data = pandas.concat([known_data.iloc[train_idx], unknown_data])
        train_labels = pandas.concat([known_labels.iloc[train_idx], unknown_labels])
        test_data = known_data.iloc[test_idx]
        test_labels = known_labels.iloc[test_idx]

        model.fit(train_data, train_labels)

        # Predict on test
        pred_labels = model.predict(test_data)

        # Measure accuracy, passing (y_true, y_pred) in the conventional order.
        score = accuracy_score(test_labels, pred_labels)
        print("Score", score)
        scores.append(score)

    avg_score = sum(scores) / len(scores)
    print("Avg:", avg_score)
    return avg_score

In [62]:
# Load the T1D matrix (whitespace-delimited). The raw string keeps "\s" from
# being read as an invalid Python string escape.
t1d_data_raw = pandas.read_csv(t1d_mat, sep=r'\s+')
t1d_var_data = t1d_data_raw['VarID']
t1d_prob = t1d_data_raw['Probability']

# The statements below referenced t1d_labels and t1d_data without defining
# them, so the cell failed on a fresh kernel. Both are now derived from the
# frame loaded above: labels are the raw probabilities, features are all
# columns except VarID, Probability and any column containing 'imbal'.
t1d_labels = t1d_prob
t1d_features_all = t1d_data_raw.drop(['VarID', 'Probability'], axis=1)
t1d_data = t1d_features_all.drop(columns=[col for col in t1d_features_all.columns if 'imbal' in col])

hthresh = 0.01
lthresh = 0.000007

# Label points based on prob thresholds
print("high: ", hthresh, "low", lthresh, end=':')
print("Positive", len(t1d_labels[t1d_labels > hthresh]), end='; ')
print("Negative", len(t1d_labels[t1d_labels < lthresh]), end='; ')
print("Unlabelled", len(t1d_labels[(t1d_labels <= hthresh) & (t1d_labels >= lthresh)]), end='; ')
print("")
t1d_labels_thresh = t1d_labels.apply(lambda row: label_prob(row, hthresh, lthresh))

# Data stats
num_pos = len(t1d_labels_thresh[t1d_labels_thresh == 1])
num_neg = len(t1d_labels_thresh[t1d_labels_thresh == 0])
num_unlabelled = len(t1d_labels_thresh[t1d_labels_thresh == -1])
print('% positive', num_pos / (num_neg + num_unlabelled + num_pos))
print('% negative', num_neg / (num_neg + num_unlabelled + num_pos))
print('positive to negative ratio', num_pos / num_neg)
print('labelled to unlabelled ratio', (num_pos + num_neg) / num_unlabelled)

model = LabelSpreading(kernel='rbf', alpha=0.2, gamma=20)
# NOTE(review): norm_model (MinMaxScaler + model) is built but the evaluation
# below uses the unscaled `model` -- confirm which one is intended.
norm_model = make_pipeline(MinMaxScaler(), model)

cv(model, t1d_data, t1d_labels_thresh)

high:  0.01 low 7e-06:Positive 589; Negative 2094; Unlabelled 21456; 
% positive 0.024400347984589253
% negative 0.08674758689258047
positive to negative ratio 0.281279847182426
labelled to unlabelled ratio 0.12504660700969425


  self.label_distributions_ /= normalizer


Score 0.7749627421758569


  self.label_distributions_ /= normalizer


Score 0.7719821162444114


  self.label_distributions_ /= normalizer


Score 0.7794336810730254


  self.label_distributions_ /= normalizer


Score 0.767511177347243


  self.label_distributions_ /= normalizer


Score 0.7943368107302533
Avg: 0.777645305514158
