In [3]:
import numpy as np
import scipy as sp
import pandas as pd
import os 
from itertools import product

import warnings

#from modshogun import *

from sklearn import linear_model, decomposition
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, GroupKFold, LeaveOneGroupOut
from sklearn.externals.joblib import Parallel, delayed
from sklearn.preprocessing import RobustScaler, LabelEncoder, StandardScaler, Imputer, MinMaxScaler
from sklearn.pipeline import Pipeline

from CustomCVs import KFoldMixedSizes, StratifiedKFoldMixedSizes, StratifiedKFoldByGroups
#from evaluation_classifier import Evaluater

from time import time
from IPython.display import clear_output

#from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, IterativeSVD #, MICE

from six.moves import cPickle as pickle

import matplotlib.pyplot as plt

In [None]:
def create_rank_k_dataset(
        n_rows=5,
        n_cols=5,
        k=3,
        fraction_missing=0.1,
        symmetric=False,
        random_seed=0):
    np.random.seed(random_seed)
    x = np.random.randn(n_rows, k)
    y = np.random.randn(k, n_cols)

    XY = np.dot(x, y)

    if symmetric:
        assert n_rows == n_cols
        XY = 0.5 * XY + 0.5 * XY.T

    missing_raw_values = np.random.uniform(0, 1, (n_rows, n_cols))
    missing_mask = missing_raw_values < fraction_missing

    XY_incomplete = XY.copy()
    # fill missing entries with NaN
    XY_incomplete[missing_mask] = np.nan

    return XY, XY_incomplete, missing_mask

In [None]:
# create some default data to be shared across tests
XY, XY_incomplete, missing_mask = create_rank_k_dataset(
    n_rows=500,
    n_cols=10,
    k=3,
    fraction_missing=0.25)

In [4]:
def create_correlated_dataset(cov_mat, n_obs = 5000):
    
    n_vars = cov_mat.shape[0]
    
    L = np.linalg.cholesky(cov_mat)
    D = np.dot(L, np.random.uniform(0,1, (n_vars, n_obs)))
    
    return D
    

In [5]:
# test
cov_mat = np.array([[1, 0.7, 0.7, 0.5,],
             [0.7, 1, 0.95, 0.3],
             [0.7, 0.95, 1, 0.3],
             [0.5, 0.3, 0.3, 1]])

D = create_correlated_dataset(cov_mat)
print(D.shape)
np.corrcoef(D)

(4, 5000)


array([[1.        , 0.69646432, 0.69910663, 0.50894069],
       [0.69646432, 1.        , 0.94887712, 0.30165075],
       [0.69910663, 0.94887712, 1.        , 0.30366525],
       [0.50894069, 0.30165075, 0.30366525, 1.        ]])

In [7]:
# Construct a cov matrix using a REAL dataset

data_dir="/data/rmthomas/HeteroSmallSample"
ENIGMA_OCD_df = pd.read_csv(os.path.join(data_dir, "ENIGMA_OCD.csv"))
cov_mat_overall = ENIGMA_OCD_df.corr().values

alpha = 0.2
reg_cov_mat = cov_mat_overall + alpha*np.eye(175) # alpha makes the matrix well conditioned for Cholesky
D_overall = create_correlated_dataset(reg_cov_mat)
print(D.shape)

(4, 5000)


In [10]:
ENIGMA_OCD_df.columns[:10]

Index(['Unnamed: 0', 'age_group', 'tesla', 'site', 'AO', 'Age', 'AgeSQ',
       'Agr_Check', 'Anx', 'Clean'],
      dtype='object')

In [11]:
covariates_columns = [ENIGMA_OCD_df.columns.values[3], ENIGMA_OCD_df.columns.values[6]]
ENIGMA_OCD_df[~ENIGMA_OCD_df.loc[:, covariates_columns].T.isnull().any()].shape

(4243, 180)

In [20]:
ENIGMA_OCD_df.loc[:, covariates_columns].T.isnull()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4360,4361,4362,4363,4364,4365,4366,4367,4368,4369
site,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
AgeSQ,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
ENIGMA_OCD_df.loc[:, covariates_columns].T

In [111]:
x=ENIGMA_OCD_df.groupby(['site', 'age_group'])