In [20]:
import numpy as np
import scipy as sp
import pandas as pd
from pandas import Series
import os 
from itertools import product

import warnings

#from modshogun import *

from sklearn import linear_model, decomposition
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, GroupKFold, LeaveOneGroupOut
from sklearn.externals.joblib import Parallel, delayed
from sklearn.preprocessing import RobustScaler, LabelEncoder, StandardScaler, Imputer, MinMaxScaler
from sklearn.pipeline import Pipeline

from CustomCVs import KFoldMixedSizes, StratifiedKFoldMixedSizes, StratifiedKFoldByGroups
#from evaluation_classifier import Evaluater

from time import time
from IPython.display import clear_output

#from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, IterativeSVD #, MICE

from six.moves import cPickle as pickle

import matplotlib.pyplot as plt

In [2]:
# def create_rank_k_dataset(
#         n_rows=5,
#         n_cols=5,
#         k=3,
#         fraction_missing=0.1,
#         symmetric=False,
#         random_seed=0):
#     np.random.seed(random_seed)
#     x = np.random.randn(n_rows, k)
#     y = np.random.randn(k, n_cols)

#     XY = np.dot(x, y)

#     if symmetric:
#         assert n_rows == n_cols
#         XY = 0.5 * XY + 0.5 * XY.T

#     missing_raw_values = np.random.uniform(0, 1, (n_rows, n_cols))
#     missing_mask = missing_raw_values < fraction_missing

#     XY_incomplete = XY.copy()
#     # fill missing entries with NaN
#     XY_incomplete[missing_mask] = np.nan

#     return XY, XY_incomplete, missing_mask

# # create some default data to be shared across tests
# XY, XY_incomplete, missing_mask = create_rank_k_dataset(
#     n_rows=500,
#     n_cols=10,
#     k=3,
#     fraction_missing=0.25)

In [3]:
def create_correlated_dataset(cov_mat, n_obs = 2500, regularization = 1.0, random_state = None):
    """Generate correlated samples by mixing uniform noise with a Cholesky factor.

    The input matrix is regularized as ``cov_mat + regularization * I``,
    factorized as ``L @ L.T`` and the result is ``L @ U`` where ``U`` holds
    i.i.d. draws from Uniform(0, 1).

    Notes
    -----
    * The noise is uniform on [0, 1), not standard normal, so the rows of the
      result have non-zero means and the covariance of the result is
      ``(1/12) * (cov_mat + regularization * I)``. Correlations are unaffected
      by that scale factor.
    * The diagonal loading shrinks the off-diagonal correlations: with a
      unit-diagonal input and the default ``regularization=1.0`` they are
      roughly halved.

    Parameters
    ----------
    cov_mat : array-like, shape (n_vars, n_vars)
        Target covariance / correlation matrix.
    n_obs : int, default 2500
        Number of observations (columns) to draw.
    regularization : float, default 1.0
        Multiple of the identity added to ``cov_mat`` before factorizing.
        The default reproduces the previously hard-coded value.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. When ``None`` the
        global NumPy random state is used, exactly as before.

    Returns
    -------
    numpy.ndarray of shape (n_vars, n_obs), or None
        ``None`` is returned (after printing a message) when the Cholesky
        factorization raises ``LinAlgError``.
    """
    cov_mat = np.asarray(cov_mat, dtype=float)
    n_vars = cov_mat.shape[0]
    cov_mat = cov_mat + regularization * np.eye(n_vars) # regularize for stability

    try:
        L = np.linalg.cholesky(cov_mat)
    except np.linalg.LinAlgError:
        # Best-effort contract kept from the original: report and let the caller skip.
        print('Error ---- Cholesksy')
        return None

    # Fall back to the module-level generator so unseeded calls behave as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    return np.dot(L, rng.uniform(0, 1, (n_vars, n_obs)))
        
    

In [4]:
# Sanity check: feed a small hand-written 4x4 correlation matrix through
# create_correlated_dataset and inspect the empirical correlations of the result.
cov_mat = np.array([
    [1.0, 0.7, 0.7, 0.5],
    [0.7, 1.0, 0.95, 0.3],
    [0.7, 0.95, 1.0, 0.3],
    [0.5, 0.3, 0.3, 1.0],
])

D = create_correlated_dataset(cov_mat)
print(D.shape)

# Rows of D are variables, so corrcoef returns a 4x4 matrix.
np.corrcoef(D)

(4, 2500)


array([[1.        , 0.37224904, 0.36230115, 0.20353479],
       [0.37224904, 1.        , 0.48604921, 0.12585383],
       [0.36230115, 0.48604921, 1.        , 0.16007137],
       [0.20353479, 0.12585383, 0.16007137, 1.        ]])

In [5]:
# Construct a cov matrix using a REAL dataset

# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
data_dir="/data/rmthomas/HeteroSmallSample"
df = pd.read_csv(os.path.join(data_dir, "real_data.csv"))
# Columns at positions 25..124 are presumably the numeric features - TODO confirm.
df_numeric = df[df.columns[25:125]]
# Despite the name, this holds pairwise correlations (DataFrame.corr), not covariances.
cov_mat_overall = df_numeric.corr().values

alpha = 0.2
# alpha makes the matrix well conditioned for Cholesky. The identity is sized from
# the correlation matrix itself rather than a literal 100, so the cell keeps working
# if the column slice changes or corr() returns fewer columns than were sliced.
reg_cov_mat = cov_mat_overall + alpha*np.eye(cov_mat_overall.shape[0])
D_overall = create_correlated_dataset(reg_cov_mat)
print(D_overall.shape)

(100, 2500)


In [6]:
# Number of feature columns taken from df, starting at column position 25.
n_features = 100

# Group the feature columns by (site, Dx, age_group).
g = df.groupby(by=['site', 'Dx', 'age_group'])[df.columns[25:n_features + 25]]

# One key tuple per group, in the iteration order of the `indices` mapping.
Groups = list(g.indices)

In [7]:
# One correlation matrix per set = (site, Dx, age_group).
# g.corr() stacks the per-group matrices row-wise; reshaping turns that stack
# into an array of shape (n_sets, n_features, n_features).
corrs_per_set = np.reshape(g.corr().values, (-1, n_features, n_features))
#corrs_per_set[np.where(np.isnan(corrs_per_set))] = 0.5

In [8]:
# One label per simulated feature: f0, f1, ... f{n_features-1}.
# Driven by n_features (instead of a literal 100) so the labels always match the
# width of the per-set correlation matrices.
feature_labels = [f'f{i}' for i in range(n_features)]
# Column layout of the simulated data: three group-label columns, then the features.
data_cols = ['site', 'Dx', 'Age_group'] + feature_labels

In [9]:
# Empty frame carrying the full simulated-data column layout (group labels + features).
sim_data_all = pd.DataFrame(columns=data_cols) # initialize a dataframe
# Empty frame with only the three group-label columns; it is not referenced again
# in the visible cells.
group_template = pd.DataFrame(columns=['site', 'Dx', 'Age_group']) # initialize a dataframe

In [72]:
# Simulate one synthetic cohort per (site, Dx, age_group) set.
#
# Per-set frames are collected in a list and concatenated once at the end.
# The previous version grew sim_data_all with DataFrame.append inside the loop,
# which is quadratic, is removed in pandas 2.0, and stacked duplicate rows
# every time this cell was re-run. Rebuilding sim_data_all from scratch makes
# the cell idempotent.
sim_data = []    # raw (n_features, n_obs) matrices, one per successful set
sim_frames = []  # labelled per-set DataFrames, same order as sim_data

min_subj = 5
max_subj = 70  # exclusive: np.arange(min_subj, max_subj) stops at max_subj - 1
for corr_i in range(corrs_per_set.shape[0]):
    n_obs = np.random.choice(np.arange(min_subj, max_subj))
    D = create_correlated_dataset(corrs_per_set[corr_i], n_obs=n_obs)

    if D is None:
        # Cholesky failed for this set (message already printed); skip it.
        continue

    # Repeat the set's labels once per simulated subject.
    # Assumes Groups[corr_i] is the key of the set behind corrs_per_set[corr_i],
    # i.e. g.indices and g.corr() enumerate the sets in the same order - TODO confirm.
    sim_data_group = pd.DataFrame([list(Groups[corr_i])], columns=['site', 'Dx', 'Age_group'])
    sim_data_group = pd.concat([sim_data_group]*n_obs, ignore_index=True)

    # D is (features, subjects); transpose so each row is one subject.
    sim_data_group_matrix = pd.DataFrame(D.T, columns=feature_labels)
    df_group = pd.concat([sim_data_group, sim_data_group_matrix], axis=1, ignore_index=False)

    sim_frames.append(df_group)
    sim_data.append(D)

if sim_frames:
    sim_data_all = pd.concat(sim_frames, ignore_index=True)
else:
    # Every set failed: keep an empty frame with the expected columns.
    sim_data_all = pd.DataFrame(columns=data_cols)

Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy


In [76]:
# List the distinct site labels present in the simulated data.
sim_data_all['site'].unique()

array(['Arnold', 'Benedetti', 'Beucke', 'Brennan', 'Buitelaar', 'Cheng',
       'Fitzgerald', 'Gruner', 'Heuvel', 'Hirano', 'Hoexter', 'Huyser',
       'Koch', 'Kwon', 'KwonNMC', 'KwonSNU', 'Lazaro', 'Marsh',
       'Mataix_Cols', 'Menchon', 'Morgado', 'Nakamae', 'Nakao', 'Nurmi',
       'Reddy', 'Simpson', 'Soreni', 'Spalletta', 'Stein', 'Stewart',
       'Tolin', 'Walitza', 'Wang'], dtype=object)

In [62]:
#sim_data_group_matrix
# NOTE(review): scratch cell. `n_obs` and `sim_data_group_matrix` are not defined
# here; the only visible assignments are inside the simulation loop, so this cell
# uses whatever values the loop left behind. The labels come from Groups[0] while
# the feature matrix is the loop's leftover one, so the two may belong to
# different sets - confirm before relying on `dd`.
sim_data_group = pd.DataFrame([list(Groups[0])], columns=['site', 'Dx', 'Age_group'])
sim_data_group = pd.concat([sim_data_group]*n_obs, ignore_index=True)
sim_data_group  # bare expression in the middle of a cell: evaluated but not displayed
# Side-by-side join on the row index: label columns first, then feature columns.
dd = pd.concat([sim_data_group, sim_data_group_matrix], axis=1, ignore_index=False)

In [63]:
# Display the frame assembled in the previous cell (group labels followed by features).
dd

Unnamed: 0,site,Dx,Age_group,f0,f1,f2,f3,f4,f5,f6,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,Arnold,0,2_pediatric,0.962754,1.168055,1.042,1.625226,1.200295,0.983538,0.996493,...,-0.057974,0.322854,-0.140337,-0.201307,0.850433,0.719654,1.218751,0.599901,0.960641,1.083709
1,Arnold,0,2_pediatric,0.104035,0.497611,1.204809,0.274449,1.263152,0.714191,1.078611,...,0.819422,0.943072,1.487125,0.443699,1.444968,0.407332,1.235342,0.620423,1.466793,0.942976
2,Arnold,0,2_pediatric,0.418678,1.263444,0.44528,1.566263,1.521554,1.797008,0.268024,...,0.105432,0.152917,1.224929,0.152372,0.935337,0.153113,0.465805,0.548593,0.671647,1.413478
3,Arnold,0,2_pediatric,0.206298,0.850981,0.788609,1.280285,1.440172,1.09896,-0.009737,...,0.986256,0.029717,0.737938,0.966094,1.156463,1.168977,0.417448,0.737924,1.545493,0.747156
4,Arnold,0,2_pediatric,0.944302,0.226151,0.351589,0.710804,0.480637,0.69857,0.528538,...,0.22114,1.251491,0.624633,-0.053671,0.713409,-0.303856,1.22189,0.87852,1.40276,0.443313
5,Arnold,0,2_pediatric,1.185779,0.602944,0.542786,1.102667,1.243428,1.519081,-0.057982,...,1.289156,0.752756,1.405726,0.516663,0.515741,1.225218,0.715387,1.013767,1.598636,0.379195
6,Arnold,0,2_pediatric,0.871914,1.168844,0.438497,1.022589,0.839407,0.955539,0.474187,...,0.254032,1.0109,1.500728,1.052047,1.689595,0.486345,0.920775,1.562962,1.355947,1.559838
7,Arnold,0,2_pediatric,0.286235,1.3471,1.230444,0.386898,1.384157,1.612789,0.071755,...,-0.217803,0.272326,0.993267,0.047184,0.608304,-0.001256,1.261843,0.229796,1.257732,1.02697
8,Arnold,0,2_pediatric,0.934228,1.421984,1.326797,1.275845,2.000391,1.305234,0.74307,...,1.055843,1.275247,1.273124,1.41076,1.597448,1.411381,1.643633,1.416657,1.019333,1.497514
9,Arnold,0,2_pediatric,0.83371,0.174007,1.294597,1.39154,1.344636,1.738491,0.198023,...,-0.128061,1.065121,0.739626,0.18888,0.114051,0.634649,1.248146,0.86178,1.106435,0.532173


In [None]:
# Join the per-set (n_features, n_obs) matrices along the observation axis.
all_data = np.concatenate(sim_data, axis=1)
# Rows are variables, columns are simulated subjects.
nvars, nsubjs = np.shape(all_data)

In [None]:
# Create data for each group
D_patients = create_correlated_dataset(cov_patients)
D_controls = create_correlated_dataset(cov_controls)

In [None]:
import seaborn as sns
import matplotlib.gridspec as gs
import matplotlib.pyplot as plt
import itertools

# Pick a near-square m x n grid with exactly `a` cells.
a = 31
a += a % 2  # bump an odd count up to the next even number

# Start from floor(sqrt(a)) and walk down to the nearest exact divisor of a.
n = np.int64(np.floor(np.sqrt(a)))
while a % n:
    n -= 1

m = np.int64(a / n)

# Every (i, j) cell of the grid: i in range(m) varies slowest, j in range(n) fastest.
coords = list(itertools.product(range(m), range(n)))




In [None]:
from fancyimpute import MICE

In [None]:
# https://www.kaggle.com/athi94/investigating-imputation-methods