In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import os 
from itertools import product

import warnings

#from modshogun import *

from sklearn import linear_model, decomposition
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, GroupKFold, LeaveOneGroupOut
from sklearn.externals.joblib import Parallel, delayed
from sklearn.preprocessing import RobustScaler, LabelEncoder, StandardScaler, Imputer, MinMaxScaler
from sklearn.pipeline import Pipeline

from CustomCVs import KFoldMixedSizes, StratifiedKFoldMixedSizes, StratifiedKFoldByGroups
#from evaluation_classifier import Evaluater

from time import time
from IPython.display import clear_output

#from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, IterativeSVD #, MICE

from six.moves import cPickle as pickle

import matplotlib.pyplot as plt

In [3]:
def create_rank_k_dataset(
        n_rows=5,
        n_cols=5,
        k=3,
        fraction_missing=0.1,
        symmetric=False,
        random_seed=0):
    """Build a random factorised matrix plus a copy with entries blanked out.

    The complete matrix is the product of two standard-normal factors of
    shapes ``(n_rows, k)`` and ``(k, n_cols)``.  When ``symmetric`` is true
    the product is averaged with its own transpose, which requires a square
    matrix.

    Parameters
    ----------
    n_rows, n_cols : int
        Shape of the generated matrix.
    k : int
        Inner dimension shared by the two random factors.
    fraction_missing : float
        Threshold applied to one uniform(0, 1) draw per cell; cells whose
        draw is below it are marked missing.
    symmetric : bool
        If true, return ``0.5 * XY + 0.5 * XY.T`` instead of ``XY``.
    random_seed : int
        Passed to ``np.random.seed`` before any numbers are drawn.

    Returns
    -------
    XY : ndarray
        The complete matrix.
    XY_incomplete : ndarray
        Same values as ``XY`` with NaN wherever ``missing_mask`` is true.
    missing_mask : ndarray of bool
        True for the cells that were blanked out.

    Raises
    ------
    AssertionError
        If ``symmetric`` is true and ``n_rows != n_cols``.
    """
    # Note: this reseeds NumPy's *global* generator, so it also affects any
    # random draws made by code that runs after this function.
    np.random.seed(random_seed)
    left_factor = np.random.randn(n_rows, k)
    right_factor = np.random.randn(k, n_cols)
    XY = left_factor.dot(right_factor)

    if symmetric:
        assert n_rows == n_cols
        XY = 0.5 * XY + 0.5 * XY.T

    # One uniform draw per cell decides whether that cell is missing.
    missing_mask = np.random.uniform(0, 1, (n_rows, n_cols)) < fraction_missing

    # Copy of the full matrix with NaN in every masked position.
    XY_incomplete = np.where(missing_mask, np.nan, XY)

    return XY, XY_incomplete, missing_mask

In [4]:
# Default dataset reused by the cells below: a 500 x 10 matrix built from
# factors with inner dimension 3, each entry blanked with probability 0.25.
XY, XY_incomplete, missing_mask = create_rank_k_dataset(n_rows=500, n_cols=10, k=3, fraction_missing=0.25)

In [87]:
def create_correlated_dataset(cov_mat, n_obs=2500, reg=1.0, random_state=None):
    """Simulate correlated variables by mixing uniform draws with a Cholesky factor.

    ``reg * I`` is added to ``cov_mat``, the result is Cholesky-factorised as
    ``L @ L.T``, and ``L`` is applied to an ``(n_vars, n_obs)`` matrix of
    independent uniform(0, 1) draws.

    Things to be aware of:

    * The draws are uniform(0, 1), not zero-mean/unit-variance, so the rows
      of the result have non-zero means and their covariance is a scaled
      version of the regularised matrix.  Correlations are unaffected by
      that scaling.
    * The identity term inflates the diagonal only.  With the default
      ``reg=1.0`` and a correlation matrix as input, the diagonal becomes 2,
      so the correlations implied by the regularised matrix are half of the
      requested ones.  Pass ``reg=0.0`` to use ``cov_mat`` unchanged.

    Parameters
    ----------
    cov_mat : array_like, shape (n_vars, n_vars)
        Target covariance / correlation matrix.
    n_obs : int
        Number of observations (columns) to generate.
    reg : float
        Multiple of the identity added to ``cov_mat`` before factorisation.
        Defaults to 1.0, the value that was previously hard-coded.
    random_state : int or None
        Seed for a private ``np.random.RandomState``.  ``None`` (default)
        draws from NumPy's global generator, as before.

    Returns
    -------
    ndarray of shape (n_vars, n_obs), or None
        ``None`` is returned (after printing a message) when the regularised
        matrix cannot be Cholesky-factorised.
    """
    cov_mat = np.asarray(cov_mat, dtype=float)
    n_vars = cov_mat.shape[0]
    cov_mat = cov_mat + reg * np.eye(n_vars)  # regularize for stability

    try:
        L = np.linalg.cholesky(cov_mat)
    except np.linalg.LinAlgError:
        # Best-effort contract: callers test for None and skip this matrix.
        print('Error ---- Cholesksy')
        return None

    # Random numbers are only consumed once the factorisation has succeeded.
    if random_state is None:
        draw_uniform = np.random.uniform
    else:
        draw_uniform = np.random.RandomState(random_state).uniform

    return np.dot(L, draw_uniform(0, 1, (n_vars, n_obs)))
        
    

In [31]:
# Sanity check: simulate from a small hand-written matrix and inspect the
# empirical correlations of the simulated rows.
cov_mat = np.array([
    [1, 0.7, 0.7, 0.5],
    [0.7, 1, 0.95, 0.3],
    [0.7, 0.95, 1, 0.3],
    [0.5, 0.3, 0.3, 1],
])

D = create_correlated_dataset(cov_mat)
print(D.shape)
np.corrcoef(D)

(4, 2500)


array([[1.        , 0.62904718, 0.64037505, 0.44460715],
       [0.62904718, 1.        , 0.86393994, 0.25334644],
       [0.64037505, 0.86393994, 1.        , 0.24615124],
       [0.44460715, 0.25334644, 0.24615124, 1.        ]])

In [7]:
# Build a matrix from a REAL dataset and simulate data from it.
# Note that `.corr()` is used below, so this is a correlation matrix rather
# than a covariance matrix, despite the variable names.

# NOTE(review): absolute, machine-specific path; the cell only runs where
# this directory exists.
data_dir="/data/rmthomas/HeteroSmallSample"
df = pd.read_csv(os.path.join(data_dir, "real_data.csv"))
# Columns at positions 25..124 (100 columns) are used as the feature block.
# NOTE(review): presumably these are the numeric features -- confirm against the CSV.
df_numeric = df[df.columns[25:125]]
cov_mat_overall = df_numeric.corr().values

alpha = 0.2
# alpha * I is added to make the matrix well conditioned for Cholesky.
# The literal 100 must equal the number of columns selected above.
# NOTE(review): create_correlated_dataset adds its own identity term to the
# matrix as well, so the diagonal is inflated twice here.
reg_cov_mat = cov_mat_overall + alpha*np.eye(100) # alpha makes the matrix well conditioned for Cholesky
D_overall = create_correlated_dataset(reg_cov_mat)
print(D_overall.shape)

(100, 2500)


In [23]:
# Number of feature columns taken from the real data, starting at column 25.
n_features = 100
# Group the feature block by (site, Dx, age_group).
g = df.groupby(['site', 'Dx', 'age_group'])[
    df.columns[25:25 + n_features]
]

In [162]:
# Group keys of the groupby object, in the order of its `indices` mapping.
Groups = list(g.indices)

In [141]:
# Correlation matrices per set = (site, Dx, age_group): the stacked output of
# the grouped `.corr()` is reshaped into one n_features x n_features block
# per set along the first axis.
corrs_per_set = np.reshape(g.corr().values, (-1, n_features, n_features))
# NaN entries are left untouched; the fill-with-0.5 step below is disabled.
#corrs_per_set[np.where(np.isnan(corrs_per_set))] = 0.5

In [157]:
# Column names for the simulated features: f0, f1, ..., f99 (zero-based).
feature_labels = ['f' + str(idx) for idx in range(100)]
# Full column layout: the three group labels followed by the features.
data_cols = ['site', 'Dx', 'Age_group'] + feature_labels

In [177]:
# Empty frames to be filled later: one holding only the group labels,
# one with the full label + feature column layout.
sim_data_group = pd.DataFrame(columns=['site', 'Dx', 'Age_group'])
sim_data_all = pd.DataFrame(columns=data_cols)

In [193]:
# Scratch check: stack ten copies of the label frame on top of each other.
x = pd.concat([sim_data_group for _copy in range(10)])

In [198]:
# Simulate 5 observations from a single per-set correlation matrix.
# NOTE(review): `corr_i` is not assigned anywhere above this cell; the only
# visible assignment is the loop variable of the simulation loop further
# down, so on a fresh kernel this cell fails with NameError when run in order.
D = create_correlated_dataset(corrs_per_set[corr_i], n_obs=5)

In [203]:
# Preview the first rows of the most recently simulated feature frame.
# NOTE(review): `sim_data_group_matrix` is only assigned inside the
# simulation loop further down, so this cell relies on that loop having
# been executed first.
sim_data_group_matrix.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.579753,0.354048,1.172559,0.47739,1.418456,1.378038,0.648582,1.289229,1.249805,1.23604,...,0.485921,0.307701,1.179291,0.794946,1.057373,-0.282628,1.493134,0.718699,0.926573,0.361378
1,0.247798,0.721363,0.913661,0.902886,0.920255,1.250294,0.20579,0.898721,1.422403,0.635937,...,0.646693,0.870319,0.550951,0.783512,0.649794,0.588926,1.582328,0.955295,1.455194,0.917329
2,0.323708,0.850976,0.579934,1.43846,1.077942,1.401805,0.367354,1.618112,0.882814,1.141259,...,-0.519534,1.187627,0.528415,0.690527,1.093244,0.451904,0.666801,0.429912,1.412119,0.456779
3,0.97751,0.222065,1.479338,0.611162,1.243309,0.805549,0.266392,1.566194,1.937155,1.282991,...,0.413174,1.34935,1.122237,1.106723,1.504242,1.100625,1.603293,0.777187,1.828995,1.542503
4,0.860499,1.423681,0.829665,0.442679,1.149462,1.917575,0.355069,1.857625,1.331467,1.601007,...,-0.439265,1.300415,0.114586,0.49278,0.557306,0.03059,1.310867,1.044541,1.545986,0.070934


In [142]:
from pandas import Series

# Simulate one block of subjects per (site, Dx, age_group) set.
#
# Changes relative to the earlier version of this cell:
#   * `DataFrame.append` (removed in pandas 2.0) is no longer used.
#   * The label frame is no longer re-multiplied by n_obs on every
#     iteration (which made it grow multiplicatively); labels are collected
#     per set and concatenated once after the loop.
#   * `sim_data` is initialised here, so the cell can be re-run and does not
#     depend on a list created elsewhere.
min_subj = 5
max_subj = 70

label_cols = ['site', 'Dx', 'Age_group']
sim_data = []       # one (n_features, n_obs) array per successfully simulated set
label_frames = []   # per-subject group labels, one frame per simulated set

for corr_i in range(corrs_per_set.shape[0]):
    # np.arange excludes its stop value, so n_obs is in [min_subj, max_subj - 1].
    # Cast to a plain int so that list repetition below is ordinary Python
    # repetition rather than NumPy arithmetic.
    n_obs = int(np.random.choice(np.arange(min_subj, max_subj)))
    D = create_correlated_dataset(corrs_per_set[corr_i], n_obs=n_obs)

    # None means the Cholesky factorisation failed for this set; skip it.
    if D is None:
        continue

    # D holds one column per subject; transpose to one row per subject.
    sim_data_group_matrix = pd.DataFrame(D.T, columns=feature_labels)

    # Repeat this set's (site, Dx, age_group) key once per simulated subject.
    label_frames.append(
        pd.DataFrame([Groups[corr_i]] * n_obs, columns=label_cols))

    sim_data.append(D)

# Labels for every simulated subject, in the same order as the columns
# obtained by horizontally stacking `sim_data`.
if label_frames:
    sim_data_group = pd.concat(label_frames, ignore_index=True)
else:
    sim_data_group = pd.DataFrame(columns=label_cols)

Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy
Error ---- Cholesksy


In [129]:
# Stack the per-set simulated arrays horizontally and read off the
# dimensions of the combined array.
all_data = np.hstack(sim_data)
(nvars, nsubjs) = all_data.shape

In [96]:
# Create data for each group
D_patients = create_correlated_dataset(cov_patients)
D_controls = create_correlated_dataset(cov_controls)

In [132]:
import seaborn as sns
import matplotlib.gridspec as gs
import matplotlib.pyplot as plt
import itertools

# Work out an m x n grid with exactly `a` cells, where `a` is first rounded
# up to the next even number.
a = 31
a += a % 2

# Start at floor(sqrt(a)) and step down to the nearest divisor of a.
n = np.int64(np.floor(np.sqrt(a)))
while a % n:
    n = n - 1

m = np.int64(a / n)

# All (row, column) index pairs of the grid, with the column varying fastest.
coords = [(row, col) for row in range(m) for col in range(n)]




In [109]:
from fancyimpute import MICE

In [1]:
# https://www.kaggle.com/athi94/investigating-imputation-methods