In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import os 
from itertools import product

import warnings

#from modshogun import *

from sklearn import linear_model, decomposition
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, GroupKFold, LeaveOneGroupOut
from sklearn.externals.joblib import Parallel, delayed
from sklearn.preprocessing import RobustScaler, LabelEncoder, StandardScaler, Imputer, MinMaxScaler
from sklearn.pipeline import Pipeline

from CustomCVs import KFoldMixedSizes, StratifiedKFoldMixedSizes, StratifiedKFoldByGroups
#from evaluation_classifier import Evaluater

from time import time
from IPython.display import clear_output

#from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, IterativeSVD #, MICE

from six.moves import cPickle as pickle

import matplotlib.pyplot as plt

In [4]:
def create_rank_k_dataset(
        n_rows=5,
        n_cols=5,
        k=3,
        fraction_missing=0.1,
        symmetric=False,
        random_seed=0):
    np.random.seed(random_seed)
    x = np.random.randn(n_rows, k)
    y = np.random.randn(k, n_cols)

    XY = np.dot(x, y)

    if symmetric:
        assert n_rows == n_cols
        XY = 0.5 * XY + 0.5 * XY.T

    missing_raw_values = np.random.uniform(0, 1, (n_rows, n_cols))
    missing_mask = missing_raw_values < fraction_missing

    XY_incomplete = XY.copy()
    # fill missing entries with NaN
    XY_incomplete[missing_mask] = np.nan

    return XY, XY_incomplete, missing_mask

In [5]:
# create some default data to be shared across tests
XY, XY_incomplete, missing_mask = create_rank_k_dataset(
    n_rows=500,
    n_cols=10,
    k=3,
    fraction_missing=0.25)

In [86]:
def create_correlated_dataset(cov_mat, n_obs = 2500):
    
    n_vars = cov_mat.shape[0]
    
    L = np.linalg.cholesky(cov_mat)
    D = np.dot(L, np.random.uniform(0,1, (n_vars, n_obs)))
    
    return D
    

In [87]:
# test
cov_mat = np.array([[1, 0.7, 0.7, 0.5,],
             [0.7, 1, 0.95, 0.3],
             [0.7, 0.95, 1, 0.3],
             [0.5, 0.3, 0.3, 1]])

D = create_correlated_dataset(cov_mat)
print(D.shape)
np.corrcoef(D)

(4, 2500)


array([[1.        , 0.70117498, 0.70252767, 0.50443589],
       [0.70117498, 1.        , 0.94965823, 0.29018632],
       [0.70252767, 0.94965823, 1.        , 0.28468454],
       [0.50443589, 0.29018632, 0.28468454, 1.        ]])

In [91]:
# Construct a cov matrix using a REAL dataset

data_dir="/data/rmthomas/HeteroSmallSample"
df = pd.read_csv(os.path.join(data_dir, "real_data.csv"))
df_numeric = df[df.columns[25:125]]
cov_mat_overall = df_numeric.corr().values

alpha = 0.2
reg_cov_mat = cov_mat_overall + alpha*np.eye(100) # alpha makes the matrix well conditioned for Cholesky
D_overall = create_correlated_dataset(reg_cov_mat)
print(D_overall.shape)

(100, 2500)


In [92]:
g = df.groupby(['site', 'Dx', 'age_group'])[df.columns[25:125]]

In [93]:
g = df.groupby(['Dx'])[df.columns[25:125]]

In [94]:
g.corr().shape

(200, 100)

In [95]:
# Create a separate correlation matrix for controls and patients
cov_patients = g.corr().values[:100, :]
cov_controls = g.corr().values[100:, :]

In [96]:
# Create data for each group
D_patients = create_correlated_dataset(cov_patients)
D_controls = create_correlated_dataset(cov_controls)

In [101]:
# Merge and randomize to create the final features X subjects matrix
D = np.hstack((D_patients, D_controls))
D = D[:, np.random.permutation(D.shape[1])]

In [109]:
from fancyimpute import MICE

In [110]:
https://www.kaggle.com/athi94/investigating-imputation-methods

SyntaxError: invalid syntax (<ipython-input-110-fd69fde1077d>, line 1)