In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline
import matplotlib.pyplot as plt
import random

In [2]:
#define paths

data_dir = '/analysis/shikhar/abcd_npc/img_dt/' #'/analysis/shikhar/abcd_npc/img_dt/' #or /data/Ritter/abcd_np/
trainy_filename = 'train/train_y.csv'
valy_filename = 'val/val_y.csv'
testy_filename ='test/test_y.csv'
trainx_filename = 'train/volumetrix_data_train.csv'
valx_filename = 'val/volumetrix_data_val.csv'
testx_filename = 'test/volumetrix_data_test.csv'

In [3]:
#loading datasets

trainy= pd.read_csv(data_dir+trainy_filename, index_col=False)
valy= pd.read_csv(data_dir+valy_filename, index_col=False)
testy= pd.read_csv(data_dir+testy_filename, index_col=False)
trainx =  pd.read_csv(data_dir+trainx_filename, index_col=False)
valx =  pd.read_csv(data_dir+valx_filename, index_col=False)
testx =  pd.read_csv(data_dir+testx_filename, index_col=False)

In [4]:
#append all

all_data = trainx.append(valx).append(testx).copy()
all_data['sex_bin'] = np.where(all_data['GENDER']=='F', 1, 0)
all_data = all_data.drop(['BTSV01_ID','DATASET_ID','GENDER','SRC_SUBJECT_ID','INTERVIEW_DATE'],\
                         axis=1).copy() ## removing irrelevant columns
trainyValy_moredt = pd.merge(all_data[['SUBJECTKEY','INTERVIEW_AGE','sex_bin','STUDY_COHORT_NAME']], \
                             trainy.append(valy), left_on='SUBJECTKEY', right_on='subject')
print(all_data.STUDY_COHORT_NAME.value_counts())
print(trainy.shape)
print(valy.shape)

ABCD NP Challenge 2019 Training Set      3736
ABCD NP Challenge 2019 Test Set          3640
ABCD NP Challenge 2019 Validation Set     415
Name: STUDY_COHORT_NAME, dtype: int64
(3739, 2)
(415, 2)


#### ^^^ Training set should have been 3739, test should have been 3648 <<< In order to use above we would need imputation
##### checked in R> all IDs in volumetrix data have scan data (i.e no new random ID)


----
### Adding missing subjects (no volumetrix data) and imputing their volumetrix values

In [5]:
# appending missing Subjects

missing_dict={'test':list(set(testy.subject).difference(set(testx.SUBJECTKEY))),\
         'train':list(set(trainy.subject).difference(set(trainx.SUBJECTKEY)))}

for key, values in missing_dict.items():
    if key=='test':
        studyname = 'ABCD NP Challenge 2019 Test Set'
        for key in values:
            all_data = all_data.append([{'SUBJECTKEY':key, 'STUDY_COHORT_NAME':studyname}], ignore_index=True)
    else:
        studyname = 'ABCD NP Challenge 2019 Training Set'
        for key in values:
            all_data = all_data.append([{'SUBJECTKEY':key, 'STUDY_COHORT_NAME':studyname}], ignore_index=True)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [6]:
print(all_data['sex_bin'].value_counts())
print(all_data['sex_bin'].isna().value_counts())

0.0    4085
1.0    3706
Name: sex_bin, dtype: int64
False    7791
True       11
Name: sex_bin, dtype: int64


In [7]:
#randomly assigning sex to appended subjects (only 11 new subjects)
np.random.seed(seed=234)
all_data['sex_bin'] = all_data['sex_bin'].fillna( \
                pd.Series(np.random.choice(all_data[all_data['sex_bin'].notna()]['sex_bin'], size=len(all_data.index)))\
                                                ) 
print(all_data['sex_bin'].value_counts())
print(all_data['sex_bin'].isna().value_counts())


0.0    4092
1.0    3710
Name: sex_bin, dtype: int64
False    7802
Name: sex_bin, dtype: int64


In [8]:
#imputing mean for rest of the columns (only 11 new subjects)
all_data = all_data.fillna(all_data.mean()).copy()
print(all_data.STUDY_COHORT_NAME.value_counts())

ABCD NP Challenge 2019 Training Set      3739
ABCD NP Challenge 2019 Test Set          3648
ABCD NP Challenge 2019 Validation Set     415
Name: STUDY_COHORT_NAME, dtype: int64


---
### Normalizing and checking for correlation

In [9]:
idv_scaler = MinMaxScaler(feature_range=(0, 1))
idv_scaler.fit(all_data[all_data.columns.difference(['SUBJECTKEY','STUDY_COHORT_NAME','sex_bin'])])
all_data[all_data.columns.difference(['SUBJECTKEY','STUDY_COHORT_NAME','sex_bin'])] = \
    idv_scaler.transform(all_data[all_data.columns.difference(['SUBJECTKEY','STUDY_COHORT_NAME','sex_bin'])])
all_data

Unnamed: 0,SUBJECTKEY,INTERVIEW_AGE,SRI24PRECENTRALLGM,SRI24PRECENTRALRGM,SRI24FRONTALSUPLGM,SRI24FRONTALSUPRGM,SRI24FRONTALSUPORBLGM,SRI24FRONTALSUPORBRGM,SRI24FRONTALMIDLGM,SRI24FRONTALMIDRGM,...,SRI24CORPUSCALLOSUMWM,SRI24WM400WM400LWM,SRI24WM400WM400RWM,SRI24VTLSLATERALVTLLCSF,SRI24VTLSLATERALVTLRCSF,SRI24VTLSTHIRDVTLLCSF,SRI24VTLSTHIRDVTLRCSF,SRI24SUPTENTSUPRATENTORIUMV,STUDY_COHORT_NAME,sex_bin
0,NDAR_INV47FFMT85,0.730769,0.427935,0.312790,0.329006,0.494018,0.553185,0.248272,0.580806,0.563136,...,0.442683,0.573540,0.551201,0.113070,0.078897,0.219352,0.268849,0.279479,ABCD NP Challenge 2019 Training Set,1.0
1,NDAR_INVGP0KJLU3,0.653846,0.375488,0.484359,0.339774,0.512916,0.513188,0.462092,0.823286,0.606311,...,0.394940,0.375327,0.286871,0.072311,0.042373,0.146441,0.109464,0.230826,ABCD NP Challenge 2019 Training Set,0.0
2,NDAR_INVPMHGZU3M,0.500000,0.448786,0.504869,0.443692,0.531500,0.527939,0.367530,0.412101,0.364393,...,0.644641,0.692629,0.728605,0.210519,0.047408,0.296774,0.326918,0.138476,ABCD NP Challenge 2019 Training Set,1.0
3,NDAR_INVRPYKZRFL,0.384615,0.330454,0.468127,0.468582,0.427228,0.515623,0.310516,0.410344,0.416241,...,0.666578,0.566847,0.543932,0.025747,0.005786,0.096684,0.154030,0.363902,ABCD NP Challenge 2019 Training Set,1.0
4,NDAR_INVB8UX2D6Y,0.500000,0.318695,0.586295,0.659693,0.741631,0.514902,0.433210,0.700705,0.415170,...,0.314876,0.429555,0.456135,0.107885,0.038739,0.215837,0.247668,0.253220,ABCD NP Challenge 2019 Training Set,1.0
5,NDAR_INVXHD5E4MN,0.076923,0.238062,0.302160,0.379664,0.391236,0.479125,0.496169,0.549717,0.672929,...,0.618360,0.484964,0.473223,0.060954,0.030514,0.293549,0.253250,0.251486,ABCD NP Challenge 2019 Training Set,0.0
6,NDAR_INVLGBWLRM4,0.307692,0.152099,0.259396,0.349954,0.345720,0.348731,0.408119,0.347722,0.507995,...,0.534445,0.678909,0.670324,0.086915,0.103254,0.267514,0.265772,0.212792,ABCD NP Challenge 2019 Training Set,1.0
7,NDAR_INVUCGJ1U8F,0.500000,0.295462,0.496603,0.580116,0.514957,0.499118,0.391873,0.346881,0.196277,...,0.428261,0.736634,0.740361,0.069557,0.035983,0.456799,0.351558,0.255846,ABCD NP Challenge 2019 Training Set,1.0
8,NDAR_INVAVR6TURT,0.269231,0.296849,0.446853,0.485876,0.566110,0.625725,0.484008,0.499147,0.485112,...,0.332979,0.422191,0.421817,0.091726,0.053141,0.248261,0.210657,0.240091,ABCD NP Challenge 2019 Training Set,0.0
9,NDAR_INV930295U4,0.384615,0.484346,0.455493,0.494523,0.538455,0.586550,0.440536,0.439575,0.601026,...,0.547942,0.554798,0.520046,0.042634,0.032454,0.199797,0.150750,0.343242,ABCD NP Challenge 2019 Training Set,0.0


In [10]:
# Create correlation matrix
corr_matrix = all_data.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

In [11]:

# Find index of feature columns with correlation greater than 0.95
to_drop_dueTOhighCor = [column for column in upper.columns if any(upper[column] > 0.95)]
to_drop_dueTOhighCor

['SRI24THALAMUSRGM', 'SRI24CBLMHEMIWHTRWM', 'SRI24WM400WM400RWM']

In [12]:
all_data = all_data[all_data.columns.difference(to_drop_dueTOhighCor)].copy()

# checked for 0 variance columns, there are none.

___

### Splitting in train (80%), validation_internal (5%), validation_forEnsemble (15%) with strata on sex, IQ and age

In [13]:
# extracting columns on which I need to create strata 


trainyValy_moredt['age_bin']=pd.qcut(trainyValy_moredt['INTERVIEW_AGE'], 5, labels=list(range(0,5)))
trainyValy_moredt['Score_bin']=pd.qcut(trainyValy_moredt['residual_fluid_intelligence_score'], 10,\
                                       labels=list(range(0,10)))
trainyValy_moredt.STUDY_COHORT_NAME.value_counts()

ABCD NP Challenge 2019 Training Set      3736
ABCD NP Challenge 2019 Validation Set     415
Name: STUDY_COHORT_NAME, dtype: int64

In [14]:
val = trainyValy_moredt[trainyValy_moredt['STUDY_COHORT_NAME']=='ABCD NP Challenge 2019 Validation Set']
trainy_moredt = trainyValy_moredt[trainyValy_moredt['STUDY_COHORT_NAME']=='ABCD NP Challenge 2019 Training Set']
print(val.shape)
print(trainy_moredt.shape)


(415, 8)
(3736, 8)


In [15]:
# splitting training into 80-20 and the 20 further into 75-25 (75 for ensemble and 25 our internal validation)

train, vals = train_test_split(trainy_moredt, \
                                       test_size=0.20, \
                                       random_state=243,\
                                       stratify=trainy_moredt[['Score_bin', 'sex_bin','age_bin']])
val_ForEnsemble, val_internal = train_test_split(vals, \
                                       test_size=0.25, \
                                       random_state=243,\
                                       stratify=vals[['Score_bin', 'sex_bin','age_bin']])

In [16]:
print("train shape >",train.shape)
print("val_ForEnsemble shape >",val_ForEnsemble.shape)
print("val_internal shape >",val_internal.shape)

print(train.Score_bin.value_counts())
print(val_ForEnsemble.Score_bin.value_counts())
print(val_internal.Score_bin.value_counts())


train shape > (2988, 8)
val_ForEnsemble shape > (561, 8)
val_internal shape > (187, 8)
8    310
9    307
0    304
2    301
6    300
3    300
7    299
1    292
5    288
4    287
Name: Score_bin, dtype: int64
8    59
6    57
2    57
1    57
9    56
7    56
3    56
0    56
4    54
5    53
Name: Score_bin, dtype: int64
9    21
5    21
3    19
0    19
8    18
7    18
6    18
4    18
1    18
2    17
Name: Score_bin, dtype: int64


In [17]:
# adding category column
train['sample'] = 'train'
val_ForEnsemble['sample'] = 'val_forEnsemble'
val_internal['sample'] = 'val_internal'
val['sample'] = 'val'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [18]:
allY_withsegments = train.append(val_ForEnsemble).append(val_internal).append(val).copy()
allY_withsegments['sample'].value_counts()

train              2988
val_forEnsemble     561
val                 415
val_internal        187
Name: sample, dtype: int64

In [19]:
allY_withsegments.to_csv(data_dir+'allY_withsegments.csv', sep=',',index=False)


In [20]:
all_data = pd.merge(all_data, allY_withsegments[['residual_fluid_intelligence_score','SUBJECTKEY','sample']], on='SUBJECTKEY',how='left')



In [21]:
all_data['sample'].isna().value_counts()

False    4151
True     3651
Name: sample, dtype: int64

In [22]:
all_data.to_csv(data_dir+'all_volumetrix_processed.csv', sep=',',index=False)

Unnamed: 0,INTERVIEW_AGE,SRI24AMYGDALALGM,SRI24AMYGDALARGM,SRI24ANGULARLGM,SRI24ANGULARRGM,SRI24CALCARINELGM,SRI24CALCARINERGM,SRI24CAUDATELGM,SRI24CAUDATERGM,SRI24CBLMHEMIWHTLWM,...,SRI24VTLSLATERALVTLLCSF,SRI24VTLSLATERALVTLRCSF,SRI24VTLSTHIRDVTLLCSF,SRI24VTLSTHIRDVTLRCSF,SRI24WM400WM400LWM,STUDY_COHORT_NAME,SUBJECTKEY,sex_bin,residual_fluid_intelligence_score,sample
0,0.730769,0.414616,0.498913,0.408479,0.534032,0.466244,0.427001,0.495336,0.517034,0.457057,...,0.113070,0.078897,0.219352,0.268849,0.573540,ABCD NP Challenge 2019 Training Set,NDAR_INV47FFMT85,1.0,18.617968,val_internal
1,0.653846,0.503110,0.495210,0.403832,0.259560,0.373547,0.284987,0.463980,0.324082,0.295725,...,0.072311,0.042373,0.146441,0.109464,0.375327,ABCD NP Challenge 2019 Training Set,NDAR_INVGP0KJLU3,0.0,3.453365,train
2,0.500000,0.362114,0.445087,0.258911,0.380400,0.510958,0.545192,0.606682,0.493726,0.578211,...,0.210519,0.047408,0.296774,0.326918,0.692629,ABCD NP Challenge 2019 Training Set,NDAR_INVPMHGZU3M,1.0,0.632251,train
3,0.384615,0.354227,0.495866,0.564634,0.402903,0.279941,0.348822,0.280813,0.199971,0.457937,...,0.025747,0.005786,0.096684,0.154030,0.566847,ABCD NP Challenge 2019 Training Set,NDAR_INVRPYKZRFL,1.0,1.488461,train
4,0.500000,0.504922,0.536298,0.415827,0.742165,0.334548,0.394898,0.513267,0.510823,0.571353,...,0.107885,0.038739,0.215837,0.247668,0.429555,ABCD NP Challenge 2019 Training Set,NDAR_INVB8UX2D6Y,1.0,1.774360,val_internal
5,0.076923,0.548186,0.623860,0.366376,0.477545,0.430969,0.554960,0.278450,0.239574,0.339701,...,0.060954,0.030514,0.293549,0.253250,0.484964,ABCD NP Challenge 2019 Training Set,NDAR_INVXHD5E4MN,0.0,6.180170,val_forEnsemble
6,0.307692,0.282067,0.379114,0.295948,0.464647,0.357310,0.311682,0.476758,0.549624,0.575798,...,0.086915,0.103254,0.267514,0.265772,0.678909,ABCD NP Challenge 2019 Training Set,NDAR_INVLGBWLRM4,1.0,11.878714,train
7,0.500000,0.574076,0.609198,0.375601,0.474799,0.266511,0.257622,0.443047,0.366232,0.453853,...,0.069557,0.035983,0.456799,0.351558,0.736634,ABCD NP Challenge 2019 Training Set,NDAR_INVUCGJ1U8F,1.0,1.796625,train
8,0.269231,0.350792,0.536245,0.365410,0.441224,0.446118,0.494009,0.519339,0.497299,0.451206,...,0.091726,0.053141,0.248261,0.210657,0.422191,ABCD NP Challenge 2019 Training Set,NDAR_INVAVR6TURT,0.0,2.199330,train
9,0.384615,0.409298,0.545981,0.261631,0.376005,0.454516,0.756365,0.435638,0.368323,0.370388,...,0.042634,0.032454,0.199797,0.150750,0.554798,ABCD NP Challenge 2019 Training Set,NDAR_INV930295U4,0.0,6.921028,train
