# Data set stratification

This script randomly assigns training/test labels to each entry in a data set.

One quarter (1/4) of the data is assigned as test, and the rest as training. The labeling
is stratified for censoring so that both the testing and training pieces have about the same
proportion of censored entries.

In [1]:
import datasets as ds
import pandas as pd
import numpy as np

This cell can be used for all data sets except *colon*. *colon* is special because it has 3 types of events instead of just 2. Just change the first line to run a different data set.

In [38]:
# Select the data set to label by leaving exactly one of these lines uncommented.
#data = ds._pbc
#data = ds._lung
#data = ds._nwtco
data = ds._flchain

# The unlabeled input is read from "<name>_org.csv", derived from the
# configured "<name>.csv" filename; the separator is sniffed by the python engine.
df = pd.read_csv(data['filename'][:-4] + "_org.csv",
                 sep=None, engine='python')
# Number of folds; fold 0 becomes the test set, i.e. about 1/k of the rows.
k = 4

# flchain has three entries with time zero, remove them
if 'flchain' in data['filename']:
    df = df[(df[data['timecol']] > 0)]

# Need shape later
n, d = df.shape

# Random reordering, then group rows by event status. Dealing the fold
# numbers out over the event-sorted rows gives every fold about the same
# share of each event value (this is the stratification).
df = df.reindex(np.random.permutation(df.index))
# DataFrame.sort() no longer exists in pandas; sort_values() is the replacement.
df = df.sort_values(data['eventcol'])

# Fold number per row position: 0, 1, ..., k-1, 0, 1, ...
assignments = np.arange(n) % k

print(assignments.shape)
print(df.shape)

# Create a new column that specifies set: fold 0 is testing, the rest is
# training. Built in one step so the column never holds mixed int/str values.
df['set'] = np.where(assignments == 0, 'testing', 'training')

print("Training size:", np.sum(df['set'] == 'training'))
print("Testing size:", np.sum(df['set'] == 'testing'))

# Restore the original row order before the frame is written out.
df = df.sort_index()

(7871,)
(7871, 11)
Training size: 5903
Testing size: 1968


Write the labeled data to a new file.

In [39]:
# Save the labeled frame under the data set's configured filename.
# Missing values are written as 'NA' and the row index is left out.
fname = data['filename']
print(fname)
df.to_csv(path_or_buf=fname, index=False, na_rep='NA')

data/flchain.csv


# Colon

The *colon* data set is kind of special. It has 3 events, two of which must be combined before stratification is possible.

In [48]:
data = ds._colon

df = pd.read_csv(data['filename'], sep=None, engine='python')
n, d = df.shape
k = 4

# Sum of 'status' per id over the rows with etype == 1, computed in a single
# grouped pass instead of re-masking the whole frame once per id.
status_by_id = (df.loc[df['etype'] == 1]
                .groupby('id')['status']
                .sum()
                .to_dict())

# Construct lists of events, censored.
# An id whose etype == 1 rows have a status sum below 1 (or that has no such
# rows at all) counts as censored; every other id counts as an event.
events = []
censored = []

for i in df['id'].unique():
    if status_by_id.get(i, 0) < 1:
        censored.append(i)
    else:
        events.append(i)



0

In [64]:
trainingids = []
testingids = []
# Number of folds; fold 0 becomes the test set, i.e. about 1/k of the ids.
k = 4

# Split the event ids and the censored ids separately so that both sets end
# up with about the same proportion of each group.
for group in [events, censored]:
    ids = np.random.permutation(group)
    n = len(ids)

    # Fold number per shuffled id: 0, 1, ..., k-1, 0, 1, ...
    assignments = np.arange(n) % k

    testingids.extend(ids[assignments == 0])
    trainingids.extend(ids[assignments != 0])

# Placeholder value 1 for rows whose id is in neither list. The column is
# created with object dtype so the string labels below are not written into
# an integer column.
df['set'] = pd.Series(1, index=df.index, dtype=object)

# Label all rows of each id at once instead of scanning the frame once per id.
df.loc[df['id'].isin(trainingids), 'set'] = 'training'
df.loc[df['id'].isin(testingids), 'set'] = 'testing'

print("Training size:", np.sum(df['set'] == 'training'))
print("Testing size:", np.sum(df['set'] == 'testing'))
df

Training size: 1392
Testing size: 466


Unnamed: 0,id,study,rx,sex,age,obstruct,perfor,adhere,nodes,status,differ,extent,surg,node4,time,etype,set
0,1,1,Lev+5FU,1,43,0,0,0,5,1,2,3,0,1,1521,2,training
1,1,1,Lev+5FU,1,43,0,0,0,5,1,2,3,0,1,968,1,training
2,2,1,Lev+5FU,1,63,0,0,0,1,0,2,3,0,0,3087,2,training
3,2,1,Lev+5FU,1,63,0,0,0,1,0,2,3,0,0,3087,1,training
4,3,1,Obs,0,71,0,0,1,7,1,2,2,0,1,963,2,training
5,3,1,Obs,0,71,0,0,1,7,1,2,2,0,1,542,1,training
6,4,1,Lev+5FU,0,66,1,0,0,6,1,2,3,1,1,293,2,training
7,4,1,Lev+5FU,0,66,1,0,0,6,1,2,3,1,1,245,1,training
8,5,1,Obs,1,69,0,0,0,22,1,2,3,1,1,659,2,testing
9,5,1,Obs,1,69,0,0,0,22,1,2,3,1,1,523,1,testing


Write the data to file.

In [65]:
# Build the output name by dropping the last 8 characters of the configured
# filename and appending '.csv', then save the labeled frame there.
# Missing values are written as 'NA' and the row index is left out.
stem = data['filename'][:-8]
fname = stem + '.csv'
print(fname)
df.to_csv(path_or_buf=fname, index=False, na_rep='NA')

data/colon.csv
