In [1]:
import numpy as np
import pandas as pd
import os, sys
import json
import pprint

In [2]:
# File name of the raw CSV, looked up inside the input directory.
inp_fname = 'primary_tumor_data.csv'
# Short identifier used as the stem of every generated output file.
dataset_name = 'primary_tumor'

In [3]:
# Where the raw file is read from, and where every generated artifact is written.
input_dir = './raw'
output_dir = './processed'


def _in_output_dir(fname):
    """Return `fname` joined onto `output_dir`."""
    return os.path.join(output_dir, fname)


# Full dataset, train split, test split (no target), test key (id + target),
# and the sample JSON inference request.
outp_fname = _in_output_dir(f'{dataset_name}.csv')
outp_train_fname = _in_output_dir(f'{dataset_name}_train.csv')
outp_test_fname = _in_output_dir(f'{dataset_name}_test.csv')
outp_test_key_fname = _in_output_dir(f'{dataset_name}_test_key.csv')
outp_infer_instances = _in_output_dir(f'{dataset_name}_infer_req.json')

# Read Data

In [4]:
# Column names applied to the header-less raw file, in file order.
# The list is assembled from three groups purely for readability.
_leading_cols = ["age", "sex", "histologic-type", "degree-of-diffe"]
_flag_cols = [
    "bone", "bone-marrow", "lung", "pleura", "peritoneum", "liver", "brain",
    "skin", "neck", "supraclavicular", "axillar", "mediastinum", "abdominal",
]
cols = _leading_cols + _flag_cols + ["class"]

In [5]:
# The raw file has no header row, so column names are supplied from `cols`.
# NOTE(review): the preview shows age values wrapped in literal single quotes
# (e.g. '>=60'); confirm whether they should be stripped at load time.
raw_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_path, names=cols, header=None)
data.head()

Unnamed: 0,age,sex,histologic-type,degree-of-diffe,bone,bone-marrow,lung,pleura,peritoneum,liver,brain,skin,neck,supraclavicular,axillar,mediastinum,abdominal,class
0,'>=60',female,?,?,no,no,no,no,no,yes,no,no,no,no,no,no,no,lung
1,'>=60',male,?,poorly,no,no,no,no,no,yes,no,no,no,yes,no,yes,no,lung
2,'30-59',female,adeno,?,no,no,no,yes,no,no,no,yes,no,no,yes,no,no,breast
3,'30-59',female,adeno,?,no,no,no,no,yes,no,no,no,no,no,no,no,no,ovary
4,'30-59',female,adeno,?,no,no,no,yes,yes,no,no,no,no,no,no,no,no,ovary


In [6]:
# Name of the label column that is split off into the test key file.
target_col = "class"
# Name of the row-identifier column (added to the frame if it is missing).
id_col = "id"

# Insert Id Column

In [7]:
# Add a sequential id as the first column, but only when the frame does not
# already have one, so re-running this cell does not raise on a duplicate.
id_missing = id_col not in data.columns
if id_missing:
    n_rows = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(n_rows))
    print(data.head())
# Ids are kept as strings regardless of whether they were just generated.
data[id_col] = data[id_col].astype(str)

   id      age     sex histologic-type degree-of-diffe bone bone-marrow lung  \
0   0   '>=60'  female               ?               ?   no          no   no   
1   1   '>=60'    male               ?          poorly   no          no   no   
2   2  '30-59'  female           adeno               ?   no          no   no   
3   3  '30-59'  female           adeno               ?   no          no   no   
4   4  '30-59'  female           adeno               ?   no          no   no   

  pleura peritoneum liver brain skin neck supraclavicular axillar mediastinum  \
0     no         no   yes    no   no   no              no      no          no   
1     no         no   yes    no   no   no             yes      no         yes   
2    yes         no    no    no  yes   no              no     yes          no   
3     no        yes    no    no   no   no              no      no          no   
4    yes        yes    no    no   no   no              no      no          no   

  abdominal   class  
0        n

# Shuffle Data

In [8]:
# Randomly reorder all rows: frac=1 samples the whole frame without replacement,
# and the fixed random_state makes the resulting order repeatable.
data = data.sample(random_state=42, frac=1)
data.head()

Unnamed: 0,id,age,sex,histologic-type,degree-of-diffe,bone,bone-marrow,lung,pleura,peritoneum,liver,brain,skin,neck,supraclavicular,axillar,mediastinum,abdominal,class
284,284,'>=60',female,epidermoid,poorly,no,no,no,no,no,no,no,no,no,no,no,yes,no,lung
116,116,'30-59',male,adeno,?,yes,no,no,no,no,no,no,no,no,no,no,no,no,kidney
113,113,'30-59',female,?,?,yes,no,yes,no,no,no,no,no,no,no,no,no,no,kidney
42,42,'30-59',female,adeno,?,no,no,yes,no,no,yes,no,no,no,no,no,no,yes,pancreas
126,126,'30-59',female,adeno,?,no,no,no,yes,no,no,no,no,no,no,no,no,yes,ovary


# Replace null indicators

In [9]:
# The raw file marks missing values with a literal '?'; turn those into NaN.
data = data.replace(to_replace='?', value=np.nan)
data.head()

Unnamed: 0,id,age,sex,histologic-type,degree-of-diffe,bone,bone-marrow,lung,pleura,peritoneum,liver,brain,skin,neck,supraclavicular,axillar,mediastinum,abdominal,class
284,284,'>=60',female,epidermoid,poorly,no,no,no,no,no,no,no,no,no,no,no,yes,no,lung
116,116,'30-59',male,adeno,,yes,no,no,no,no,no,no,no,no,no,no,no,no,kidney
113,113,'30-59',female,,,yes,no,yes,no,no,no,no,no,no,no,no,no,no,kidney
42,42,'30-59',female,adeno,,no,no,yes,no,no,yes,no,no,no,no,no,no,yes,pancreas
126,126,'30-59',female,adeno,,no,no,no,yes,no,no,no,no,no,no,no,no,yes,ovary


# Save Main Data File

In [10]:
# Make sure the output directory exists before the first write: to_csv does not
# create missing parent directories, so a fresh checkout would fail here.
os.makedirs(output_dir, exist_ok=True)
# Write the full processed dataset, leaving out the pandas row index.
data.to_csv(outp_fname, index=False)

# Train Test Split

In [11]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for the test split.
test_size = 0.1

# Fixed random_state keeps the split repeatable between runs.
data_train, data_test = train_test_split(
    data,
    test_size=test_size,
    random_state=42,
)
print(data_train.shape, data_test.shape)

# The train file keeps the target; the test file drops it, and the matching
# id/target pairs go into a separate key file.
test_without_target = data_test.drop(columns=[target_col])
test_key = data_test[[id_col, target_col]]

data_train.to_csv(outp_train_fname, index=False)
test_without_target.to_csv(outp_test_fname, index=False)
test_key.to_csv(outp_test_key_fname, index=False)

(305, 19) (34, 19)


# JSON inference request instance

In [12]:
# Build a one-record sample request from the first row of the test split.
# NaN is mapped to None so missing values are written as JSON null.
infer_frame = data_test.replace({np.nan: None}).drop(columns=[target_col])
first_row = infer_frame.reset_index(drop=True).loc[0]
instance = first_row.to_dict()

infer_req_instance_dict = {"instances": [dict(instance)]}
pprint.pprint(infer_req_instance_dict)

with open(outp_infer_instances, 'w', encoding='utf8') as out_file:
    json.dump(infer_req_instance_dict, out_file, indent=2, ensure_ascii=False)

{'instances': [{'abdominal': 'no',
                'age': "'30-59'",
                'axillar': 'no',
                'bone': 'no',
                'bone-marrow': 'no',
                'brain': 'yes',
                'degree-of-diffe': 'poorly',
                'histologic-type': 'anaplastic',
                'id': '190',
                'liver': 'no',
                'lung': 'no',
                'mediastinum': 'no',
                'neck': 'no',
                'peritoneum': 'no',
                'pleura': 'no',
                'sex': 'male',
                'skin': 'no',
                'supraclavicular': 'no'}]}
