### Import packages

In [1]:
import pandas as pd
import numpy as np
import os

### Download Data

In [2]:
%env SET = bec-data-MetroHealth-Dascena:/HarmonizedT-FeatureT/DNAnexus/
    
datadir = "data/" + os.environ['SET'].replace(':','/')
print("Python variable datadir=" + datadir)

env: SET=bec-data-MetroHealth-Dascena:/HarmonizedT-FeatureT/DNAnexus/
Python variable datadir=data/bec-data-MetroHealth-Dascena//HarmonizedT-FeatureT/DNAnexus/


In [3]:
%%bash
DIR=data/${SET/:/\/}
echo local directory: $DIR
mkdir -p $DIR
dx download -rf "$SET/*" --output $DIR

ls -lh $DIR

local directory: data/bec-data-MetroHealth-Dascena//HarmonizedT-FeatureT/DNAnexus/
total 112K
-rw-r--r-- 1 root root 6.0K Sep 19 13:29 123_coding_dictionary.csv
-rw-r--r-- 1 root root  817 Sep 19 13:29 123_data_dictionary.csv
-rw-r--r-- 1 root root 3.5K Sep 19 13:29 Data1.csv
-rw-r--r-- 1 root root  12K Sep 19 13:29 Data123.csv
-rw-r--r-- 1 root root 6.4K Sep 19 13:29 Data2.csv
-rw-r--r-- 1 root root 3.7K Sep 19 13:29 Data3.csv
-rw-r--r-- 1 root root  61K Sep 19 13:29 Data_transform_assignment_RNM.ipynb
-rw-r--r-- 1 root root  114 Sep 19 13:29 example_codings_dict.csv
-rw-r--r-- 1 root root  100 Sep 19 13:29 example_data_dict.csv


### Read CSV Files into Pandas DataFrame

#### Input files: 

In [4]:
df1 = pd.read_csv(datadir + "Data1.csv")
df2 = pd.read_csv(datadir + "Data2.csv")
df3 = pd.read_csv(datadir + "Data3.csv")

#### Example files:

In [17]:
example_codings_dict = pd.read_csv(datadir + "example_codings_dict.csv")
example_data_dict = pd.read_csv(datadir + "example_data_dict.csv")

### Explore the Data

#### Describe df1: 

In [166]:
df1.head()

Unnamed: 0,Patient_ID,Intake_age,Sex,Race,Last_seen
0,DJ1DBUTML8,66.351,M,White,2/11/21
1,DB77YVCX8R,66.416,F,White,2/11/21
2,D7NAYHHQAI,66.603,M,White,2/10/21
3,D74HHURP99,66.663,M,White,2/9/21
4,D2Y7HSIRWK,67.137,F,White,2/9/21


In [167]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
Patient_ID    100 non-null object
Intake_age    100 non-null object
Sex           100 non-null object
Race          99 non-null object
Last_seen     100 non-null object
dtypes: object(5)
memory usage: 4.0+ KB


In [6]:
# Is df1.Patient_ID a unique key -> YES -> one patient per row (100 rows)
print(len(df1.Patient_ID.unique()) == len(df1.Patient_ID))
print(len(df1.Patient_ID.unique()), len(df1.Patient_ID))

True
100 100


#### Describe df2: 

In [7]:
df2.head()

Unnamed: 0,Patient_ID,PrimaryDiagnosisSiteCode,Histology,ClinTStage,ClinNStage,ClinMStage
0,DJ1DBUTML8,C49.1,Malignant Peripheral Nerve Sheath Tumor,Tx,N0,M0
1,DB77YVCX8R,C49.2,Fibromyxosarcoma,Tx,c0,c0
2,D7NAYHHQAI,C49.2,Malignant Peripheral Nerve Sheath Tumor,Unknown/Not Reported,c0,c0i
3,D74HHURP99,C49.2,"Synovial Sarcoma, NOS",Unknown/Not Reported,c0,c0
4,D2Y7HSIRWK,C49.2,"Liposarcoma, Well Differentiated",Unknown/Not Reported,N0,M0


In [8]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 6 columns):
Patient_ID                  102 non-null object
PrimaryDiagnosisSiteCode    102 non-null object
Histology                   102 non-null object
ClinTStage                  93 non-null object
ClinNStage                  82 non-null object
ClinMStage                  83 non-null object
dtypes: object(6)
memory usage: 4.9+ KB


In [9]:
# Is df2.Patient_ID a unique key -> NO -> there are two patients that appear twice
print(len(df2.Patient_ID.unique()) == len(df2.Patient_ID))
print(len(df2.Patient_ID.unique()), len(df2.Patient_ID))

False
100 102


In [10]:
df2.drop_duplicates().shape == df2.shape  # No duplicates 

True

In [173]:
print(' Primary Diagnosi Code - unique values:')
print(df2.PrimaryDiagnosisSiteCode.unique(), end='\n\n')
print('Histology - unique values:')
print(df2.Histology.unique(), end='\n\n')
print('TStage - unique values:')
print(df2.ClinTStage.unique(), end='\n\n')  
print('NStage - unique values:')
print(df2.ClinNStage.unique(), end='\n\n')
print('MStage - unique values:')
print(df2.ClinMStage.unique(), end='\n\n')

 Primary Diagnosi Code - unique values:
['C49.1' 'C49.2' 'C49.4' 'C49.5' 'C49.3' 'C34.3' 'C64.9' 'C49.0' 'C48.0'
 'C18.9' 'C09.1' 'C74.9' 'C49.6' 'C63.1' 'C38.1' 'C16.1' 'C48.1' 'C17.9'
 'C54.9' 'C18.2' 'C47.1' 'C61.9']

Histology - unique values:
['Malignant Peripheral Nerve Sheath Tumor' 'Fibromyxosarcoma'
 'Synovial Sarcoma, NOS' 'Liposarcoma, Well Differentiated'
 'Undifferentiated Sarcoma' 'Leiomyosarcoma, NOS' 'Retinoblastoma, NOS'
 'Meningioma, NOS' 'Dedifferentiated Liposarcoma'
 'Papillary Microcarcinoma' 'Invasive Carcinoma of No Special Type'
 'Papillary Adenocarcinoma, NOS' 'Clear Cell Adenocarcinoma, NOS'
 'Synovial Sarcoma, Spindle Cell' 'Transitional Cell Carcinoma In Situ'
 'Adenocarcinoma, NOS' 'Hepatocellular Carcinoma, NOS'
 'Malignant Fibrous Histiocytoma']

TStage - unique values:
['Tx' 'Unknown/Not Reported' nan 'T2b'
 'No TNM applicable for this site/histology combination' 'T2' 'T1' 'c2a'
 'c1c' 'T1a' 'T1b' 'c2' 'cx' 'c4b' 'cis' 'c1' 'c1a' 'c1b' 'c2b']

NStage - 

#### Describe df3: 

In [11]:
df3.head()

Unnamed: 0,Patient_ID,pathologic tumor size,mitotic rate,HEIGHT3,WEIGHT,BMI,Current_patent
0,DJ1DBUTML8,8.0,25.0,70.0,10.0,1.43,N
1,DB77YVCX8R,18.0,7.0,48.62,29.92,8.9,Y
2,D7NAYHHQAI,25.0,2.0,44.0,29.3,10.64,Y
3,D74HHURP99,11.5,7.0,52.5,43.0,10.97,Y
4,D2Y7HSIRWK,6.5,10.0,50.0,40.26,11.32,Y


In [12]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 7 columns):
Patient_ID               103 non-null object
pathologic tumor size    101 non-null float64
mitotic rate             102 non-null float64
HEIGHT3                  103 non-null float64
WEIGHT                   103 non-null float64
BMI                      103 non-null float64
Current_patent           97 non-null object
dtypes: float64(5), object(2)
memory usage: 5.8+ KB


In [13]:
df3.drop_duplicates().shape == df3.shape # No duplicates 

True

In [15]:
# Is df3.Patient_ID a unique key -> YES -> one patient per row (103 rows)
print(len(df3.Patient_ID.unique()) == len(df3.Patient_ID))
print(len(df3.Patient_ID.unique()), len(df3.Patient_ID))

True
103 103


In [16]:
# Df3 has (the same) 3 extra patients compared to df1 and df2:
print(set(df3.Patient_ID.unique()) - set(df1.Patient_ID.unique()))
print(set(df3.Patient_ID.unique()) - set(df2.Patient_ID.unique()))

{'6F60Y70O', '10R407K3', '1C060V70'}
{'6F60Y70O', '10R407K3', '1C060V70'}


### Task 1:

❖	Create a data file named Data123.csv. This file will combine the data from the three input files into a single output with one patient per row. 


### Solution 1

In [22]:
# Three-way megre - df1 and df2 and joined by an inner join, while df3 is joined using a right join to include the 3 extra patients:
Data123 = pd.merge(pd.merge(df1,df2,on='Patient_ID', how ="inner"),df3,on='Patient_ID', how ="right")

In [27]:
print(Data123.shape)
print(len(Data123.Patient_ID.unique()))
#Data123.sort_values(by=['Intake_age']).tail()

(103, 16)
103


In [24]:
# Use groupby followedby aggregate to collapse multiple patients rows into one row per patient:  
Data123 = Data123.groupby(['Patient_ID'], as_index=False).agg({'Sex': 'first', 
                                                                 'Race': 'first',
                                                                 'Intake_age': set,
                                                                 'Last_seen': set, 
                                                                 'PrimaryDiagnosisSiteCode': set,
                                                                 'Histology': set,
                                                                 'ClinTStage': set,
                                                                 'ClinNStage': set,
                                                                 'ClinMStage': set,
                                                                 'pathologic tumor size': set,
                                                                 'mitotic rate': set,
                                                                 'HEIGHT3': set,
                                                                 'WEIGHT': set,
                                                                 'BMI': set,
                                                                 'Current_patent': set})

In [25]:
# Get rid off sets in cases where there's a signgle value in record:

col_lst3_ = Data123.columns.to_list()[3:]

# Lists of indices where value is a set of two values 
idx_lst = []
# Lists of columns where value is a set of two values -> we will use this information in the solution for Task 3
col_lst = []

for name, values in Data123[col_lst3_].iteritems():
    for index, value in values.iteritems():
        if len(value) == 1:
            Data123.at[index , name] = list(value)[0]
        else:
            idx_lst.append(index)
            col_lst.append(name)
            
print('rows with sets of values:', list(set(idx_lst)))
print('columns with sets of values:', list(set(col_lst)))

rows with sets of values: [62, 14]
columns with sets of values: ['ClinTStage', 'PrimaryDiagnosisSiteCode', 'ClinMStage', 'ClinNStage', 'Histology']


In [184]:
# Example - records in row 62:
Data123.loc[62]

Patient_ID                                                       84GIKAQ7SE
Sex                                                                       F
Race                                                                  White
Intake_age                                                           74.071
Last_seen                                                           1/22/21
PrimaryDiagnosisSiteCode                                     {C38.1, C49.1}
Histology                   {Leiomyosarcoma, NOS, Undifferentiated Sarcoma}
ClinTStage                                                       {T1b, T2b}
ClinNStage                                       {cx, Unknown/Not Reported}
ClinMStage                                       {Unknown/Not Reported, MX}
pathologic tumor size                                                    17
mitotic rate                                                              3
HEIGHT3                                                               44.88
WEIGHT      

In [26]:
print(Data123.shape)
Data123.sort_values(by=['Intake_age']).tail()

(103, 16)


Unnamed: 0,Patient_ID,Sex,Race,Intake_age,Last_seen,PrimaryDiagnosisSiteCode,Histology,ClinTStage,ClinNStage,ClinMStage,pathologic tumor size,mitotic rate,HEIGHT3,WEIGHT,BMI,Current_patent
1,095QMAZA7F,F,White,Age 90 or older,12/23/20,C49.2,"Adenocarcinoma, NOS",T2,N0,M0,15.1,12.0,29.3,28.6,16.99,N
0,06VD4EX00D,F,White,Age 90 or older,12/23/20,C49.6,Dedifferentiated Liposarcoma,T1,c0,c0,,,52.0,65.5,17.03,Y
7,10R407K3,,,,,,,,,,7.1,23.0,24.02,13.11,15.97,Y
10,1C060V70,,,,,,,,,,21.0,5.0,52.6,60.0,15.25,N
50,6F60Y70O,,,,,,,,,,22.0,5.0,53.7,49.4,14.35,N


In [30]:
# test: data from the three input files with one patient per row.
print(len(Data123.Patient_ID.unique()) == Data123.shape[0])
print(len(Data123.Patient_ID.unique()), Data123.shape[0])

True
103 103


In [187]:
# save dateframe Data123 to csv file:
Data123.to_csv('Data123.csv') 

##### Uploading to DNAnexus:

In [188]:
%%bash
dx upload 'Data123.csv' --path /HarmonizedT-FeatureT/DNAnexus/ --brief

file-GGYqbzQ0k906Y6KY5f04724Q


### Task 2:

❖	Create a data dictionary file named  123_data_dictionary.csv. This file will contain one row for each column of data in the combined data file. 

➢	For each column of data, list the name, the type of data(string, int, float, date), description, and units if applicable.  
➢	In addition, for string fields, all unique values should be compiled in a codings list. Data columns for which a coding list is compiled should be marked in the data dictionary with an entry in the “coding_name” column.
➢	An example file named example_data_dict.csv is provided.


In [189]:
example_data_dict.head()

Unnamed: 0,name,coding_name,type,description,units
0,Patient_ID,,string,Patient Identifier,
1,Sex,SEX,string,Sex,


### Solution 2

In [31]:
# Buiding a python dict
dictionary = {
    'name' : Data123.columns.to_list(),
    'coding_name': [np.nan, 'SEX', 'RACE', np.nan, np.nan, 'ICD10', 'Diagnosis', 'TNM staging - T category', 'TNM staging - N category', 'TNM staging - M category', np.nan, np.nan, np.nan, np.nan, np.nan, 'NY'],
    'type': ['string', 'string', 'string', 'float', 'date', 'string', 'string', 'string', 'string', 'string', 'float', 'float', 'float', 'float', 'float', 'string'],
    'description': ['Patient Identifier', 'Sex', 'Race', 'Intake Age', 'Date of Last Visit', 'Primary Diagnosis Code', 
                    'Patient condition or diagnosis name', 'Stage of original (primary) tumor', 'Stage of the lymph nodes', 'Stage of the Metastasis', 
                    'Pathologic Tumor Size', 'Measure of Cell Growth', 'Tumor Height', 'Tumor Weight', 'Tumor Body Mass Index', 'Current Patient'],
    'units': [np.nan, np.nan, np.nan, 'year', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 'cm', 'mm2', 'mm', 'gr', 'kg / m2', np.nan]  
}

# creating a Dataframe object 
data_dictionary_df = pd.DataFrame(dictionary)

# A note:
#--------
# I assigned the unit 'year' to the column 'Intake Age' as its type should have been a float (as can be seen in the data directort below).
# Given values like 'Age 90 or older' the whole column is cast as a string.
# In a real wrok scenario, I would change 'Age 90 or older' to 90.000, for example, and cast the column as a float. 

In [32]:
data_dictionary_df

Unnamed: 0,name,coding_name,type,description,units
0,Patient_ID,,string,Patient Identifier,
1,Sex,SEX,string,Sex,
2,Race,RACE,string,Race,
3,Intake_age,,float,Intake Age,year
4,Last_seen,,date,Date of Last Visit,
5,PrimaryDiagnosisSiteCode,ICD10,string,Primary Diagnosis Code,
6,Histology,Diagnosis,string,Patient condition or diagnosis name,
7,ClinTStage,TNM staging - T category,string,Stage of original (primary) tumor,
8,ClinNStage,TNM staging - N category,string,Stage of the lymph nodes,
9,ClinMStage,TNM staging - M category,string,Stage of the Metastasis,


In [192]:
# saving dataframe to a csv file
data_dictionary_df.to_csv('123_data_dictionary.csv')

##### Uploading to DNAnexus:

In [193]:
%%bash
dx upload '123_data_dictionary.csv' --path /HarmonizedT-FeatureT/DNAnexus/ --brief

file-GGYqbzj0k9004Pb71xp7f5VV


### Task 3:

❖	Create a coding dictionary file named 123_coding_dictionary.csv. This file will have a row for each unique value from every categorical data column. 

➢	The value found in the data will be listed in the “code” column.  
➢	In some cases the codes can be translated to a meaning. 
➢	If the meaning is not obvious, the code and meaning columns can hold the same values.
➢	An example file named example_coding_dict.csv is provided.

In [194]:
example_codings_dict

Unnamed: 0,coding_name,code,meaning
0,NY,N,No
1,NY,,NA;Not Applicable
2,NY,U,U;UNK;Unknown
3,NY,Y,Yes
4,SEX,F,Female
5,SEX,M,Male


### Solution 3

#### Step 1 - Building a data dictionary for categorical columns only using the outcome of solution 2

In [34]:
# Creat a subset dataframe using data_dictionary_df that holds info on categorical columns only: 
data_dictionary_categorical_df = data_dictionary_df.loc[~data_dictionary_df.coding_name.isna()]

In [35]:
data_dictionary_categorical_df

Unnamed: 0,name,coding_name,type,description,units
1,Sex,SEX,string,Sex,
2,Race,RACE,string,Race,
5,PrimaryDiagnosisSiteCode,ICD10,string,Primary Diagnosis Code,
6,Histology,Diagnosis,string,Patient condition or diagnosis name,
7,ClinTStage,TNM staging - T category,string,Stage of original (primary) tumor,
8,ClinNStage,TNM staging - N category,string,Stage of the lymph nodes,
9,ClinMStage,TNM staging - M category,string,Stage of the Metastasis,
15,Current_patent,NY,string,Current Patient,


#### Step 2 - creating a name-code dataframe

In [38]:
# Data123: columns and rows where values are data type set: all column (but Patient_ID) coming from df2, in rows 14 and 62

print('RE: Task1 -- Columns and Rows where values data type = set:')
print('rows with sets of values:', list(set(idx_lst)))
print('columns with sets of values:', list(set(col_lst)))

RE: Task1 -- Columns and Rows where values data type = set:
rows with sets of values: [62, 14]
columns with sets of values: ['ClinTStage', 'PrimaryDiagnosisSiteCode', 'ClinMStage', 'ClinNStage', 'Histology']


In [36]:
# Creat a dataframe using df2 and Data123 that connects between name and code: 

categorical_col_lst = data_dictionary_categorical_df.name.to_list()
#print(categorical_col_lst)

name = []
code = []

for col_name in categorical_col_lst:

    unq_val_name = col_name + "_unique_vals"
    
    if col_name in col_lst:  # col_lst is a list of the column names where Data123 has type set data in some of the records (please find the answer for Task1 above) 
        unique_vals_lst = list(df2[col_name].unique())
    else:
        unique_vals_lst = list(Data123[col_name].unique())
        
    locals()[col_name] = [col_name for n in range(len(unique_vals_lst))]
    locals()[unq_val_name] = unique_vals_lst

    name.append(locals()[col_name])
    code.append(locals()[unq_val_name])

# flat name:
flat_name = [x for lst in name for x in lst]
flat_code = [x for lst in code for x in lst]

# Creat a dataframe with two columns: 'name and 'code'
zipped = list(zip(flat_name, flat_code))
name_code_df = pd.DataFrame(zipped, columns=['name', 'code'])

In [39]:
name_code_df.head()

Unnamed: 0,name,code
0,Sex,F
1,Sex,M
2,Sex,
3,Race,White
4,Race,


#### Step 3 - creating a code-meaning dataframe

In [40]:
# Building a dictionary of codes values as the keys and meanings as the values:

dict_code = {'N': 'No', 'Y': 'Yes', 'y': 'Yes', 'NaN': 'NA;Not Applicable', 'U': 'U;UNK;Unknown',
            'F': 'Female', 'M': 'Male', 'nan': 'Not Applicable / Not Available',
             'No TNM applicable for this site/histology combination': 'TNM system not available',
             'Tx': 'no information about the primary tumor', 
             'T1': 'tumor size and/or amount of spread into nearby structures', 'T2': 'tumor size and/or amount of spread into nearby structures',
             'T1a': 'tumor is 1cm or less at its widest part', 'T1b': 'tumor is between 1cm and 2cm across', 
             'T2a': 'tumor is between 3cm and 4cm', 'T2b': 'tumor is between 4cm and 5cm',
             'c1': 'TME-Classical complement pathway', 'c1a': 'TME-Classical complement pathway', 'c1b': 'TME-Classical complement pathway', 
             'c1c': 'TME-Classical complement pathway', 'c2': 'TME-Classical complement pathway',  'c2a': 'TME-Classical complement pathway',  
             'c2b': 'TME-Classical complement pathway', 'c4b': 'TME-Classical complement pathway', 
             'c0': 'stage 0 - carcinoma in situ', 'c0i': 'stage 0 - carcinoma in situ', 'cis': 'carcinoma in situ', 'cx': 'chemotherapy combination: cisplatin & capecitabine (Xeloda)',
             'N0': 'nearby lymph nodes do not contain cancer', 'Nx': 'no information about the nearby lymph nodes',
             'M0': 'no distant metastasis' , 'M1': 'metastasis to distant organs or tissues', 
             'MX': 'cancers that could not be evaluated for Mitosis', 'C49.1': 'Diagnosis Code', 'C49.2': 'Diagnosis Code', 'C49.4': 'Diagnosis Code', 'C49.5': 'Diagnosis Code', 'C49.3': 'Diagnosis Code', 'C34.3': 'Diagnosis Code', 
             'C64.9': 'Diagnosis Code', 'C49.0': 'Diagnosis Code', 'C48.0': 'Diagnosis Code', 'C18.9': 'Diagnosis Code', 'C09.1': 'Diagnosis Code', 'C74.9': 'Diagnosis Code', 
             'C49.6': 'Diagnosis Code', 'C63.1': 'Diagnosis Code', 'C38.1':'Diagnosis Code', 'C16.1': 'Diagnosis Code', 'C48.1': 'Diagnosis Code', 
             'C17.9': 'Diagnosis Code', 'C54.9': 'Diagnosis Code', 'C18.2': 'Diagnosis Code', 'C47.1': 'Diagnosis Code', 'C61.9': 'Diagnosis Code'}

# Resources: 
# 1. Cancer Staging - https://www.cancer.org/treatment/understanding-your-diagnosis/staging
# 2. TNM staging - https://www.cancerresearchuk.org/about-cancer/lung-cancer/stages-types-grades/tnm-staging
# 3. Complement System - https://www.immunology.org/public-information/bitesized-immunology/systems-and-processes/complement-system

# A note:       
#--------
# I couldn't find info on 'c0i', therefore, the meaning in the table is my best guess. 

In [41]:
# Creating a dataframe with two columns: 'code' and 'meaning':

code_lst = list(dict_code.keys())      # a list of dict_code keys
meaning_lst = list(dict_code.values()) # a list of dict_code value 

zipped = list(zip(code_lst, meaning_lst))
code_meaning_df = pd.DataFrame(zipped, columns=['code', 'meaning'])

In [42]:
code_meaning_df.head()

Unnamed: 0,code,meaning
0,N,No
1,Y,Yes
2,y,Yes
3,,NA;Not Applicable
4,U,U;UNK;Unknown


#### Step 4 - joining all three tables to get the final dataframe

In [43]:
data_dictionary_categorical_df.head(1)

Unnamed: 0,name,coding_name,type,description,units
1,Sex,SEX,string,Sex,


In [44]:
name_code_df.head(1)

Unnamed: 0,name,code
0,Sex,F


In [45]:
code_meaning_df.head(1)

Unnamed: 0,code,meaning
0,N,No


In [46]:
# Creating coding_dictionary_df 
coding_dictionary_df = pd.merge(pd.merge(data_dictionary_categorical_df[['name', 'coding_name']], name_code_df, on='name', how ="inner"), code_meaning_df, on='code', how ="left")

# Updating the 'meaning' column when 'code' column is nan:
coding_dictionary_df.loc[coding_dictionary_df.code.isna(), 'meaning'] = 'NA;Not Applicable'

# Updating the 'meaning' column when 'meaning' column is nan:
coding_dictionary_df.loc[coding_dictionary_df.meaning.isna(), 'meaning'] = coding_dictionary_df.loc[coding_dictionary_df.meaning.isna()].code

In [48]:
# Drop the 'name' column to get the desired dataframe:
coding_dictionary_df = coding_dictionary_df.drop(['name'], axis=1)

In [51]:
print(coding_dictionary_df.shape)
coding_dictionary_df.head()

(92, 3)


Unnamed: 0,coding_name,code,meaning
0,SEX,F,Female
1,SEX,M,Male
2,SEX,,NA;Not Applicable
3,RACE,White,White
4,RACE,,NA;Not Applicable


In [211]:
# save dateframe Data123 to csv file:
coding_dictionary_df.to_csv('123_coding_dictionary.csv')

##### uploading to DNAnexus:

In [212]:
%%bash
dx upload '123_coding_dictionary.csv' --path /HarmonizedT-FeatureT/DNAnexus/ --brief

file-GGYqbzj0k90BQqq1P4gqy2PY
