# Breast Cancer Survival Time after Diagnosis  


Data source:  U.S. Department of Health and Human Services, National Institutes of Health, National Cancer Institute  
Surveillance, Epidemiology, and End Results (SEER) Program - Incidences of Breast Cancer  
https://seer.cancer.gov/   

In [2]:
import numpy as np
import pandas as pd

#Data management
import lib.read_file as rf


In [3]:
#bring in data: load the raw SEER breast cancer file via the project reader
raw_data_path = './data/BREAST.TXT'
df = rf.read_file(raw_data_path)


In [3]:
#remove columns not needed
# The names are written as one whitespace-separated block and split into a
# list of strings; the order matches the original hand-written list.
columns_not_needed = """
    HISTO2V BEHO2V DX_CONF REPT_SRC EOD10_PE
    EOD10_NE EOD13 EOD2 EOD4 EOD_CODE
    TUMOR_1V TUMOR_2V TUMOR_3V DAJCCFL
    AGE_1REC SITERWHO ICDOTO9V ICDOT10V
    ICCC3WHO ICCC3XWHO BEHTREND HISTREC
    HISTRECB cs0204schema RAC_RECA RAC_RECY
    ORIGRECB AJCC_STG AJ_3SEER SSS77VZ SSSM2KPZ
    IHSLINK AYASITERWHO LYMSUBRWHO csschema
    CS8SITE CS10SITE CS11SITE CS13SITE CS15SITE
    CS16SITE VASINV CS9SITE CS12SITE ANNARBOR
    CSLYMPHN CSEXTEN CSTUMSIZ EOD10_SZ EOD10_EX
    EOD10_PN CS25SITE CSVLATES SURGSCOF
    CSVCURRENT intprim erstatus prstatus
    DAJCC7T DAJCC7N DAJCC7M DAJCC7STG ADJTM_6VALUE
    ADJNM_6VALUE ADJM_6VALUE ADJAJCCSTG her2
    T_VALUE N_VALUE M_VALUE EOD10_ND CSMETSDX
    CS1SITE CS2SITE CS3SITE CS4SITE CS5SITE
    CS6SITE DAJCCT DAJCCN DAJCCM DAJCCSTG
    DSS1977S DSS2000S CSVFIRST SURGPRIF SURGSITF
    NUMNODES NO_SURG SS_SURG SURGSCOP SURGSITE
    CSTSEVAL CSRGEVAL CSMTEVAL SUMM2K INSREC_PUB
    CS7SITE CSMETSDXB_PUB CSMETSDXBR_PUB
    CSMETSDXLIV_PUB CSMETSDXLUNG_PUB
""".split()

 
  

In [4]:
#Drop rows with no data (the rows labelled 0 through 3), modifying df in place
empty_row_labels = [0, 1, 2, 3]
df.drop(index=empty_row_labels, inplace=True)

In [5]:
#Drop columns not needed: remove every column named in columns_not_needed, in place
df.drop(columns=columns_not_needed, inplace=True)

In [6]:
#Remove whitespace
def _strip_whitespace(column):
    # Uses the .str accessor, so each column is expected to hold strings.
    return column.str.strip()


# Strip leading/trailing whitespace column by column and rebind df to the result.
df = df.apply(_strip_whitespace)

In [7]:
#Replace empty-string cells with the placeholder string '-1', in place
df.replace(to_replace='', value='-1', inplace=True)

In [10]:
#Inspect columns and values: print each column name with its distinct values
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(column_name, distinct_values)

#Check for sparsity (disabled): count the cells per column that contain '-1'
# for i in df.columns:
#     print(i, df[df[i].str.contains('-1')][i].count())
    

PUBCSNUM ['07000100' '07000115' '07000118' ... '41598077' '41598096' '41598128']
REG ['0000001502' '0000001520' '0000001521' '0000001522' '0000001523'
 '0000001501' '0000001526' '0000001525' '0000001527']
MAR_STAT ['5' '2' '1' '3' '9' '4' '6']
RACE1V ['01' '02' '04' '99' '96' '05' '06' '98' '03' '08' '16' '07' '15' '11'
 '10' '13' '21' '17' '14' '27' '97' '32' '12' '25' '20' '30' '28' '31'
 '22' '26']
NHIADE ['0' '4' '5' '7' '2' '6' '1' '3' '8']
SEX ['2' '1']
AGE_DX ['065' '061' '045' '078' '076' '067' '087' '070' '068' '058' '088' '072'
 '069' '073' '060' '054' '089' '077' '079' '059' '081' '063' '064' '074'
 '046' '055' '080' '066' '075' '037' '050' '051' '083' '052' '062' '084'
 '085' '039' '056' '040' '042' '043' '053' '041' '071' '091' '090' '086'
 '047' '093' '057' '082' '095' '048' '094' '092' '097' '049' '035' '098'
 '096' '099' '044' '101' '034' '038' '033' '031' '036' '032' '027' '024'
 '030' '026' '028' '029' '020' '023' '025' '105' '022' '100' '999' '019'
 '106' '015' '103'

MALIGCOUNT ['02' '03' '04' '01' '05' '06' '09' '07' '08' '99' '25' '12' '14']
BENBORDCOUNT ['00' '01' '02' '03' '04' '05']


### Columns:  
PUBCSNUM  - Patient ID number.  
REG - SEER location on which the data are based.  
MAR_STAT - coded marital status.  
RACE1V - coded ancestry information.  
NHIADE - coded Hispanic ancestry information.  
SEX - coded gender.  
AGE_DX - integer age at diagnosis.  
YR_BRTH - integer birth year of patient.  
SEQ_NUM - integer sequence number of the patient's lifetime diagnosed neoplasms, where '0' indicates the first and only lifetime diagnosis; after a second diagnosis, '0' changes to '1'.  
MDXRECMP - integer of month of diagnosis.  
YEAR_DX - integer of year of diagnosis.  
PRIMSITE - coded primary site of diagnosis.  
LATERAL - coded side of paired organs for the diagnosis.  
HISTO3V - coded histology type by __The International Classification of Diseases for Oncology__, Third Edition (ICD-O-3).  
BEHO3V - coded behavior by ICD-O-3.  
GRADE - coded for neoplasm grade by ICD-O-3.  
REC_NO - integer of SEER record numbers, number of previous diagnoses.  
TYPE_FU - coded for follow-up type.   
HST_STGA - coded for extent of disease.  
FIRSTPRM -  coded for first malignant primary indicator.  
ST_CNTY -  coded for state and county of residence at diagnosis.  
CODPUB -  coded for cause of death.  
CODPUBKM -  coded for cause of death for KS and mesothelioma.  
STAT_REC -  coded for vital status at time of study cut-off date.  
VSRTSADX -  coded for cause specific death, records if a patient has died of the diagnosis.  
ODTHCLASS -  coded for other causes of death (other than the diagnosis).  
srv_time_mon -  integer for survival time in months.  
srv_time_mon_flag -  coded for survival time flags.  
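INSREC_PUB -  coded for insurance status (note: listed in `columns_not_needed`, so it is dropped from the saved data).  
CS7SITE -  coded for site specific information (note: listed in `columns_not_needed`, so it is dropped from the saved data).    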
brst_sub - coded for ER status and HR status combinations.    
MALIGCOUNT - integer of a patient’s total reported in situ/malignant cancers.  
BENBORDCOUNT - integer of a patient’s total reported benign/borderline cancers.

In [9]:
#Print dataframe to csv for later use
# No index argument is passed, so pandas' default of also writing the row index applies.
output_csv_path = './data/set.csv'
df.to_csv(output_csv_path)