In [1]:
import pandas as pd
# Short column names used throughout the notebook, keyed by the verbose
# headers found in the SEER export file.
rename_dict = {
    'Patient ID': 'id',
    'Site recode ICD-O-3/WHO 2008': 'site',
    'Sequence number': 'seq_number',
    'Sex': 'Gender',
    'Year of diagnosis': 'diag_year',
    'Age recode with <1 year olds': 'Age',
    'CS tumor size (2004-2015)': 'TMRSZ',
    'Regional nodes positive (1988+)': 'LYMND',
    'Derived AJCC Stage Group, 7th ed (2010-2015)': 'AJCCstage',
    'Visceral and Parietal Pleural Invasion Recode (2010+)': 'SSF2',
    'Separate Tumor Nodules Ipsilateral Lung Recode (2010+)': 'SSF1',
    'Grade (thru 2017)': 'DIFF',
    'Survival months': 'survival_month',
    'Vital status recode (study cutoff used)': 'vital_sts',
    'ICD-O-3 Hist/behav': 'histologic',
    'Behavior code ICD-O-3': 'behavior',
    'Regional nodes examined (1988+)': 'LYMND_exm',
}

# Load the export and apply the renames in one pass; headers that are not
# listed in rename_dict keep their original names.
df = pd.read_csv(
    r'/home/refu0917/lungcancer/seerdb/csv_folder/seerexport.csv'
).rename(columns=rename_dict)

In [2]:
# Patients with exactly one primary tumour, located in the lung/bronchus.
# These rows are labelled Class 0 further down.
df_ipc = df[
    df['seq_number'].eq('One primary only')
    & df['site'].eq('Lung and Bronchus')
]

In [3]:
# IDs of patients whose first of several primaries is a lung/bronchus tumour.
df_second_list = df.loc[
    df['seq_number'].eq('1st of 2 or more primaries')
    & df['site'].eq('Lung and Bronchus'),
    'id',
].tolist()

# Every record (any site, any sequence number) belonging to those patients.
df_temp = df.loc[df['id'].isin(df_second_list)]

In [4]:
# Preview the first five records of the multi-primary patients.
df_temp.head(5)

Unnamed: 0,id,diag_year,Age,survival_month,Gender,site,seq_number,histologic,behavior,Laterality,SSF2,SSF1,TMRSZ,DIFF,AJCCstage,LYMND,LYMND_exm,vital_sts
18850,696327,2000,80-84 years,7,Male,Lung and Bronchus,1st of 2 or more primaries,"8140/3: Adenocarcinoma, NOS",Malignant,Left - origin of primary,Blank(s),Blank(s),Blank(s),Unknown,Blank(s),98,0,Dead
18851,696327,2000,80-84 years,7,Male,Prostate,2nd of 2 or more primaries,"8140/3: Adenocarcinoma, NOS",Malignant,Not a paired site,Blank(s),Blank(s),Blank(s),Moderately differentiated; Grade II,Blank(s),98,0,Dead
19368,704758,2000,55-59 years,105,Female,Lung and Bronchus,1st of 2 or more primaries,8046/3: Non-small cell carcinoma,Malignant,Right - origin of primary,Blank(s),Blank(s),Blank(s),Unknown,Blank(s),98,0,Dead
19369,704758,2008,60-64 years,9,Female,Lung and Bronchus,2nd of 2 or more primaries,"8041/3: Small cell carcinoma, NOS",Malignant,Right - origin of primary,Blank(s),Blank(s),999,Unknown,Blank(s),95,95,Dead
23190,771867,2000,70-74 years,81,Female,Lung and Bronchus,1st of 2 or more primaries,"8010/3: Carcinoma, NOS",Malignant,Left - origin of primary,Blank(s),Blank(s),Blank(s),Unknown,Blank(s),98,0,Dead


In [5]:
# Split the multi-primary patients' records into first and second primaries.
# .copy() makes each result an independent frame, so columns can be added to
# it later without pandas raising SettingWithCopyWarning.
df_temp1 = df_temp[df_temp['seq_number'] == '1st of 2 or more primaries'].copy()
df_temp2 = df_temp[df_temp['seq_number'] == '2nd of 2 or more primaries'].copy()

# Some patients flagged '1st of 2 or more primaries' only have that first
# record; no second-primary row was found for them, so drop them here.
df_temp1 = df_temp1[df_temp1['id'].isin(df_temp2['id'])].copy()

In [6]:
# Attach each patient's second-primary site, histology and diagnosis year to
# the first-primary row. Rows are paired on patient id rather than by list
# position, so the result does not depend on df_temp1 and df_temp2 happening
# to be in the same order.
comp_cols = {
    'site': 'comp_site',
    'histologic': 'comp_histologic',
    'diag_year': 'comp_diag_year',
}
second_primary = df_temp2[['id', *comp_cols]].rename(columns=comp_cols)

first_index = df_temp1.index
df_temp1 = (
    df_temp1
    # Remove comp_* columns left by a previous run so the cell is re-runnable.
    .drop(columns=list(comp_cols.values()), errors='ignore')
    # validate= raises MergeError if any id appears more than once on either
    # side, instead of silently duplicating or mispairing rows.
    .merge(second_primary, on='id', how='left', validate='one_to_one')
)
# merge() returns a fresh 0..n-1 index; restore the original row labels so the
# frame can still be concatenated with other slices of df without collisions.
df_temp1.index = first_index

# Difference in diagnosis year between the second primary and the first one.
df_temp1['comp_diag_year'] = df_temp1['comp_diag_year'] - df_temp1['diag_year']

In [7]:
# Rule 1: the second malignancy must appear at least 6 months later (past the
# latency period). Per the original note, the SEER export was already
# filtered on survival months >= 6, so nothing is done here.

# Rule 2: the second primary is at a different site -> counted directly as an
# SPC (a second primary arising at another site after lung cancer).
rule2 = df_temp1[df_temp1['site'].ne(df_temp1['comp_site'])]  # 16638 rows in the original run
df_temp1 = df_temp1[~df_temp1['id'].isin(rule2['id'])]

# **** From here on, meeting any single rule counts as a second primary
# **** lung cancer (SPLC).

# Rule 3: the two tumours have different histology.
rule3 = df_temp1[df_temp1['histologic'].ne(df_temp1['comp_histologic'])]  # 10061 rows in the original run
df_temp1 = df_temp1[~df_temp1['id'].isin(rule3['id'])]

# Rule 4: the diagnosis years of the SPC and the IPC are two or more years apart.
# NOTE(review): the original note says "more than two years", but the
# comparison is inclusive (>= 2) -- confirm which one is intended.
rule4 = df_temp1[df_temp1['comp_diag_year'].ge(2)]  # original run: 2757, 2192 data left

# Rule 5 (not implemented): SPC and IPC tumours lie in different lobes, with
# no positive nodes and no evidence of metastasis.

In [8]:
# Columns that identify the record or describe the outcome; they are removed
# from both cohorts before the two are compared.
non_feature_cols = ['id', 'site', 'seq_number', 'histologic', 'behavior',
                    'Laterality', 'diag_year', 'survival_month', 'vital_sts',
                    'LYMND_exm']
# Helper columns that only exist on the second-primary candidates.
comparison_cols = ['comp_site', 'comp_histologic', 'comp_diag_year']

# Second-primary cohort: every patient selected by rule 2, 3 or 4, labelled 1.
# It is rebuilt from the rule frames on each run, so a missing column here is
# a genuine error and is allowed to raise.
df_spc = (
    pd.concat([rule2, rule3, rule4])
    .drop(columns=non_feature_cols + comparison_cols)
    .assign(Class=1)
)

# Single-primary cohort, labelled 0. df_ipc is overwritten with its own
# reduced version, so errors='ignore' keeps a second execution of this cell
# from failing with KeyError on columns that are already gone.
df_ipc = df_ipc.drop(columns=non_feature_cols, errors='ignore').assign(Class=0)

In [14]:
from tableone import TableOne, load_dataset
# Draw "Table 1": summary of the listed variables, stratified by Class.

columns = ["Class", "Gender", "Age",
           "AJCCstage", "DIFF", "LYMND", "TMRSZ",
           "SSF1", "SSF2"]
groupby = ["Class"]
# Every listed variable is summarised as categorical.
categorical = list(columns)

df_all = pd.concat([df_ipc, df_spc])
# groupby was defined but never passed before, which left the exported table
# unstratified; pass it so the two classes appear side by side.
seer_tableone = TableOne(df_all, columns=columns, categorical=categorical,
                         groupby=groupby)
seer_tableone.to_csv('seer_tableone.csv')