In [29]:
# Challenge 2: Investigating Breast Cancer Attribute Data

In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [31]:
import os
father_folder = os.path.dirname(os.getcwd())
train_feats = pd.read_csv(f'{father_folder}/train_test_splits/train_split.feats.csv')

In [32]:
print(train_feats.head())

                       Form Name   Hospital User Name  אבחנה-Age  \
0                 אנמנזה סיעודית       20.0   77_Onco  48.705217   
1                 אנמנזה סיעודית       20.0   76_Onco  50.529013   
2                 אנמנזה סיעודית       20.0   79_Onco  48.813933   
3                   ביקור במרפאה       20.0  330_Onco  76.294220   
4  אומדן סימפטומים ודיווח סיעודי       60.0  169_Onco  38.352437   

  אבחנה-Basic stage אבחנה-Diagnosis date אבחנה-Her2  \
0  p - Pathological     20/03/2014 18:09        NaN   
1  p - Pathological      30/12/2010 8:30        Neg   
2  p - Pathological     14/09/2016 11:49        NaN   
3  p - Pathological     24/06/2015 12:53          -   
4      c - Clinical     23/03/2020 19:34        NEG   

     אבחנה-Histological diagnosis         אבחנה-Histopatological degree  \
0     INFILTRATING DUCT CARCINOMA                                  Null   
1  LOBULAR INFILTRATING CARCINOMA                                  Null   
2     INFILTRATING DUCT CARCINOMA  

In [39]:
train_feats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39480 entries, 0 to 39479
Data columns (total 34 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0    Form Name                               39480 non-null  object 
 1    Hospital                                39480 non-null  float64
 2   User Name                                39480 non-null  object 
 3   אבחנה-Age                                39480 non-null  float64
 4   אבחנה-Basic stage                        39480 non-null  object 
 5   אבחנה-Diagnosis date                     39480 non-null  object 
 6   אבחנה-Her2                               30109 non-null  object 
 7   אבחנה-Histological diagnosis             39480 non-null  object 
 8   אבחנה-Histopatological degree            39480 non-null  object 
 9   אבחנה-Ivi -Lymphovascular invasion       464 non-null    object 
 10  אבחנה-KI67 protein                       22099

In [36]:
unique_counts = train_feats.nunique().reset_index()
unique_counts.columns = ['Column', 'Unique Values']
print(unique_counts)

                                     Column  Unique Values
0                                 Form Name              9
1                                  Hospital              4
2                                 User Name            143
3                                 אבחנה-Age           8278
4                         אבחנה-Basic stage              4
5                      אבחנה-Diagnosis date           8269
6                                אבחנה-Her2            323
7              אבחנה-Histological diagnosis             38
8             אבחנה-Histopatological degree              6
9        אבחנה-Ivi -Lymphovascular invasion             14
10                       אבחנה-KI67 protein            319
11              אבחנה-Lymphatic penetration              5
12           אבחנה-M -metastases mark (TNM)              6
13                        אבחנה-Margin Type              3
14          אבחנה-N -lymph nodes mark (TNM)             18
15                         אבחנה-Nodes exam             

In [37]:
for idx, row in unique_counts.iterrows():
    if row['Unique Values'] < 11:
        col_name = row['Column']
        print(f"Column: {col_name}")
        print(train_feats[col_name].unique())
        print('-' * 40)

Column:  Form Name
['אנמנזה סיעודית' 'ביקור במרפאה' 'אומדן סימפטומים ודיווח סיעודי'
 'ביקור במרפאה קרינה' 'דיווח סיעודי' 'אנמנזה רפואית' 'אנמנזה סיעודית קצרה'
 'ביקור במרפאה המטו-אונקולוגית' 'אנמנזה רפואית המטו-אונקולוגית']
----------------------------------------
Column:  Hospital
[20. 60. 85. 62.]
----------------------------------------
Column: אבחנה-Basic stage
['p - Pathological' 'c - Clinical' 'r - Reccurent' 'Null']
----------------------------------------
Column: אבחנה-Histopatological degree
['Null' 'G2 - Modereately well differentiated' 'G1 - Well Differentiated'
 'G3 - Poorly differentiated' 'GX - Grade cannot be assessed'
 'G4 - Undifferentiated']
----------------------------------------
Column: אבחנה-Lymphatic penetration
['Null' 'L0 - No Evidence of invasion' 'LI - Evidence of invasion'
 'L1 - Evidence of invasion of superficial Lym.'
 'L2 - Evidence of invasion of depp Lym.']
----------------------------------------
Column: אבחנה-M -metastases mark (TNM)
[nan 'M0' 'MX' '

In [38]:
for col in train_feats.columns:
    print(f"Column: {col}")
    print(train_feats[col].head())
    print('-' * 40)

Column:  Form Name
0                   אנמנזה סיעודית
1                   אנמנזה סיעודית
2                   אנמנזה סיעודית
3                     ביקור במרפאה
4    אומדן סימפטומים ודיווח סיעודי
Name:  Form Name, dtype: object
----------------------------------------
Column:  Hospital
0    20.0
1    20.0
2    20.0
3    20.0
4    60.0
Name:  Hospital, dtype: float64
----------------------------------------
Column: User Name
0     77_Onco
1     76_Onco
2     79_Onco
3    330_Onco
4    169_Onco
Name: User Name, dtype: object
----------------------------------------
Column: אבחנה-Age
0    48.705217
1    50.529013
2    48.813933
3    76.294220
4    38.352437
Name: אבחנה-Age, dtype: float64
----------------------------------------
Column: אבחנה-Basic stage
0    p - Pathological
1    p - Pathological
2    p - Pathological
3    p - Pathological
4        c - Clinical
Name: אבחנה-Basic stage, dtype: object
----------------------------------------
Column: אבחנה-Diagnosis date
0    20/03/2014 18:09

In [40]:
train_labels = pd.read_csv(f'{father_folder}/train_test_splits/train_split.labels.0.csv')
print(train_labels.head())

  אבחנה-Location of distal metastases
0                                  []
1                                  []
2                                  []
3                                  []
4                                  []


In [44]:
for val in train_labels['אבחנה-Location of distal metastases'].unique():
    print(val)

[]
['BON - Bones', 'HEP - Hepatic']
['BON - Bones']
['PUL - Pulmonary', 'LYM - Lymph nodes', 'BON - Bones']
['LYM - Lymph nodes', 'HEP - Hepatic', 'BON - Bones']
['HEP - Hepatic']
['LYM - Lymph nodes']
['HEP - Hepatic', 'BON - Bones']
['LYM - Lymph nodes', 'PUL - Pulmonary']
['LYM - Lymph nodes', 'HEP - Hepatic']
['SKI - Skin']
['LYM - Lymph nodes', 'BON - Bones']
['PUL - Pulmonary', 'BON - Bones']
['SKI - Skin', 'OTH - Other']
['PUL - Pulmonary', 'BRA - Brain']
['PUL - Pulmonary']
['HEP - Hepatic', 'BON - Bones', 'PUL - Pulmonary']
['MAR - Bone Marrow']
['PER - Peritoneum', 'BON - Bones']
['PUL - Pulmonary', 'LYM - Lymph nodes']
['BON - Bones', 'PUL - Pulmonary', 'PLE - Pleura']
['PUL - Pulmonary', 'PER - Peritoneum', 'LYM - Lymph nodes']
['BON - Bones', 'LYM - Lymph nodes']
['PUL - Pulmonary', 'HEP - Hepatic']
['PER - Peritoneum', 'LYM - Lymph nodes', 'BON - Bones']
['PER - Peritoneum']
['BRA - Brain', 'BON - Bones']
['LYM - Lymph nodes', 'PLE - Pleura', 'BON - Bones']
['PUL - Pulmon

In [45]:
# Combine all unique lists in 'אבחנה-Location of distal metastases' into one set
all_metastases = set()
for item in train_labels['אבחנה-Location of distal metastases']:
    # Convert string representation of list to actual list
    vals = eval(item) if isinstance(item, str) else item
    all_metastases.update(vals)
print(all_metastases)

{'HEP - Hepatic', 'BRA - Brain', 'PER - Peritoneum', 'PUL - Pulmonary', 'PLE - Pleura', 'LYM - Lymph nodes', 'BON - Bones', 'MAR - Bone Marrow', 'SKI - Skin', 'ADR - Adrenals', 'OTH - Other'}


In [46]:
for metastasis in all_metastases:
    print(metastasis)

HEP - Hepatic
BRA - Brain
PER - Peritoneum
PUL - Pulmonary
PLE - Pleura
LYM - Lymph nodes
BON - Bones
MAR - Bone Marrow
SKI - Skin
ADR - Adrenals
OTH - Other


In [47]:
print(len(all_metastases))

11


In [48]:
list_lens = set(train_labels['אבחנה-Location of distal metastases'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0))
print(list_lens)

{0, 1, 2, 3}
