# Medical Image - TFM
<h4>subtitle: Generación de una tubería distribuida para la extracción de características en imágenes médicas patológicas</h4>
license: European Union Public Licence (EUPL) v1.2

<table>
  <tr> <td> author name: </td> <td> Israel Llorens </td> </tr>
  <tr> <td> email: </td> <td> sanchezis@hotmail.com </td> </tr>
</table>

<h6>date: 2024/03/22</h6>

---

In [1]:
import init
from digital_pathology.spark import spark

In [6]:
# Peek at the first lines of one CAMELYON16 annotation file. The XML is loaded
# through Spark's CSV reader, so each row exposes its text in column `_c0`.
annotation_path = 's3a://camelyon-dataset/CAMELYON16/annotations/test_001.xml'
annotation_rows = spark.read.format('csv').load(annotation_path)
annotation_rows.head(5)

[Row(_c0='<?xml version="1.0"?>'),
 Row(_c0='<ASAP_Annotations>'),
 Row(_c0='\t<Annotations>'),
 Row(_c0='\t\t<Annotation Name="Annotation 0" Type="Polygon" PartOfGroup="Tumor" Color="#F4FA58">'),
 Row(_c0='\t\t\t<Coordinates>')]

In [12]:
import boto3 
from botocore.exceptions import ClientError
from botocore.handlers import disable_signing

# Register botocore's `disable_signing` handler on the S3 client so requests
# are sent unsigned (presumably the bucket is public — confirm).
resource = boto3.resource('s3')
resource.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)

# Collect every object key in the bucket as a 1-tuple, i.e. one
# single-column row per key.
my_bucket = resource.Bucket('camelyon-dataset')
di = [(s3_object.key,) for s3_object in my_bucket.objects.all()]

In [11]:
import pandas as pd

def _file_type(key):
    """Return the text after the last '.' in `key` (the whole key when it has no '.')."""
    return key.rsplit('.', 1)[-1]


def _file_loc(key):
    """Return the second '/'-separated segment of `key`, or None if it is absent or contains a '.'."""
    segments = key.split('/')
    if len(segments) > 1 and '.' not in segments[1]:
        return segments[1]
    return None


def _file_project(key):
    """Return the first '/'-separated segment of `key`, or None when `key` has no '/'."""
    segments = key.split('/')
    return segments[0] if len(segments) > 1 else None


# One row per S3 key, plus three columns derived from the key's path.
df = pd.DataFrame(di, columns=['file'])
df['type'] = df['file'].apply(_file_type)
df['loc'] = df['file'].apply(_file_loc)
df['project'] = df['file'].apply(_file_project)

In [13]:
# Preview up to 20 random CAMELYON17 .tif entries.
# - A fixed `random_state` keeps the preview stable across "Restart & Run All".
# - `n` is capped at the number of matching rows because `sample` raises a
#   ValueError when asked for more rows than exist (without replacement).
camelyon17_tifs = df[(df['type'] == 'tif') & (df['project'] == 'CAMELYON17')]
camelyon17_tifs.sample(n=min(20, len(camelyon17_tifs)), random_state=42)

Unnamed: 0,file,type,loc,project
1765,CAMELYON17/images/patient_069_node_0.tif,tif,images,CAMELYON17
2258,CAMELYON17/images/patient_167_node_4.tif,tif,images,CAMELYON17
1811,CAMELYON17/images/patient_078_node_1.tif,tif,images,CAMELYON17
1819,CAMELYON17/images/patient_079_node_4.tif,tif,images,CAMELYON17
2322,CAMELYON17/images/patient_180_node_3.tif,tif,images,CAMELYON17
1882,CAMELYON17/images/patient_092_node_2.tif,tif,images,CAMELYON17
2487,CAMELYON17/masks/patient_066_node_2_mask.tif,tif,masks,CAMELYON17
1581,CAMELYON17/images/patient_032_node_0.tif,tif,images,CAMELYON17
2466,CAMELYON17/masks/patient_043_node_3_mask.tif,tif,masks,CAMELYON17
1876,CAMELYON17/images/patient_091_node_1.tif,tif,images,CAMELYON17


##  Ingestion

In [16]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructField, StructType, StringType

# Single-column schema: each S3 object key becomes one `filename` string.
filename_field = StructField("filename", StringType())
schema = StructType([filename_field])

# `di` holds one 1-tuple per key, which lines up with the one-field schema.
ingestion = spark.createDataFrame(di, schema=schema)

In [17]:
# Derive path-based columns from each key with regular expressions.
# The patterns are raw strings so that `\.` and `\w` reach the regex engine
# as written, instead of relying on Python's deprecated handling of invalid
# escape sequences in ordinary string literals. The pattern text itself is
# unchanged.
# NOTE(review): these patterns are not equivalent to the pandas lambdas used
# earlier in this notebook — `type` captures everything after the FIRST '.',
# and `loc` is greedy, so it spans every folder between the first and last
# '/'. Confirm which behaviour is intended before relying on these columns.
ingestion = (
    ingestion
    .withColumn('type', F.regexp_extract('filename', r'\.(.+)', 1))
    .withColumn('loc', F.regexp_extract('filename', r'/(.+)/.*', 1))
    .withColumn('project', F.regexp_extract('filename', r'(\w+)/.*', 1))
)

In [18]:
# Persist the derived table as Parquet; 'overwrite' replaces any existing output.
extract_output_path = 'out/0-extract.parquet'
ingestion.write.mode('overwrite').parquet(extract_output_path)