# Import stuff

In [1]:
import os

import numpy as np
import pandas as pd
import termcolor

# Configure directories

**Note**: This notebook was originally run from a Google Drive directory, but that path has been removed from the cell below. If you plan to run it from Google Drive, replace the placeholder `[REPLACE THIS WITH THE LOCATION IN YOUR OWN GOOGLE DRIVE]` with the corresponding directory inside your own Google Drive.

In [2]:
# Directory the notebook kernel is running in. Pure Python is used instead of
# the `!pwd` shell escape so the cell also works where no POSIX shell exists.
BASE_PATH = os.getcwd()

# Note
#   If using a path variable such as DATA_WRANGLING_DIR with ! or % , quote it
#   so the spaces are respected
#   e.g. `!ls "{DATA_WRANGLING_DIR}"`

try:  # `google.colab` is only importable when running inside Google Colab
    from google.colab import drive
except ImportError:  # Locally run Jupyter
    NOTEBOOK_DIR = f"{BASE_PATH}"
    DATA_WRANGLING_DIR = NOTEBOOK_DIR
else:  # Mount Google Drive
    # Only the import is guarded: a failure while mounting should surface as an
    # error instead of silently falling back to the local directory layout.
    drive.mount("/content/gdrive")
    NOTEBOOK_DIR = "/content/gdrive/My Drive/[REPLACE THIS WITH THE LOCATION IN YOUR OWN GOOGLE DRIVE]"
    DATA_WRANGLING_DIR = NOTEBOOK_DIR
    # Create a `NOTEBOOK_DIR` symlink in the working directory that points at
    # the Drive folder (presumably to have a path without spaces -- confirm).
    # `lexists` is also true for a dangling link, so re-running is always safe.
    if os.path.lexists("NOTEBOOK_DIR"):
        print("NOTEBOOK_DIR link already exists")
    else:
        os.symlink(NOTEBOOK_DIR, "NOTEBOOK_DIR")

In [3]:
# Sanity check: print the current working directory and list the contents of
# the data-wrangling directory. The path is quoted so that a directory name
# containing spaces is passed to `ls` as a single argument.
!pwd
!ls "{DATA_WRANGLING_DIR}"

/Users/felipe/Documents/academic/fourthbrain/capstone_stuff/OCT-Transfer-SemiSup/data_wrangling
data_wrangling_comparison.ipynb [1m[36mmendeley[m[m
[1m[36mkaggle[m[m


# Read CSVs into DataFrames

## Read Mendeley CSV

At this point the Mendeley DataFrame has already been de-duplicated, so no further processing is required.

In [4]:
# Load the Mendeley file list; the first CSV column is used as the row index.
mendeley_csv_path = f"{DATA_WRANGLING_DIR}/mendeley/mendeley_filelist_combo_cond_md5.csv"
df_mendeley = pd.read_csv(mendeley_csv_path, index_col=0)
df_mendeley

Unnamed: 0,file_name,dataset,condition,file_location,patient_id,md5,dimensions
0,CNV-1016042-1.jpeg,test,CNV,OCT2017/test/CNV/CNV-1016042-1.jpeg,1016042,8878b3c48d6252464d388feeddf07259,"(512, 496)"
1,CNV-1016042-2.jpeg,test,CNV,OCT2017/test/CNV/CNV-1016042-2.jpeg,1016042,2fe168b795c02e7a675f835f0930abd2,"(512, 496)"
2,CNV-1016042-3.jpeg,test,CNV,OCT2017/test/CNV/CNV-1016042-3.jpeg,1016042,6bcd80b40786b6760724d082098f513f,"(768, 496)"
3,CNV-1016042-4.jpeg,test,CNV,OCT2017/test/CNV/CNV-1016042-4.jpeg,1016042,4693ad1edc383053e72563f8212a94ce,"(512, 496)"
4,CNV-103044-1.jpeg,test,CNV,OCT2017/test/CNV/CNV-103044-1.jpeg,103044,bcd67009e1a0f7d540840a057f6334b2,"(512, 496)"
...,...,...,...,...,...,...,...
77122,NORMAL-9997680-2.jpeg,train,NORMAL,OCT2017/train/NORMAL/NORMAL-9997680-2.jpeg,9997680,31f918fd7fe2f0d02d6a6b9f6f44bcf5,"(512, 512)"
77123,NORMAL-9997680-3.jpeg,train,NORMAL,OCT2017/train/NORMAL/NORMAL-9997680-3.jpeg,9997680,ac491500b3d2616aaa6976d87505269a,"(512, 512)"
77124,NORMAL-9997680-4.jpeg,train,NORMAL,OCT2017/train/NORMAL/NORMAL-9997680-4.jpeg,9997680,9d961b691ce6f2484642f5d8118748c7,"(512, 512)"
77125,NORMAL-9997680-5.jpeg,train,NORMAL,OCT2017/train/NORMAL/NORMAL-9997680-5.jpeg,9997680,dd3be99ae7e602565aa91aa89ea06daa,"(512, 512)"


## Read Kaggle CSV

The file list of the dataset downloaded from Kaggle has not been de-duplicated yet, so that is done here.

In [5]:
# Load the raw (not yet de-duplicated) Kaggle file list; the first CSV column
# is used as the row index. Only the first row is shown as a quick peek.
kaggle_csv_path = f"{DATA_WRANGLING_DIR}/kaggle/kaggle_dataset_filelist.csv"
df_kaggle_w_dupes = pd.read_csv(kaggle_csv_path, index_col=0)
df_kaggle_w_dupes.head(1)

Unnamed: 0,file_name,dataset,condition,file_location,patient_id,md5,dimensions
0,CNV-4283050-2.jpeg,test,CNV,kermany2018_downloaded_from_kaggle/OCT2017 /te...,4283050,194c039768e730812cf77c2072821f83,"(512, 496)"


In [6]:
# De-duplicate on (condition, md5), then order the rows by dataset split and
# file location so that "test" comes first, followed by "train" and "val".
# The index is renumbered 0..n-1 after sorting.
df_kaggle = (
    df_kaggle_w_dupes
    .drop_duplicates(subset=["condition", "md5"], ignore_index=True)
    .sort_values(["dataset", "file_location"])
    .reset_index(drop=True)
)
df_kaggle

Unnamed: 0,file_name,dataset,condition,file_location,patient_id,md5,dimensions
0,CNV-1016042-1.jpeg,test,CNV,kermany2018_downloaded_from_kaggle/OCT2017 /te...,1016042,8878b3c48d6252464d388feeddf07259,"(512, 496)"
1,CNV-1016042-2.jpeg,test,CNV,kermany2018_downloaded_from_kaggle/OCT2017 /te...,1016042,2fe168b795c02e7a675f835f0930abd2,"(512, 496)"
2,CNV-1016042-3.jpeg,test,CNV,kermany2018_downloaded_from_kaggle/OCT2017 /te...,1016042,6bcd80b40786b6760724d082098f513f,"(768, 496)"
3,CNV-1016042-4.jpeg,test,CNV,kermany2018_downloaded_from_kaggle/OCT2017 /te...,1016042,4693ad1edc383053e72563f8212a94ce,"(512, 496)"
4,CNV-103044-1.jpeg,test,CNV,kermany2018_downloaded_from_kaggle/OCT2017 /te...,103044,bcd67009e1a0f7d540840a057f6334b2,"(512, 496)"
...,...,...,...,...,...,...,...
77122,NORMAL-5193994-1.jpeg,val,NORMAL,kermany2018_downloaded_from_kaggle/OCT2017 /va...,5193994,c452deb7fe847610d4aa1ee41c4af55f,"(512, 496)"
77123,NORMAL-5246808-1.jpeg,val,NORMAL,kermany2018_downloaded_from_kaggle/OCT2017 /va...,5246808,832cc0b21bed45896ac49c892d8184d8,"(512, 496)"
77124,NORMAL-5246808-2.jpeg,val,NORMAL,kermany2018_downloaded_from_kaggle/OCT2017 /va...,5246808,c3b487cba782b6a97f9d39fc7770fdae,"(512, 496)"
77125,NORMAL-5324912-1.jpeg,val,NORMAL,kermany2018_downloaded_from_kaggle/OCT2017 /va...,5324912,2ee72e2c1e0458646b2b011a4c2a2ae4,"(512, 496)"


# Compare the two resulting DataFrames

We will compare the two processed DataFrames (without `['condition', 'md5']` duplicates) by inner-joining them on `(md5, condition)`. If the join has the same number of rows as each of the original DataFrames, both DataFrames contain the same files.

In [7]:
# Key the Kaggle frame by (md5, condition) so it can be joined on that pair.
# `set_index` returns a new frame, so `df_kaggle` itself is left untouched.
df_kaggle_reindexed = df_kaggle.set_index(["md5", "condition"])
df_kaggle_reindexed.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,file_name,dataset,file_location,patient_id,dimensions
md5,condition,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8878b3c48d6252464d388feeddf07259,CNV,CNV-1016042-1.jpeg,test,kermany2018_downloaded_from_kaggle/OCT2017 /te...,1016042,"(512, 496)"
2fe168b795c02e7a675f835f0930abd2,CNV,CNV-1016042-2.jpeg,test,kermany2018_downloaded_from_kaggle/OCT2017 /te...,1016042,"(512, 496)"


In [8]:
# Key the Mendeley frame by (md5, condition) so it can be joined on that pair.
# `set_index` returns a new frame, so `df_mendeley` itself is left untouched.
df_mendeley_reindexed = df_mendeley.set_index(["md5", "condition"])
df_mendeley_reindexed.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,file_name,dataset,file_location,patient_id,dimensions
md5,condition,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8878b3c48d6252464d388feeddf07259,CNV,CNV-1016042-1.jpeg,test,OCT2017/test/CNV/CNV-1016042-1.jpeg,1016042,"(512, 496)"
2fe168b795c02e7a675f835f0930abd2,CNV,CNV-1016042-2.jpeg,test,OCT2017/test/CNV/CNV-1016042-2.jpeg,1016042,"(512, 496)"


In [9]:
# Inner join on the shared (md5, condition) index: only keys present in both
# frames survive. Columns coming from the Kaggle side get the "_other" suffix.
df_join = df_mendeley_reindexed.join(
    df_kaggle_reindexed,
    how="inner",
    rsuffix="_other",
)
df_join.shape[0]  # number of files found in both datasets

77127

In [10]:
# # The three operations above in one line
# len(
#     df_mendeley.set_index(["md5", "condition"]).join(
#         df_kaggle.set_index(["md5", "condition"]), how="inner", rsuffix="_other"
#     )
# )

In [11]:
# The inner join only keeps (md5, condition) keys present in BOTH frames, so
# the two file lists match only if the join kept every row of each of them.
# Comparing against df_kaggle alone would miss files that exist only in
# df_mendeley.
# NOTE(review): assumes (md5, condition) is unique within each frame; duplicate
# keys would inflate the join and make the row counts misleading -- confirm.
n_joined = len(df_join)
n_mendeley = len(df_mendeley)
n_kaggle = len(df_kaggle)

if n_joined == n_mendeley == n_kaggle:
    message = "Both DataFrames contain the same files!"
    print(termcolor.colored(message, color="white", on_color="on_green"))
else:
    # Report the mismatch explicitly instead of printing nothing.
    message = (
        "The DataFrames do NOT contain the same files: "
        f"{n_mendeley} Mendeley rows, {n_kaggle} Kaggle rows, "
        f"{n_joined} rows in common."
    )
    print(termcolor.colored(message, color="white", on_color="on_red"))

[42m[37mBoth DataFrames contain the same files![0m
