**Thanks to https://www.kaggle.com/xhlulu/ranzcr-efficientnet-tpu-training**

**Let's use the extra data provided by https://www.kaggle.com/raddar/ricord-covid19-xray-positive-tests**

**Thanks to https://www.kaggle.com/josephamigo/siim-external-data-pipeline**

**I think that, besides this, adding BIMCV to the dataset is the key to winning this competition.**

In [None]:
# Install the `efficientnet` package quietly (-q); it is imported below as `efficientnet.tfkeras`.
!pip install efficientnet -q
# Install `gdcm` from the conda-forge channel without prompting (-y).
# NOTE(review): presumably needed for decoding DICOM files; nothing in the visible cells uses it — confirm.
!conda install gdcm -c conda-forge -y

In [None]:
import os

import efficientnet.tfkeras as efn
import numpy as np
import pandas as pd
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.model_selection import GroupKFold

In [None]:
# Despite its name, this value is used below as a folder name under /kaggle/input/
# (it looks like a dataset slug rather than a competition slug).
COMPETITION_NAME = "siimcovid19-512-img-png-600-study-png"
# Path returned by KaggleDatasets().get_gcs_path for that name.
# NOTE(review): presumably consumed by TPU training code; it is not used in the visible cells.
GCS_DS_PATH = KaggleDatasets().get_gcs_path(COMPETITION_NAME)

In [None]:
# Local input folder of the image dataset named by COMPETITION_NAME.
load_dir = f"/kaggle/input/{COMPETITION_NAME}/"
# Study-level training table of the siim-covid19-detection competition.
df = pd.read_csv('../input/siim-covid19-detection/train_study_level.csv')
# Columns 1..4 are treated as the label columns.
# NOTE(review): assumes column 0 is the study id and the next four are the class flags — verify against the CSV.
label_cols = df.columns[1:5]
df # Display the table: an id column followed by the four class columns

In [None]:
# Root folder of the external MIDRC-RICORD images.
IM_PATH = '../input/ricord-covid19-xray-positive-tests/MIDRC-RICORD/MIDRC-RICORD'

# Read the RICORD metadata and keep only the rows that actually carry a
# `labels` value; rows where it is missing cannot be encoded later on.
meta_extradata = pd.read_csv(
    '../input/ricord-covid19-xray-positive-tests/MIDRC-RICORD-meta.csv'
).dropna(subset=['labels'])

In [None]:
# Display the RICORD metadata (rows with a missing `labels` value were dropped when it was loaded).
meta_extradata

In [None]:
def locate_row_to_delete(a):
    """Mark a row of class shares for deletion when no class has a majority.

    Parameters
    ----------
    a : numpy.ndarray
        One row of per-class shares (fractions of the label mentions).

    Returns
    -------
    numpy.ndarray
        ``a`` itself when its largest entry is greater than 0.5; otherwise an
        all-NaN float array with the same shape as ``a`` so the row can be
        removed later with ``dropna``.

    Notes
    -----
    A row that already contains NaN is returned unchanged, because
    ``nan <= 0.5`` is False; it stays NaN and is dropped all the same.
    """
    if a.max() <= 0.5:
        # Build the NaN marker from the input's shape instead of hard-coding
        # four entries, so the helper works for any number of classes.
        return np.full(a.shape, np.nan)
    return a


def encode_labels(df):
    """One-hot encode the free-text ``labels`` column by majority vote.

    For every row the number of mentions of each class keyword is counted in
    the ``labels`` string. A class is set to 1.0 when it accounts for more
    than half of all counted mentions; every other class is 0.0. Rows in
    which no class reaches a majority (ties, or no keyword found at all) are
    removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``fname`` and ``labels`` (strings). The
        input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``fname``, ``labels`` and the four
        float class columns, keeping the original index values of the
        retained rows.
    """
    class_columns = [
        'Negative for Pneumonia',
        'Typical Appearance',
        'Indeterminate Appearance',
        'Atypical Appearance',
    ]
    # Keyword searched for each class, in the same order as class_columns.
    # The match is case-sensitive, so 'Typical' does not match 'Atypical'.
    keywords = ['Negative', 'Typical', 'Indeterminate', 'Atypical']

    # Work on an explicit copy so the caller's frame is never touched and
    # pandas does not emit SettingWithCopy warnings.
    encoded = df[['fname', 'labels']].copy()

    # Count the occurrences of each keyword per row; the reshape keeps the
    # array two-dimensional even when the frame is empty.
    counts = np.array(
        [[text.count(word) for word in keywords] for text in encoded['labels']],
        dtype=float,
    ).reshape(len(encoded), len(keywords))

    # Rows without any keyword divide 0 by 0; the resulting NaN shares never
    # exceed the threshold, so those rows are dropped below.
    with np.errstate(divide='ignore', invalid='ignore'):
        shares = counts / counts.sum(axis=1, keepdims=True)
        is_majority = shares > 0.5

    # A single threshold decides both which rows survive and which class is
    # set, so a kept row always has exactly one class equal to 1.0.
    one_hot = is_majority.astype(float)
    for position, column in enumerate(class_columns):
        encoded[column] = one_hot[:, position]

    return encoded.loc[is_majority.any(axis=1)].copy()

In [None]:
# One-hot encode the RICORD label strings; rows without a majority class are dropped by encode_labels.
output = encode_labels(meta_extradata)
# Display the encoded table.
output

# **The data still has extra columns and different column names**
* First, rename the column `fname` -> `id`

* Then drop the rows whose `labels` value is empty or NaN

* Then drop the `labels` column

In [None]:
# Bring the external table into the same layout as the competition table:
# rename the key column, treat empty strings as missing, discard rows that
# have no label text, and finally remove the raw `labels` column.
nan_value = float("NaN")
output = (
    output
    .rename(columns={'fname': 'id'})
    .replace("", nan_value)
    .dropna(subset=["labels"])
    .drop(columns=['labels'])
)

In [None]:
# Renumber the remaining rows from 0 and discard the old index (drop=True).
output=output.reset_index(drop=True)
print(output)

**Concatenate the extra data with the original dataframe to obtain the extended one**

In [None]:
# Stack the competition table (df) on top of the external RICORD table (output).
temp_df=[df, output]
# NOTE: pd.concat keeps each frame's own index by default, so index values repeat until the index is reset.
extended = pd.concat(temp_df)
# Display the combined table.
extended

In [None]:
# Give the combined table a fresh 0..n-1 index and discard the old one (drop=True).
extended=extended.reset_index(drop=True)
print(extended)

**Now it is time to apply GroupKFold to the extended data**

In [None]:
# Assign every row of the extended table to one of five validation folds.
# GroupKFold keeps all rows that share the same `id` in the same fold.
gkf = GroupKFold(n_splits=5)

# GroupKFold.split yields positional row numbers. Collect the fold of each
# row in a positional array rather than writing through label-based `.loc`,
# which only hits the right rows when the index is exactly 0..n-1.
fold_of_row = np.full(len(extended), -1)
for fold, (train_idx, val_idx) in enumerate(gkf.split(extended, groups=extended.id.tolist())):
    fold_of_row[val_idx] = fold
extended['fold'] = fold_of_row

# From here on `df` refers to the extended table including the fold column.
df = extended
print(df)

# Save all the extended data to a temporary directory

In [None]:
from PIL import Image
import os

# Scratch folder that collects every converted image; create it (and any
# missing parents) up front and tolerate it already existing.
save_dir = '/kaggle/tmp/external/'
os.makedirs(save_dir, exist_ok=True)

In [None]:
import tqdm
from PIL import Image
from tqdm.auto import tqdm

# Re-save every file found under the dataset's `study` folder into save_dir,
# swapping the `.png` extension for `.jpg`.
# NOTE: the output is flat, so files with the same name in different
# sub-folders overwrite each other.
for dirname, _, filenames in tqdm(os.walk('../input/siimcovid19-512-img-png-600-study-png/study')):
    for file in filenames:
        # The context manager closes the source file as soon as the copy is
        # written, instead of leaving one open handle per image behind.
        with Image.open(os.path.join(dirname, file)) as photo:
            photo.save(os.path.join(save_dir, file.replace('.png', '.jpg')))

In [None]:
# Resize every external RICORD image to 600x600 and store it in save_dir,
# turning the `.dcm.jpg` suffix into a plain `.jpg`.
newsize = (600, 600)  # Target size; constant, so defined once outside the loops
for dirname, _, filenames in tqdm(os.walk('../input/ricord-covid19-xray-positive-tests/MIDRC-RICORD/MIDRC-RICORD')):
    for file in filenames:
        # The context manager closes the source file once the resized copy
        # has been written, instead of leaving one open handle per image.
        with Image.open(os.path.join(dirname, file)) as photo:
            resized = photo.resize(newsize)
            resized.save(os.path.join(save_dir, file.replace('.dcm.jpg', '.jpg')))

**If you want to check the saved files, uncomment the following:**

In [None]:
# files = os.listdir(save_dir)
# files

In [None]:
# Write the extended table (with the fold column) to df.csv in the working directory.
# NOTE: index=False is not passed, so the row index is written as an extra unnamed first column.
df.to_csv(f'df.csv')
# Pack the contents of /kaggle/tmp/external/ into the gzip-compressed archive test.tar.gz;
# -C switches into that folder first so the archive holds relative paths.
!tar -zcf test.tar.gz -C "/kaggle/tmp/external/" .