## Data Preparation

Describe and justify the process for preparing the data for analysis.

***
Questions to consider:
* Were there variables you dropped or created?
* How did you address missing values or outliers?
* Why are these choices appropriate given the data and the business problem?
***

# Contents
1. Redistribute Data
2. Visualizations of before and after
3. Show image sizes data
4. Create Dataset
5. Display images after resizing
6. Process data
7. Display images after processing

In [None]:
%load_ext autoreload
%autoreload 2

## 1. Redistribute Data

In [None]:
from my_modules.file_distribution import run_redistribution


## 2. Visualizations of before and after

In [1]:
from my_modules.file_distribution import prepare_plot, bars_data

# Root of the image folders and the split sub-folders to summarise.
data_dir = "data/chest_xray"
folders = ["new_train", "new_val", "new_test"]

# Build the summary table for the 'new_*' folders, then chart it.
new_df = prepare_plot(data_dir, folders)
bars_data(new_df)

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Summarise the original split folders so they can be compared with new_df.
old_folders = ["train", "val", "test"]
old_df = prepare_plot(data_dir, old_folders)

# Leave the frame as the last expression so the notebook renders it.
old_df

In [None]:
import matplotlib.pyplot as plt

# Side-by-side stacked bars (Normal at the bottom, Pneumonia stacked on top)
# for the original and the 'new_*' folder summaries, on a shared y-axis.
fig, ax = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
for axis, frame, title in zip(ax, (old_df, new_df), ('Before', 'After')):
    axis.bar(frame['Dataset'], frame['Normal'], label='Normal')
    axis.bar(
        frame['Dataset'],
        frame['Pneumonia'],
        bottom=frame['Normal'],
        label='Pneumonia',
    )
    axis.set_title(title)

# The y-axis is shared, so only the left panel carries the axis label.
ax[0].set_ylabel('Number of Images')
plt.legend()

In [None]:
def pies(df):
    """Draw a Normal-vs-Pneumonia pie chart for each of the first three rows.

    Two percentage columns are added to ``df`` in place: ``norm_rate`` and
    ``pneu_rate``, each the class count divided by ``Total``, multiplied by
    100 and rounded to two decimals. The charts are titled Train, Validation
    and Test, in row order.

    Args:
        df: DataFrame with ``Normal``, ``Pneumonia`` and ``Total`` columns
            and at least three rows. Rows are assumed to be ordered
            train, validation, test -- TODO confirm against prepare_plot.
    """
    df['norm_rate'] = round((df['Normal'] / df['Total']) * 100, 2)
    df['pneu_rate'] = round((df['Pneumonia'] / df['Total']) * 100, 2)

    # Pick the rate columns by name rather than by position, so the charts
    # stay correct when the frame already carries extra columns (for example
    # when notebook cells are executed out of order).
    rates = df[['norm_rate', 'pneu_rate']]

    class_labels = ['Normal', 'Pneumonia']
    plain_text = {'fontsize': 12, 'color': 'white'}
    # NOTE(review): only the first chart uses bold text, as in the original;
    # confirm whether that difference is intentional.
    panels = [
        ('Train', {'fontsize': 12, 'color': 'white', 'fontweight': 'bold'}),
        ('Validation', plain_text),
        ('Test', plain_text),
    ]

    fig, ax = plt.subplots(1, 3, figsize=(9, 3))
    for row, (title, text_style) in enumerate(panels):
        ax[row].pie(
            list(rates.iloc[row]),
            labels=class_labels,
            autopct='%1.f%%',
            textprops=text_style,
        )
        ax[row].set_title(title)

    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()


pies(old_df)

In [None]:
# Same class-balance pies, this time for the summary of the 'new_*' folders.
pies(new_df)

In [None]:
# Share of all images held by each split, as a percentage rounded to two
# decimals, stored on both summary frames.
old_df['total_rate'] = round((old_df['Total'] / old_df['Total'].sum()) * 100, 2)
new_df['total_rate'] = round((new_df['Total'] / new_df['Total'].sum()) * 100, 2)

# One label per row. The third wedge is the test split; it was previously
# mislabelled 'Train'. Assumes rows are ordered train, val, test -- TODO
# confirm against prepare_plot.
split_labels = ['Train', 'Val', 'Test']

fig, ax = plt.subplots(1, 2, figsize=(6, 3))
ax[0].pie(list(old_df['total_rate']), labels=split_labels, autopct='%1.1f%%')
ax[0].set_title('Before')
ax[1].pie(list(new_df['total_rate']), labels=split_labels, autopct='%1.1f%%')
ax[1].set_title('After')

## 3. Show image sizes data

In [None]:
from my_modules import image_sizes

## 4. Create Dataset

In [None]:
from my_modules.import_datasets import create_dataset, process_dataset

# Directories of the three splits.
train_dir = "data/chest_xray/new_train"
val_dir = "data/chest_xray/new_val"
test_dir = "data/chest_xray/new_test"

# Build one dataset per split, in train / val / test order.
train_ds, val_ds, test_ds = map(create_dataset, (train_dir, val_dir, test_dir))

## 5. Display images after resizing

In [None]:
import numpy as np

# Preview the first nine images of a single training batch in a 3x3 grid.
plt.figure(figsize=(10, 10))
for images, labels in train_ds.take(1):
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        plt.imshow(images[i], cmap='gray')
        # Label 0 is titled Normal; any other label is titled Pneumonia.
        label = 'Normal' if labels[i] == 0 else 'Pneumonia'
        plt.title(label)
        plt.axis("off")

## 6. Process data

In [None]:
# Rescaling is importable from tensorflow.keras.layers in current releases;
# the experimental.preprocessing path is deprecated and missing from newer
# ones. Try the stable location first and fall back for older installs.
try:
    from tensorflow.keras.layers import Rescaling
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import Rescaling

# Multiply every pixel value by 1/255 (presumably mapping 0-255 inputs to
# 0-1 -- confirm against create_dataset). Labels pass through unchanged.
rescale = Rescaling(1./255)
# NOTE(review): only train_ds is rescaled in this cell; val_ds and test_ds
# are left as created -- confirm they are processed elsewhere.
train_ds = train_ds.map(lambda x, y: (rescale(x), y))

## 7. Display images after processing

In [None]:
# Show the first nine images of one batch from the rescaled train_ds, again
# as a 3x3 grid.
plt.figure(figsize=(10, 10))
for images, labels in train_ds.take(1):
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        plt.imshow(images[i], cmap='gray')
        # Title each tile from its label: 0 -> Normal, otherwise Pneumonia.
        label = 'Normal' if labels[i] == 0 else 'Pneumonia'
        plt.title(label)
        plt.axis("off")