# Data Formatting

### Instructions:
- Open this notebook in Kaggle
- Click on '+ Add Data'
- Add the 'CIFAKE: Real and AI-Generated Synthetic Images' dataset
- Run the notebook
- Download the 'CIFAKE_Train.npz' and 'CIFAKE_Test.npz' files from Output>/kaggle/working

In [None]:
import numpy as np
from PIL import Image
import os
import pandas as pd

### We'll define a simple function to load a directory of images into a list of NumPy arrays

In [None]:
def load_images(path):
    """Load every file in a directory as an image and return the pixel arrays.

    Parameters
    ----------
    path : str
        Directory containing the image files. A trailing path separator is
        optional.

    Returns
    -------
    list of numpy.ndarray
        One array per directory entry, in sorted filename order.
    """
    images = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and any seeded shuffle applied to it afterwards) reproducible.
    for filename in sorted(os.listdir(path)):
        # os.path.join works whether or not `path` ends with a separator.
        with Image.open(os.path.join(path, filename)) as im:
            # Converting inside the `with` forces the pixel data to be read
            # before the underlying file is closed.
            images.append(np.array(im))

    print("Loaded", len(images), "images from", path)
    return images

In [None]:
# Root folder of the CIFAKE dataset under /kaggle/input; the split/class
# sub-folders (e.g. "/train/FAKE/") are appended to it when loading.
image_path = "/kaggle/input/cifake-real-and-ai-generated-synthetic-images"

### Load all the images

In [None]:
# FAKE images of the training split
fake_train_images = load_images(f"{image_path}/train/FAKE/")

Loaded 50000 images from /kaggle/input/cifake-real-and-ai-generated-synthetic-images/train/FAKE/


In [None]:
# REAL images of the training split
real_train_images = load_images(f"{image_path}/train/REAL/")

Loaded 50000 images from /kaggle/input/cifake-real-and-ai-generated-synthetic-images/train/REAL/


In [None]:
# FAKE images of the test split
fake_test_images = load_images(f"{image_path}/test/FAKE/")

Loaded 10000 images from /kaggle/input/cifake-real-and-ai-generated-synthetic-images/test/FAKE/


In [None]:
# REAL images of the test split
real_test_images = load_images(f"{image_path}/test/REAL/")

Loaded 10000 images from /kaggle/input/cifake-real-and-ai-generated-synthetic-images/test/REAL/


### We'll load them into Pandas DataFrames to add labels and combine the FAKE and REAL folders of each split (train and test). We'll use 0 for Fake and 1 for Real so we don't have to label-encode the targets every time, and we'll shuffle the combined DataFrames now so we don't have to do it later

In [None]:
# One row per FAKE training image; the scalar label 0 is broadcast to every row
df_fake_train = pd.DataFrame({'images': fake_train_images, 'labels': 0})
df_fake_train.head(3)

Unnamed: 0,images,labels
0,"[[[114, 95, 65], [129, 110, 80], [122, 102, 75...",0
1,"[[[71, 62, 23], [66, 56, 20], [58, 48, 13], [5...",0
2,"[[[13, 8, 2], [10, 7, 0], [12, 14, 3], [0, 3, ...",0


In [None]:
# One row per REAL training image; the scalar label 1 is broadcast to every row
df_real_train = pd.DataFrame({'images': real_train_images, 'labels': 1})
df_real_train.head(3)

Unnamed: 0,images,labels
0,"[[[102, 86, 63], [98, 83, 60], [88, 74, 48], [...",1
1,"[[[108, 108, 82], [106, 106, 78], [109, 112, 8...",1
2,"[[[153, 155, 131], [153, 155, 131], [153, 155,...",1


In [None]:
# Stack the FAKE and REAL training frames row-wise, then shuffle all rows
# with a fixed seed (frac=1 keeps every row).
df_train = (
    pd.concat([df_fake_train, df_real_train], axis=0)
    .sample(frac=1, random_state=0)
)
df_train.head(7)

Unnamed: 0,images,labels
3582,"[[[48, 89, 133], [49, 90, 134], [51, 92, 138],...",0
10498,"[[[229, 238, 247], [220, 229, 238], [221, 229,...",1
3227,"[[[39, 52, 42], [62, 75, 65], [33, 46, 36], [1...",1
21333,"[[[207, 199, 196], [219, 211, 208], [211, 203,...",0
3885,"[[[171, 208, 227], [172, 209, 228], [175, 210,...",0
1521,"[[[62, 39, 21], [73, 52, 35], [73, 54, 39], [7...",1
34261,"[[[111, 118, 87], [101, 107, 81], [90, 91, 73]...",1


In [None]:
# Sanity check: (rows, columns) of the combined, shuffled training frame
df_train.shape

(100000, 2)

In [None]:
# One row per FAKE test image; the scalar label 0 is broadcast to every row
df_fake_test = pd.DataFrame({'images': fake_test_images, 'labels': 0})
df_fake_test.head(3)

Unnamed: 0,images,labels
0,"[[[52, 45, 19], [55, 47, 24], [53, 46, 27], [6...",0
1,"[[[62, 48, 0], [81, 69, 19], [95, 85, 32], [92...",0
2,"[[[62, 60, 61], [55, 53, 54], [70, 68, 69], [1...",0


In [None]:
# One row per REAL test image; the scalar label 1 is broadcast to every row
df_real_test = pd.DataFrame({'images': real_test_images, 'labels': 1})
df_real_test.head(3)

Unnamed: 0,images,labels
0,"[[[119, 144, 166], [119, 144, 166], [119, 144,...",1
1,"[[[114, 136, 54], [112, 133, 54], [112, 130, 5...",1
2,"[[[60, 89, 67], [61, 93, 70], [63, 95, 72], [5...",1


In [None]:
# Stack the FAKE and REAL test frames row-wise, then shuffle all rows
# with a fixed seed (frac=1 keeps every row).
df_test = (
    pd.concat([df_fake_test, df_real_test], axis=0)
    .sample(frac=1, random_state=0)
)
df_test.head(7)

Unnamed: 0,images,labels
9134,"[[[248, 242, 226], [245, 239, 223], [253, 246,...",1
4981,"[[[95, 121, 172], [92, 117, 171], [88, 113, 17...",0
6643,"[[[3, 74, 106], [3, 74, 104], [6, 72, 104], [6...",1
9117,"[[[102, 155, 187], [103, 155, 192], [92, 139, ...",1
5306,"[[[169, 167, 168], [169, 167, 168], [169, 167,...",0
230,"[[[238, 237, 235], [243, 242, 240], [230, 229,...",0
3148,"[[[13, 8, 15], [12, 7, 14], [15, 8, 16], [18, ...",0


### Save them as .npz files

In [None]:
# Human-readable class names, indexed by the integer label (0 -> 'Fake', 1 -> 'Real')
label_names = np.array(['Fake', 'Real'])

In [None]:
# Sanity check: (rows, columns) of the combined, shuffled test frame
df_test.shape

(20000, 2)

In [None]:
# Stack the per-row image arrays into a single array along a new leading axis
# and write it, the labels and the class names into one compressed archive.
np.savez_compressed(
    './CIFAKE_Train.npz',
    images=np.stack(df_train['images'].to_numpy(), axis=0),
    labels=df_train['labels'].to_numpy(),
    label_names=label_names,
)

In [None]:
# Stack the per-row image arrays into a single array along a new leading axis
# and write it, the labels and the class names into one compressed archive.
np.savez_compressed(
    './CIFAKE_Test.npz',
    images=np.stack(df_test['images'].to_numpy(), axis=0),
    labels=df_test['labels'].to_numpy(),
    label_names=label_names,
)

In [None]:
# Read the training archive back to verify it. Use the same relative path it
# was saved to, so the check does not depend on the working directory being
# /kaggle/working.
train = np.load('./CIFAKE_Train.npz')
train['images'].shape

(100000, 32, 32, 3)

In [None]:
# Read the test archive back to verify it. Use the same relative path it was
# saved to, so the check does not depend on the working directory being
# /kaggle/working.
test = np.load('./CIFAKE_Test.npz')
test['images'].shape

(20000, 32, 32, 3)

In [None]:
# Confirm the class-name array was stored in the archive
test['label_names']

array(['Fake', 'Real'], dtype='<U4')