# **Just a simple example**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import keras
from keras.datasets import cifar10
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
import os

import numpy as np

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, classification_report
import itertools

%matplotlib inline

In [None]:
# Read sampleSubmission.csv from the cifar-10 input directory and preview its first rows.
sample = pd.read_csv('/kaggle/input/cifar-10/sampleSubmission.csv')
sample.head()

In [None]:
# Read trainLabels.csv from the cifar-10 input directory and display the resulting DataFrame.
trainLabel = pd.read_csv('/kaggle/input/cifar-10/trainLabels.csv')
trainLabel

In [None]:
# Install the packages needed by the following cell, which imports `py7zlib` to read the 7z archive.
# NOTE(review): `py7zlib` may be provided by `pylzma` itself rather than being a separately
# installable package — confirm that the second install actually succeeds.
!pip install pylzma
!pip install py7zlib

# **1、 7z file**

In [None]:
import py7zlib
import time

# Open the archive through a context manager so the file handle is closed
# once the timing experiment is finished, even if py7zlib raises.
with open("/kaggle/input/cifar-10/train.7z", 'rb') as fp:
    # Build an archive object on top of the open file.
    archive = py7zlib.Archive7z(fp)

    # Collect the names of all members stored in the archive.
    names = archive.getnames()

    # search: time how long it takes to look up a single member by name
    startTime = time.time()
    # Returns the archive-file object that corresponds to the given member name.
    member = archive.getmember(names[0])
    end_1_time = time.time()
    print("search time is {}".format(end_1_time-startTime))

    # read data: time how long it takes to read that member's full contents
    data = member.read()
    end_2_time = time.time()
    print("read time is {}".format(end_2_time-end_1_time))

In [None]:
# Show only the first ten member names instead of dumping the whole list.
names[:10]

In [None]:
# Install py7zr, then use its command-line interface (`x` = extract) to unpack
# train.7z and test.7z into /kaggle/working/.
!pip install py7zr
!python -m py7zr x ../input/cifar-10/train.7z /kaggle/working/
!python -m py7zr x ../input/cifar-10/test.7z /kaggle/working/


# **2、Replacement method**

The method above is not very easy to control, so from here on the dataset is loaded with `cifar10.load_data()` from Keras instead.

**2.1 Introduction**

The CIFAR-10 dataset contains 60,000 color images of 32 x 32 pixels in 3 channels, divided into 10 classes. Each class contains 6,000 images. The training set contains 50,000 images, while the test set provides 10,000 images. This image is taken from the CIFAR repository ( https://www.cs.toronto.edu/~kriz/cifar.html ). This is a classification problem with 10 classes (multi-class classification: every image belongs to exactly one class). We can look at this image to get a better understanding of the dataset.

The challenge is to recognize previously unseen images and assign them to one of the 10 classes.

In [None]:
# Fetch CIFAR-10 through the Keras dataset helper and report the size of each split.
train_split, test_split = cifar10.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"{x_train.shape[0]} train samples")
print(f"{x_test.shape[0]} test samples")

In [None]:
# Inspect the training labels as returned by cifar10.load_data().
y_train

In [None]:
# Compare the class distribution of the training and test labels side by side.
fig, axs = plt.subplots(1,2,figsize=(15,5))

# The labels are passed through the explicit `x=` keyword: recent seaborn releases
# treat the only positional argument as `data`, so a bare array is no longer
# interpreted as the x variable.
sns.countplot(x=y_train.ravel(),ax=axs[0])
axs[0].set_title('Training data')
axs[0].set_xlabel('Classes')

sns.countplot(x=y_test.ravel(),ax=axs[1])
axs[1].set_title('Testing data')
axs[1].set_xlabel('Classes')
plt.show()

As we can see, each class contains exactly 6,000 examples (5,000 for training and 1,000 for testing).

The graph above is very important for training: for example, if we had only 1,000 samples of label 1, the model would have difficulty detecting label 1 (lower accuracy). That is not the case here, so everything looks fine. It is important to know how the dataset is distributed across the classes, because the quality of our model depends on it.

Now let's do some preprocessing.

The output variable has 10 possible values, so this is a multiclass classification problem. We need to encode these labels as one-hot vectors (e.g. "bird" -> [0,0,1,0,0,0,0,0,0,0]).

# **3、Normalize**

In [None]:
# Convert both image arrays to float32 and rescale the pixel values by 1/255.
# NOTE: re-running this cell rescales the already-scaled arrays again.
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

In [None]:
# Inspect the training images after the float32 conversion and 1/255 rescaling of the previous cell.
x_train

In [None]:
# One-hot encode the labels of both splits into `num_classes` columns.
num_classes = 10
y_train, y_test = (
    keras.utils.to_categorical(split_labels, num_classes)
    for split_labels in (y_train, y_test)
)

In [None]:
# Full shape tuple of the training image array.
x_train.shape

# **4、Defining the model architecture Using ConVnets**

In the first stage, our net will learn 32 convolutional filters, each of size 3 x 3. The output dimension is the same as the input shape, so it will be 32 x 32, and the activation is relu, which is a simple way of introducing non-linearity; this is followed by another 32 convolutional filters, each of size 3 x 3, whose activation is also relu. After that we have a max-pooling operation with pool size 3 x 3 and stride 2, and a dropout at 25%.


In the next stage of the deep pipeline, our net will learn 64 convolutional filters, each of size 3 x 3. The output dimension is the same as the input shape and the activation is relu; this is followed by another 64 convolutional filters, each of size 3 x 3, whose activation is also relu. After that we have a max-pooling operation with pool size 3 x 3 and stride 2, and a dropout at 25%. A third stage repeats the same pattern with 128 convolutional filters.


And the Final stage in the deep pipeline is a dense network with 512 units and relu activation followed by a dropout at 50% and by a softmax layer with 10 classes as output, one for each category.

In [None]:
# Three convolutional stages (32, 64 and 128 filters) followed by a dense classifier.
# Each stage: 3x3 'same' conv -> relu -> 3x3 conv -> relu -> 3x3 max-pool (stride 2) -> 25% dropout.
model = Sequential()

# Stage 1: the first layer also declares the input shape (everything but the sample axis).
model.add(Conv2D(32,(3,3),padding='same',input_shape=x_train.shape[1:]))
model.add(Activation('relu'))
model.add(Conv2D(32,(3,3)))
# The second convolution of a stage gets a relu as well, matching the architecture
# described for this notebook (it was previously fed to the pooling layer un-activated).
model.add(Activation('relu'))
model.add(MaxPooling2D(3,strides=2))
model.add(Dropout(0.25))

# Stage 2
model.add(Conv2D(64,(3,3),padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(64,(3,3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(3,strides=2))
model.add(Dropout(0.25))

# Stage 3
model.add(Conv2D(128,(3,3),padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(128,(3,3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(3,strides=2))
model.add(Dropout(0.25))

# Classifier head: 512-unit dense layer with 50% dropout, then one softmax output per class.
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))

model.add(Activation('softmax'))

model.summary()



# **5、Model training**

Before making the network ready for training, we have to make sure to add the following:

- A loss function: to measure how good the network is
- An optimizer: to update the network as it sees more data and reduce the loss value
- Metrics: to monitor the performance of the network

Also note the following about data augmentation:

One of the most common techniques to avoid overfitting is data augmentation. Overfitting generally occurs when we don't have enough data for training the model. To avoid this problem, we need to expand our dataset artificially. The idea is to alter the training data with small transformations that reproduce the variations occurring when the same kind of object is photographed under different conditions. Note that the training cell below does not apply data augmentation.

Different data augmentation techniques are as follows: cropping, rotating, scaling, translating, flipping, adding Gaussian noise to input images, etc.

In [None]:
# Adam optimiser with its hyper-parameters spelled out explicitly.
# NOTE(review): `decay` may not be accepted by every Keras optimizer version —
# confirm against the installed release.
adam_settings = dict(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    decay=1e-6,
)
opt = keras.optimizers.Adam(**adam_settings)

# Categorical cross-entropy matches the one-hot encoded labels; accuracy is tracked as the metric.
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Reset first so that a failed fit cannot leave a stale history from an earlier run behind.
history = None
print('Not using data augmentation.')
# Train directly on the image arrays for 5 epochs in shuffled mini-batches of 128.
# NOTE(review): the test split doubles as validation data here, so the validation
# metrics are not an independent hold-out estimate.
history = model.fit(x_train,y_train,
                   batch_size=128,
                   epochs=5,
                   validation_data=(x_test,y_test),
                   shuffle=True)

# **6、Evaluate the model**

6.1 Training and validation curves
Let's look at the training and validation process by visualizing the fitting history. This allows us to quickly see how well our model fits the data (overfitting, underfitting, model convergence, etc.).

In [None]:
def eva(history):
    """Plot the accuracy and loss curves for training and validation.

    Args:
        history: object whose ``history`` attribute maps the keys 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' to sequences of values.

    Draws a 1x2 figure (accuracy on the left, loss on the right) and shows it.
    """
    recorded = history.history
    _, axs = plt.subplots(1, 2, figsize=(15, 5))

    # (metric key, panel title, y-axis label) for the left and right panel.
    panels = (
        ('accuracy', 'Model Accuracy', 'Accuracy'),
        ('loss', 'Model Loss', 'Loss'),
    )
    for ax, (metric, title, ylabel) in zip(axs, panels):
        ax.plot(recorded[metric])
        ax.plot(recorded['val_' + metric])
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Epoch')
        ax.legend(['train', 'validate'], loc='upper left')
    plt.show()
    
# List the metric names recorded during training, then draw the accuracy/loss curves.
print(history.history.keys())
eva(history)

# **7、Score trained model and prediction**

In [None]:
# Evaluate on the test split; the first entry of `scores` is printed as the loss,
# the second as the accuracy.
scores = model.evaluate(x_test, y_test)
test_loss, test_accuracy = scores[0], scores[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

# Keep the model output (softmax over the classes) for every test image.
pred = model.predict(x_test)

# **8、Confusion matrix**

A confusion matrix can be very helpful for seeing your model's drawbacks. We plot the confusion matrix of the validation results. For a good visualization of the confusion matrix, we first have to define the class names used to label it.

In [None]:
# Human-readable class names; show_test looks a name up by the predicted class index.
labels = ['Airplane', 'Automobile', 'Bird', 'Cat', 'Deer', 'Dog', 'Frog', 'Horse', 'Ship', 'Truck']

# **9、Check the predictions**

In [None]:
def show_test(number):
    """Display one test image together with the class the model predicts for it.

    Args:
        number: index of the image inside ``x_test``.

    Relies on the notebook-level ``model``, ``x_test`` and ``labels``.
    """
    plt.figure(figsize=(3, 3))
    # Wrap the single image in a leading batch axis before handing it to the model.
    test_image = np.expand_dims(x_test[number], axis=0)
    # `Sequential.predict_classes` is no longer available in recent TensorFlow/Keras
    # releases; the argmax over the predicted class scores yields the same class index.
    class_scores = model.predict(test_image)
    predicted_class = int(np.argmax(class_scores, axis=-1)[0])
    plt.imshow(x_test[number])
    plt.title("Predicted: {} ".format(labels[predicted_class]))

In [None]:
# Show the prediction for the test image at index 10.
show_test(10)

# **10、Save**

In [None]:
# Persist the trained model under ./save_models inside the current working directory.
path = os.path.join(os.getcwd(),'save_models')
model_name = 'keras_cifar10_trained_model.h5'
# makedirs(exist_ok=True) replaces the check-then-create pair and is safe to re-run.
os.makedirs(path, exist_ok=True)

model_path = os.path.join(path,model_name)
model.save(model_path)
print('Saved trained model at %s ' % model_path)

# Score the model on the test split once more after saving.
scores = model.evaluate(x_test,y_test,verbose=1)
print('Test loss:',scores[0])
print('Test accuracy:', scores[1])

In [None]:
# List the contents of ../working (path relative to the current directory).
!ls ../working

In [None]:
# Directories for the train and test images under /kaggle/working.
# NOTE(review): presumably where the py7zr extraction placed the images — verify the folder names.
train_images_path = "/kaggle/working/train"
test_images_path = "/kaggle/working/test"

In [None]:
# List the contents of /kaggle/working.
!ls /kaggle/working

In [None]:
# Echo the test image directory path defined earlier.
test_images_path