In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

## Most Important
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from pathlib import Path
from PIL import Image
import scipy

## less Important
from functools import partial
import os
from scipy import stats
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D,AveragePooling2D
from keras.callbacks import LearningRateScheduler,ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator

## Sklearn
from sklearn import datasets
## Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
## Metrics
from sklearn.metrics import accuracy_score

## tensorflow & Keras
import tensorflow as tf    ## i will use tf for every thing and for keras using tf.keras
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading Data

In [1]:
# Labels come from train.csv; the PNG images live in the sibling train/ folder.
train_labels = pd.read_csv('../input/arabic-hwr-ai-pro-intake1/train.csv')
train_images = Path(r'../input/arabic-hwr-ai-pro-intake1/train')

# Collect every PNG path in sorted order and store the paths as strings in a Series.
# NOTE(review): later cells pair images with labels purely by position, which presumes
# the sorted file order matches the row order of train.csv — TODO confirm.
train_png_files = sorted(train_images.glob(r'*.png'))
train_images_paths = pd.Series(train_png_files, name='Filepath').astype(str)

train_images_paths.head()

# Explore the data

In [1]:
# Show one example image per distinct label.
#
# `img_key_value` maps each label to the row position of its first occurrence in
# train.csv; `img_label` / `img_index` are the matching keys and positions, in order
# of first appearance.
img_key_value = {}
for value in train_labels['label'].unique():
    img_key_value[value] = train_labels[train_labels['label']==value].index[0]

img_index = list(img_key_value.values())
img_label = list(img_key_value.keys())

# Derive the grid from the number of labels instead of assuming exactly 4 x 7 = 28:
# a fixed grid raises IndexError with fewer labels and silently drops any beyond 28.
n_cols = 7
n_rows = max(1, -(-len(img_label) // n_cols))  # ceiling division, at least one row
fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 2 * n_rows), squeeze=False)

# Hide the frame of every panel up front so unused trailing panels stay blank.
for panel in ax.ravel():
    panel.axis('off')

for panel, label, row_position in zip(ax.ravel(), img_label, img_index):
    img = plt.imread(train_images_paths.iloc[row_position])
    panel.imshow(img)
    panel.set_title(f'label = {label}')

In [1]:
# Report how many images and labels were loaded, then the shape of one image.
n_train_paths = len(train_images_paths)
n_train_label_rows = len(train_labels)
print('Number of Instances in train_set =>', n_train_paths)
print('Number of Instances in train_labels =>', n_train_label_rows)

print()

# Use the first sampled image (first entry of img_index) as the representative.
first_sample_path = train_images_paths.iloc[img_index[0]]
img = plt.imread(first_sample_path)
print('shape of each Image is =>', img.shape)

# Data Preprocessing

In [1]:
# Build the full training arrays: integer labels plus a float32 image tensor.
train_full_labels = train_labels['label'].values

# Size the image array from the files actually found instead of a hard-coded 13440:
# with np.empty, a smaller folder would leave uninitialised rows in the training data
# and a larger one would raise IndexError part-way through the loop.
n_train_images = len(train_images_paths)
if n_train_images != len(train_full_labels):
    # Images and labels are paired by position, so differing counts cannot be aligned.
    raise ValueError(
        f'found {n_train_images} training images but {len(train_full_labels)} labels'
    )
train_full_set = np.empty((n_train_images, 32, 32, 3), dtype=np.float32)  # keep only the first 3 channels

for idx, path in enumerate(train_images_paths):
    img = plt.imread(path)
    img = img[:,:,:3]  # drop any channel beyond the third (e.g. an alpha channel)
    train_full_set[idx] = img

print('train_full_set.shape =>', train_full_set.shape)
print('train_full_labels.shape =>', train_full_labels.shape)

# Split the data

In [1]:
# Hold out 20% of the samples for validation; the fixed seed keeps the shuffled
# split reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_full_set,
    train_full_labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Shapes of the four resulting arrays.
print('X_train.shape =>', X_train.shape)
print('X_valid.shape =>', X_valid.shape)
print('y_train.shape =>', y_train.shape)
print('y_valid.shape =>', y_valid.shape)

In [1]:
#Onehot Encoding the labels.
import keras
import keras.utils
from keras import utils as np_utils
from sklearn.utils.multiclass import unique_labels
from tensorflow.keras.utils import to_categorical



# One-hot encode the integer labels so y_train / y_valid become 2-D.
# The width is fixed once from the full label array (largest label id + 1) and passed
# explicitly: letting to_categorical infer it per split could give the two splits
# different widths if one of them happens to lack the largest label.
# NOTE(review): the width is expected to equal the 29 units of the softmax layer used
# by the networks in this notebook — TODO confirm against the label range.
num_classes = int(train_full_labels.max()) + 1
if y_train.ndim == 1:  # guard: re-running the cell must not encode an already one-hot matrix
    y_train = to_categorical(y_train, num_classes=num_classes)
if y_valid.ndim == 1:
    y_valid = to_categorical(y_valid, num_classes=num_classes)

#Verifying the dimension after one hot encoding
print((X_train.shape,y_train.shape))
print((X_valid.shape,y_valid.shape))

# Model training

In [1]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization,AveragePooling2D


In [1]:
# model = tf.keras.models.Sequential([
#     tf.keras.layers.Conv2D(filters=16, kernel_size=3, activation='relu',input_shape=(32, 32, 3)),
#     tf.keras.layers.MaxPooling2D(pool_size=2),
    
#     tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu'),
#     tf.keras.layers.MaxPooling2D(pool_size=2),
    
#     tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu', ),
#     tf.keras.layers.MaxPooling2D(pool_size=2),
    
#     tf.keras.layers.GlobalAveragePooling2D(),
#     tf.keras.layers.Dense(29, activation='softmax')
 
# ])

In [1]:
# A bare `allow_soft_placement` variable is never read by TensorFlow, so on its own
# this assignment changes nothing. Keep the name for anything that inspects it, and
# forward the value to the TF2 runtime setting that controls soft device placement.
allow_soft_placement=True
tf.config.set_soft_device_placement(allow_soft_placement)


In [1]:
def build_cnn(input_shape=(32, 32, 3), num_classes=29, dropout_rate=0.4):
    """Build and compile one CNN member of the ensemble.

    Args:
        input_shape: shape of a single input image (height, width, channels).
        num_classes: number of units in the final softmax layer.
        dropout_rate: rate used by every Dropout layer in the network.

    Returns:
        A compiled Sequential model (categorical cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    net = Sequential()

    # First convolutional stage: two 3x3 convolutions, then a strided 5x5 convolution.
    net.add(Conv2D(32, kernel_size=3, activation='relu', input_shape=input_shape))
    net.add(BatchNormalization())
    net.add(Conv2D(32, kernel_size=3, activation='relu'))
    net.add(BatchNormalization())
    net.add(Conv2D(32, kernel_size=5, strides=2, padding='same', activation='relu'))
    net.add(BatchNormalization())
    net.add(Dropout(dropout_rate))

    # Second convolutional stage: same layout with 64 filters.
    net.add(Conv2D(64, kernel_size=3, activation='relu'))
    net.add(BatchNormalization())
    net.add(Conv2D(64, kernel_size=3, activation='relu'))
    net.add(BatchNormalization())
    net.add(Conv2D(64, kernel_size=5, strides=2, padding='same', activation='relu'))
    net.add(BatchNormalization())
    net.add(Dropout(dropout_rate))

    # Third convolutional stage: a single 4x4 convolution, flattened for the classifier.
    net.add(Conv2D(128, kernel_size=4, activation='relu'))
    net.add(BatchNormalization())
    net.add(Flatten())
    net.add(Dropout(dropout_rate))

    # Output layer
    net.add(Dense(num_classes, activation='softmax'))

    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


# Number of independently initialised networks in the ensemble.
nets = 5
# `model` is a list holding one compiled network per ensemble member.
model = [build_cnn() for _ in range(nets)]

# Created once instead of on every loop iteration; the callback does not depend on
# the individual network.
early_stopp = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

In [1]:
# Display the `model` list as this cell's output to inspect the ensemble members.
model

In [1]:
# Number of networks held in the `model` list.
len(model)

In [1]:
# Real-time augmentation generator applying small random rotations, zooms and
# horizontal/vertical shifts. More detail: https://keras.io/preprocessing/image/
datagen = ImageDataGenerator(
    rotation_range=2,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
)
datagen.fit(X_train)

In [1]:
# Training hyper-parameters: images per gradient step, and passes over the data
# for each network.
batch_size, epochs = 32, 240

In [1]:
# Train every network in the ensemble on augmented batches.
# Iterating over `model` itself (rather than a hard-coded range(5)) keeps this loop in
# step with however many networks were built.
# `fit` accepts the generator directly; `fit_generator` is a deprecated alias for it.
for net_number, net in enumerate(model, start=1):
    print(f'Individual Net : {net_number}')
    net.fit(
        datagen.flow(X_train, y_train, batch_size=batch_size),
        epochs=epochs,
        steps_per_epoch=X_train.shape[0] // batch_size,
        validation_data=(X_valid, y_valid),
        # A fresh scheduler per network: cut the learning rate by 10x when the
        # training loss has not improved for 3 epochs.
        callbacks=[ReduceLROnPlateau(monitor='loss', patience=3, factor=0.1)],
        verbose=2,
    )

In [1]:
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# early_stopp = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

In [1]:
# history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), 
#                     epochs=10, batch_size=32, callbacks=[early_stopp])

In [1]:
# Display the `model` list again as this cell's output.
model

# Evaluate on test set

In [1]:
# test.csv is loaded into `test_labels`; the PNG images live in the sibling test/ folder.
test_labels = pd.read_csv('../input/arabic-hwr-ai-pro-intake1/test.csv')
test_images = Path(r'../input/arabic-hwr-ai-pro-intake1/test')

# Collect every test PNG path in sorted order and store the paths as strings in a Series.
test_png_files = sorted(test_images.glob(r'*.png'))
test_images_paths = pd.Series(test_png_files, name='Filepath').astype(str)

test_images_paths.head()

In [1]:
# Report how many test image files were found.
n_test_paths = len(test_images_paths)
print('Number of Instances in test_set is', n_test_paths)


In [1]:
# Load every test image into one float32 tensor.
# The first dimension comes from the number of files found rather than a hard-coded
# 3360: with np.empty, a smaller folder would leave uninitialised rows that still get
# predicted on, and a larger one would raise IndexError inside the loop.
n_test_images = len(test_images_paths)
test_full_set = np.empty((n_test_images, 32, 32, 3), dtype=np.float32)  #take only the first 3 channels

for idx, path in enumerate(test_images_paths):
    img = plt.imread(path)
    img = img[:,:,:3]  # drop any channel beyond the third (e.g. an alpha channel)
    test_full_set[idx] = img

print('test_full_set.shape =>', test_full_set.shape)

In [1]:
# y_preds_classes = np.argmax(model.predict(test_full_set), axis=-1)
# Collect each network's predicted class index for every test image.
labels = []
for m in model:
    class_scores = m.predict(test_full_set)
    predicts = class_scores.argmax(axis=1)
    labels.append(predicts)

# Majority vote: arrange the predictions as one row per image and one column per
# network, then keep the most frequent class in each row.
votes = np.stack(labels, axis=1)
labels = np.squeeze(scipy.stats.mode(votes, axis=1).mode)


In [1]:
# Shape of the voted label array produced by the ensemble.
labels.shape

In [1]:
# Store the ensemble's voted labels in the 'label' column of the test.csv frame.
# NOTE(review): the array is assigned by position, so this presumes the rows of
# test.csv are in the same order as the sorted image paths used for prediction — TODO confirm.
test_labels['label'] = labels

In [1]:
# Display the test frame, including its 'label' column, as this cell's output.
test_labels

In [1]:
# Write the predictions as a two-column CSV: a 1-based 'ImageId' and the voted 'Label'.
# NOTE(review): the last cell of this notebook writes an id/label file to this same
# path, so on a full run this output is replaced.
imageid_submission = pd.DataFrame({
    'ImageId': np.arange(1, predicts.shape[0] + 1),
    'Label': labels,
})
imageid_submission.to_csv('/kaggle/working/submission.csv', index=False)

In [1]:
# test_labels['label'] = y_preds_classes


In [1]:
# test_labels


In [1]:
# Final submission: keep only the 'id' and 'label' columns and write them without the index.
submission = test_labels[['id', 'label']]
submission.to_csv('/kaggle/working/submission.csv', index=False)
