# Tourism Destination Image Preprocessing

The training images are stored in the `datatrain` directory and the test images in the `dataval` directory. In both, the images are organized into one subdirectory per category.

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import cv2 as cv

from keras.utils import np_utils, to_categorical

## 1. Digital image processing functions
### Read all files in a directory to obtain the digitized images and their labels

In [3]:
import os
import glob
from imutils import paths   #Opencv-based toolbox for image file processing
import imutils
from sklearn.utils import shuffle

traindata='datatrain\\'    #Set the directory where the training images are located
testdata='dataval\\'       #Set the directory where the test images are located
img_size=192               #Set the size of each picture

# Each entry of a data directory is a category name, and a category's integer
# class index is its position in the corresponding list below.
# os.listdir() returns entries in arbitrary order, so the lists are sorted to
# keep the class indices stable from run to run.
train_labels = sorted(os.listdir(traindata))
test_labels = sorted(os.listdir(testdata))

# The two lists are indexed independently; if they differ, the same category
# would get a different class index in the training and test data.
if train_labels != test_labels:
    print("Warning: training and test category lists differ; class indices will not match.")

### Image data preprocessing support functions

In [4]:
#Get subdirectories under the image directory, each subdirectory represents a category
def get_fullpath(data):
    r"""Return the path of every category subdirectory directly under ``data``.

    Args:
        data: Directory path that already ends with a path separator
            (e.g. ``'datatrain\\'``); entry names are appended to it by plain
            string concatenation.

    Returns:
        list[str]: ``data + name + "\\"`` for each subdirectory of ``data``,
        sorted by name. Entries that are not directories are left out.
    """
    print("Loading file structure...",data)
    # os.listdir() order is arbitrary, so sort for a reproducible traversal.
    # Plain files sitting next to the category folders are not categories and
    # are skipped.
    return [
        data + name + "\\"
        for name in sorted(os.listdir(data))
        if os.path.isdir(data + name)
    ]

#Get the files under each sub-directory (ie picture category) and digitize it
def get_imgdata(data,labels,size=None):
    r"""Load and resize every ``*.jpg`` found in the category folders of ``data``.

    Args:
        data: Root image directory, ending with a path separator; its
            subdirectories are obtained from ``get_fullpath(data)``.
        labels: List of category names. An image's label is the index of its
            folder name in this list.
        size: Edge length in pixels of the square output images. Defaults to
            the notebook-level ``img_size``.

    Returns:
        tuple: ``(X_data, y_labels)`` where ``X_data`` is a list of RGB images
        resized to ``(size, size)`` and ``y_labels`` is the list of matching
        integer label indices.

    Raises:
        ValueError: If a folder name is not present in ``labels``.
    """
    if size is None:
        size = img_size    # fall back to the notebook-level setting
    X_data = []
    y_labels = []
    total_images = 0
    skipped_images = 0

    for folder in get_fullpath(data):
        images_in_folder = 0
        # The category name is the LAST path component. Taking a fixed
        # position would return the wrong name when `data` itself contains
        # more than one directory level.
        label = folder.rstrip('\\/').replace('/', '\\').split('\\')[-1]
        labelIndex = labels.index(label)
        print('label:',label,':',labelIndex)
        for file in glob.glob(folder + "*.jpg"):
            try:
                raw = cv.imread(file)
                # cv.imread signals an unreadable file by returning None
                # instead of raising.
                if raw is None:
                    raise ValueError("image could not be decoded")
                img = cv.cvtColor(raw, cv.COLOR_BGR2RGB)
            except Exception as err:
                # Best effort: a bad file is skipped, but reported rather than
                # silently dropped. Catching Exception (not a bare except)
                # lets KeyboardInterrupt still stop a long load.
                skipped_images += 1
                print("Skipping unreadable image %s: %s" % (file, err))
                continue
            img = cv.resize(img, (size, size))
            total_images += 1
            X_data.append(img)
            y_labels.append(labelIndex)
            images_in_folder += 1
        print("The total number of images in %s = %d" % (folder,images_in_folder))
    print("The total number of images in data = " + str(total_images))
    if skipped_images:
        print("The total number of skipped images = " + str(skipped_images))

    return X_data,y_labels

#Normalize and standardize the digitized pictures
def data_norm(x_data):
    """Scale images to [0, 1] and standardize each position across the set.

    Args:
        x_data: Sequence of equally shaped images with values in 0-255.

    Returns:
        numpy.ndarray: ``float16`` array shaped like ``np.array(x_data)``. The
        values are divided by 255, then the mean over axis 0 (the image axis)
        is subtracted and the result is divided by the standard deviation over
        axis 0. Positions whose value is identical in every image come out
        as 0.
    """
    x_data = np.array(x_data, np.float16) / 255.

    mean = x_data.mean(axis=0)
    std = x_data.std(axis=0)
    # A position that is constant across all images has std == 0, and 0/0
    # would put NaN into the saved array. Dividing by 1 there yields 0.
    std = np.where(std == 0, np.float16(1), std)

    return (x_data - mean) / std

### The main function for data processing
The processed image data and the one-hot labels are saved as NumPy `.npy` files.

In [5]:
def dataToNpy(img_size):
    """Preprocess the training and test image sets and save them as .npy files.

    Writes ``y_train<size>N.npy``, ``X_train<size>N.npy``, ``y_test<size>N.npy``
    and ``X_test<size>N.npy`` to the current working directory.

    Args:
        img_size: Edge length in pixels of the square images; also embedded in
            the output file names.
    """
    # get_imgdata() resizes to the notebook-level `img_size`, not to this
    # parameter. Sync the two so the saved arrays really have the size that
    # the file names advertise.
    globals()['img_size'] = img_size

    _save_split(traindata, train_labels, 'train', img_size,
                "Save training label data successfully！",
                "Save training data successfully！")
    _save_split(testdata, test_labels, 'test', img_size,
                "Save test label data successfully！！",
                "Save test data successfully！")

    print('Data processing completed！')


def _save_split(data_dir, labels, split, img_size, label_msg, data_msg):
    """Load, shuffle, encode, normalize and save one data split.

    The arrays are local to this call, so they are released when it returns
    and the two splits are never held in memory together.

    NOTE(review): each split is standardized with its own statistics by
    data_norm(); confirm whether the test split should reuse the training
    statistics instead.
    """
    X, y = get_imgdata(data_dir, labels)
    X, y = shuffle(X, y, random_state=0)    # fixed seed keeps the order reproducible

    y = to_categorical(y)                   # one-hot encode
    np.save('y_%s%dN.npy' % (split, img_size), y)
    print(label_msg)

    X = data_norm(X)
    np.save('X_%s%dN.npy' % (split, img_size), X)
    print(data_msg)

## 2. Perform image data preprocessing

Set the image size and call the function above to generate the processed image data used later for model training.

In [6]:
# Edge length in pixels of the square output images. get_imgdata() reads this
# notebook-level name when it resizes each picture.
img_size=192
# Build and save the train/test arrays; the size is also embedded in the
# names of the .npy files that are written.
dataToNpy(img_size)

Loading file structure... datatrain\
label: E-FV : 0
The total number of images in datatrain\E-FV\ = 501
label: E-M : 1
The total number of images in datatrain\E-M\ = 479
label: E-S : 2
The total number of images in datatrain\E-S\ = 536
label: E-WC : 3
The total number of images in datatrain\E-WC\ = 509
label: L-B : 4
The total number of images in datatrain\L-B\ = 2252
label: L-C : 5
The total number of images in datatrain\L-C\ = 415
label: L-E : 6
The total number of images in datatrain\L-E\ = 1312
label: L-H : 7
The total number of images in datatrain\L-H\ = 561
label: L-O : 8
The total number of images in datatrain\L-O\ = 3007
label: L-S : 9
The total number of images in datatrain\L-S\ = 710
label: R-A : 10
The total number of images in datatrain\R-A\ = 629
label: R-L : 11
The total number of images in datatrain\R-L\ = 1576
label: S-M : 12
The total number of images in datatrain\S-M\ = 756
label: S-P : 13
The total number of images in datatrain\S-P\ = 626
label: T-A : 14
The total n

### Notes
Different models expect different input image sizes. To prepare data for another model, set `img_size` to the size that model requires and run the preprocessing again:

img_size = <required size>

dataToNpy(img_size)