### Importing all required libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
''' importing seaborn QQ plot module '''
try :
    from seaborn_qqplot import pplot
except:
    !pip install seaborn_qqplot
    from seaborn_qqplot import pplot

In [None]:
from kaggle_datasets import KaggleDatasets

# Changing plot style

In [None]:
sns.set_style("whitegrid")

# Image reading and transformation library from scikit-image

In [None]:
from skimage import io
from skimage.transform import resize

# Reading train and test csv file 

In [None]:
# Locations of the competition metadata tables and of the JPEG folders.
train_file = r"../input/petfinder-pawpularity-score/train.csv"
train_jpeg_path = "../input/petfinder-pawpularity-score/train/"
test_file = r"../input/petfinder-pawpularity-score/test.csv"
test_jpeg_path = "../input/petfinder-pawpularity-score/test/"

# Load both comma-separated tables into data frames.
train_df, test_df = (
    pd.read_csv(csv_path, sep=",") for csv_path in (train_file, test_file)
)

# Last expression of the cell: display the first rows of the training table.
train_df.head()

# This stores the min and max values used for the min-max scaling operation in the TF record

In [None]:
# Smallest target value and the target range (max - min): the two constants a
# min-max scaling of "Pawpularity" needs.
MIN_DATA = train_df["Pawpularity"].min()
DENOMINATOR = train_df["Pawpularity"].max() - MIN_DATA

# Checking for any missing values in feature column

In [None]:
train_df.info()

# Checking distribution of train dataframe

In [None]:
train_df.describe().T

# Data set size

In [None]:
# Report the (rows, columns) shape of both tables.
print(f"train data frame size ={train_df.shape}")
print(f"test data frame size ={test_df.shape}")

## Let's look at the metadata to understand how each feature influences the popularity of each image / animal adoption

In [None]:
def create_line_plot(x_value, y_value, color, title):
    """Draw one wide seaborn line plot and show it.

    Parameters
    ----------
    x_value : values plotted along the x axis.
    y_value : values plotted along the y axis.
    color : values handed to seaborn as the ``hue`` grouping variable.
    title : text used as the figure-level title.
    """
    figure = plt.figure(figsize=(20, 6), dpi=90)
    figure.suptitle(title)
    sns.lineplot(x=x_value, y=y_value, hue=color)
    plt.show()

In [None]:
# One line plot per metadata column: Pawpularity along the row index, coloured
# by the column's value. The identifier and the target itself are skipped.
for each_feature in train_df.columns:
    if each_feature in ("Id", "Pawpularity"):
        continue

    create_line_plot(
        x_value=train_df.index,
        y_value=train_df["Pawpularity"],
        color=train_df[each_feature],
        title=" Trend popularity if photo has " + each_feature,
    )

#  Checking Distribution of ID for each category

In [None]:
# Count plot of every metadata column, laid out on a 3x4 grid of subplots.
fig = plt.figure(figsize=(20, 10), dpi=90)
counter = 0
for each_feature in train_df.columns:
    if each_feature not in ("Id", "Pawpularity"):
        counter += 1  # 1-based position of the next subplot
        plt.subplot(3, 4, counter)
        sns.countplot(data=train_df, x=each_feature)
plt.tight_layout()
plt.show()

# Check popularity score distribution for different categories

In [None]:
# Histogram (with KDE) of the target for every metadata column, split by the
# column's value, laid out on a 3x4 grid of subplots.
fig = plt.figure(figsize=(20, 10), dpi=90)
counter = 0
for each_feature in train_df.columns:
    if each_feature not in ("Id", "Pawpularity"):
        counter += 1  # 1-based position of the next subplot
        plt.subplot(3, 4, counter)
        plt.title("Pawpularity for different " + each_feature)
        sns.histplot(x=train_df["Pawpularity"], hue=train_df[each_feature], kde=True)
plt.tight_layout()
plt.show()

# Lets visualize few images 

In [None]:
# Show the first images of the training table on a 3x3 grid. Each title lists
# the metadata columns whose value is non-zero for that image.
sns.set_style("white")
fig = plt.figure(figsize=(8, 8), dpi=300)
for each_index in range(min(9, len(train_df))):
    plt.subplot(3, 3, each_index + 1)

    # Fetch the row once, by position, and use it for both the file name and
    # the title so the two can never refer to different rows.
    data = train_df.iloc[each_index]
    img_data = io.imread(train_jpeg_path + data["Id"] + ".jpg")
    plt.imshow(resize(img_data, (800, 800), anti_aliasing=True))

    title_text = "Photo with \n" + ",".join(
        col_name
        for col_name, value in data.items()
        if col_name not in ("Id", "Pawpularity") and value != 0
    )
    plt.xticks(fontsize=1)
    plt.yticks(fontsize=1)
    plt.title(title_text, fontsize=5)

plt.tight_layout()
plt.show()

# Thing to note: image sizes are not uniform. We have to consider resizing all images while working with the model

# Let's start building the model

In [None]:
## import all required building library 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import tensorflow as tf

In [None]:
class CFG:
    """Central place for the notebook's tunable settings."""

    # --- reproducibility / data split ---
    SEED = 100            # random seed shared by splitting and shuffling
    TEST_SIZE = 0.2       # fraction held out by each train_test_split call
    SHUFFLE_DATA = True   # shuffle rows before splitting

    # --- image geometry ---
    IMG_HEIGHT = 300      # image height in pixels
    IMG_WIDTH = 300       # image width in pixels

    # --- input pipeline ---
    BUFFER_SIZE = 1000    # buffer size used when shuffling
    BATCH_SIZE = 200      # number of images in each batch
    

In [None]:
# Store the full path of every training image (folder + Id + ".jpg") in a new
# column, so the TensorFlow input pipelines can read it from the data frame.
train_df["img_path"] = train_df["Id"].radd(train_jpeg_path).add(".jpg")
train_df.head()

In [None]:

def create_test_val_train_split():
    """Split ``train_df`` into train / validation / test frames and scale the target.

    ``CFG.TEST_SIZE`` of ``train_df`` is first held out as the test split; the
    same fraction of the remainder is then held out as the validation split.
    A ``MinMaxScaler`` is fitted on the training split's "Pawpularity" only and
    applied to all three splits as the new column "popularity_transformed".

    Returns
    -------
    tuple
        ``(train_split_df, val_split_df, test_split_df, min_max_scalar_obj)``
        -- note the order: validation comes before test.
    """
    split_kwargs = dict(
        test_size=CFG.TEST_SIZE, shuffle=CFG.SHUFFLE_DATA, random_state=CFG.SEED
    )
    train_split_df, test_split_df = train_test_split(train_df, **split_kwargs)
    train_split_df, val_split_df = train_test_split(train_split_df, **split_kwargs)

    # The splits are selections taken from ``train_df``; copy them so adding a
    # column below is an unambiguous write to an independent frame rather than
    # a chained assignment (which pandas warns about).
    train_split_df = train_split_df.copy()
    val_split_df = val_split_df.copy()
    test_split_df = test_split_df.copy()

    def _target_column(frame):
        # The scaler expects a 2-D array with a single feature column.
        return frame["Pawpularity"].to_numpy().reshape(-1, 1)

    min_max_scalar_obj = MinMaxScaler()
    # Fit on the training split only, then reuse the fitted range elsewhere.
    train_split_df["popularity_transformed"] = min_max_scalar_obj.fit_transform(
        _target_column(train_split_df)
    ).ravel()
    test_split_df["popularity_transformed"] = min_max_scalar_obj.transform(
        _target_column(test_split_df)
    ).ravel()
    val_split_df["popularity_transformed"] = min_max_scalar_obj.transform(
        _target_column(val_split_df)
    ).ravel()

    return train_split_df, val_split_df, test_split_df, min_max_scalar_obj

# Build the three splits (the fitted scaler is not needed here) and report
# the shape of each one.
train_split_df, val_split_df, test_split_df, _ = create_test_val_train_split()
print(f"Test data frame size after split = {test_split_df.shape}")
print(f"Train data frame size after split = {train_split_df.shape}")
print(f"Validation data frame size after split = {val_split_df.shape}")

# check updated Train dataframe

In [None]:
train_split_df.head()

# Check Updated Test Data Frame

In [None]:
test_split_df.head()

# Check Updated Validation Data Frame

In [None]:
val_split_df.head()

# Check Distribution of Train and test popularity_transformed column 

In [None]:
# Side-by-side histograms (with KDE) of the scaled target in the three splits.
sns.set_style("whitegrid")
fig, axis = plt.subplots(1, 3, figsize=(15, 8), dpi=90)
axis = axis.ravel()

# Left panel: training split.
sns.histplot(x=train_split_df["popularity_transformed"], kde=True, ax=axis[0])
axis[0].set_title("Train split data frame transformed popularity score ")

# Middle panel: test split.
sns.histplot(x=test_split_df["popularity_transformed"], kde=True, ax=axis[1])
axis[1].set_title("Test split data frame transformed popularity score ")

# Right panel: validation split.
sns.histplot(x=val_split_df["popularity_transformed"], kde=True, ax=axis[2])
axis[2].set_title("Validation split data frame transformed popularity score ")

plt.show()



# Train QQ plot

In [None]:

# QQ plot of the scaled target column of the training split (the same column
# is passed as both x and y), with a fitted line and no identity line.
pplot(
    data=train_split_df,
    x="popularity_transformed",
    y="popularity_transformed",
    kind="qq",
    height=4,
    aspect=2,
    display_kws=dict(identity=False, fit=True),
)


# Test QQ plot

In [None]:
# QQ plot of the scaled target column of the test split (the same column is
# passed as both x and y), with a fitted line and no identity line.
pplot(
    data=test_split_df,
    x="popularity_transformed",
    y="popularity_transformed",
    kind="qq",
    height=4,
    aspect=2,
    display_kws=dict(identity=False, fit=True),
)



# Validation QQ Plot

In [None]:
# QQ plot of the scaled target column of the validation split (the same column
# is passed as both x and y), with a fitted line and no identity line.
pplot(
    data=val_split_df,
    x="popularity_transformed",
    y="popularity_transformed",
    kind="qq",
    height=4,
    aspect=2,
    display_kws=dict(identity=False, fit=True),
)



# Creating TF record

In [None]:
# Pick the distribution strategy: a TPU strategy when a TPU can be reached,
# otherwise TensorFlow's current default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Non-experimental name of the TPU strategy class.
    strategy = tf.distribute.TPUStrategy(tpu)
except Exception as tpu_error:
    # Best-effort detection: keep running without a TPU, but report why the
    # fallback happened instead of hiding the reason. Unlike a bare
    # ``except:``, this lets KeyboardInterrupt / SystemExit propagate.
    print('TPU not available, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:

def create_test_val_train_split():
    """Split ``train_df`` into train / validation / test frames and scale the target.

    ``CFG.TEST_SIZE`` of ``train_df`` is first held out as the test split; the
    same fraction of the remainder is then held out as the validation split.
    A ``MinMaxScaler`` is fitted on the training split's "Pawpularity" only and
    applied to all three splits as the new column "popularity_transformed".

    Returns
    -------
    tuple
        ``(train_split_df, val_split_df, test_split_df, min_max_scalar_obj)``
        -- note the order: validation comes before test.
    """
    split_kwargs = dict(
        test_size=CFG.TEST_SIZE, shuffle=CFG.SHUFFLE_DATA, random_state=CFG.SEED
    )
    train_split_df, test_split_df = train_test_split(train_df, **split_kwargs)
    train_split_df, val_split_df = train_test_split(train_split_df, **split_kwargs)

    # The splits are selections taken from ``train_df``; copy them so adding a
    # column below is an unambiguous write to an independent frame rather than
    # a chained assignment (which pandas warns about).
    train_split_df = train_split_df.copy()
    val_split_df = val_split_df.copy()
    test_split_df = test_split_df.copy()

    def _target_column(frame):
        # The scaler expects a 2-D array with a single feature column.
        return frame["Pawpularity"].to_numpy().reshape(-1, 1)

    min_max_scalar_obj = MinMaxScaler()
    # Fit on the training split only, then reuse the fitted range elsewhere.
    train_split_df["popularity_transformed"] = min_max_scalar_obj.fit_transform(
        _target_column(train_split_df)
    ).ravel()
    test_split_df["popularity_transformed"] = min_max_scalar_obj.transform(
        _target_column(test_split_df)
    ).ravel()
    val_split_df["popularity_transformed"] = min_max_scalar_obj.transform(
        _target_column(val_split_df)
    ).ravel()

    return train_split_df, val_split_df, test_split_df, min_max_scalar_obj

# Build the three splits (the fitted scaler is not needed here) and report
# the shape of each one.
train_split_df, val_split_df, test_split_df, _ = create_test_val_train_split()
print(f"Test data frame size after split = {test_split_df.shape}")
print(f"Train data frame size after split = {train_split_df.shape}")
print(f"Validation data frame size after split = {val_split_df.shape}")

In [None]:
CREATE_TF_RECORD = False

def _bytes_feature(value):
    """Wrap one string / bytes value in a ``tf.train.Feature`` (bytes_list).

    Eager tensors are converted with ``.numpy()`` first, because ``BytesList``
    won't unpack a string from an EagerTensor.
    """
    eager_tensor_type = type(tf.constant(0))
    raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw_value]))

def _float_feature(value):
    """Wrap one float / double in a ``tf.train.Feature`` (float_list)."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _uint8_feature(value):
    """Wrap one bytes value in a ``tf.train.Feature`` (bytes_list).

    The value is stored as-is in a ``BytesList``; in this notebook it is the
    raw byte buffer of an image array.
    """
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)


def _int64_feature(value):
    """Wrap one bool / enum / int / uint in a ``tf.train.Feature`` (int64_list)."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)


def train_serialize_example(image, img_id, score):
    """Serialise one record as a ``tf.train.Example`` byte string.

    Parameters
    ----------
    image : bytes stored under the key ``'image'``.
    img_id : bytes / string stored under the key ``'image_id'``.
    score : float stored under the key ``'score'``.

    Returns
    -------
    The serialised ``tf.train.Example``.
    """
    features = tf.train.Features(
        feature={
            'image': _uint8_feature(image),
            'image_id': _bytes_feature(img_id),
            'score': _float_feature(score),
        }
    )
    return tf.train.Example(features=features).SerializeToString()




if CREATE_TF_RECORD:

    def _load_rgb_uint8(image_path, height=300, width=300):
        """Read an image file and return it as a (height, width, 3) uint8 array."""
        pixels = io.imread(image_path)
        if pixels.ndim == 2:
            # Single-channel file: replicate the channel to obtain three.
            pixels = np.stack([pixels] * 3, axis=-1)
        pixels = pixels[..., :3]  # keep the first three channels only
        # Interpolating resize. ``np.resize`` must not be used here: it only
        # truncates / repeats the flattened buffer to fill the new shape and
        # therefore scrambles the picture. ``preserve_range`` keeps 0-255.
        pixels = resize(pixels, (height, width), anti_aliasing=True, preserve_range=True)
        return np.clip(np.rint(pixels), 0, 255).astype(np.uint8)

    dest_file = "./"

    # NOTE(review): create_test_val_train_split() returns
    # (train, val, test, scaler), so ``temp_test`` below actually receives the
    # validation split and ``temp_val`` the test split. The unpacking order is
    # kept unchanged because the generated file names (image counts) are
    # hard-coded further down in the notebook -- confirm before swapping.
    temp_train, temp_test, temp_val, min_max_obj = create_test_val_train_split()

    for split_name, split_df in (("train", temp_train), ("test", temp_test), ("val", temp_val)):
        img_id = split_df["Id"].values
        file_name = split_df["img_path"].values
        target = split_df["popularity_transformed"].values
        tf_record_name = (
            "Pet_Finder_Score_300x300_tf_record_" + split_name
            + "_img_count-" + str(len(file_name)) + ".tfrec"
        )

        # The context manager closes the writer; no explicit close() is needed.
        with tf.io.TFRecordWriter(dest_file + tf_record_name) as writer:
            for each_id, each_file, each_target in zip(img_id, file_name, target):
                read_img = _load_rgb_uint8(each_file)
                writer.write(
                    train_serialize_example(read_img.tobytes(), str.encode(each_id), each_target)
                )

        print("completed " + tf_record_name)
    
    

## Reading TF record File to train model 

In [None]:


def prepare_target(target):
    """Return ``target`` cast to ``tf.float32``; nothing else is applied."""
    return tf.cast(target, tf.float32)

def read_labeled_tfrecord(example):
    """Parse one serialised record into an ``(image, target)`` pair.

    The record carries three scalar features: ``image`` and ``image_id`` as
    byte strings and ``score`` as a float. The image bytes are decoded as
    uint8, divided by 255 and reshaped to
    ``(CFG.IMG_HEIGHT, CFG.IMG_WIDTH, 3)``; the score goes through
    ``prepare_target``. ``image_id`` is parsed but not returned.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),     # raw image bytes
        "image_id": tf.io.FixedLenFeature([], tf.string),
        "score": tf.io.FixedLenFeature([], tf.float32),    # shape [] = one element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)

    pixels = tf.io.decode_raw(parsed['image'], tf.uint8)
    scaled = tf.divide(pixels, 255)
    image = tf.reshape(scaled, (CFG.IMG_HEIGHT, CFG.IMG_WIDTH, 3))

    return image, prepare_target(parsed['score'])



def load_dataset(fileids, labeled=True, ordered=False):
    """Build a dataset of parsed ``(image, target)`` pairs from TFRecord files.

    Parameters
    ----------
    fileids : TFRecord file name(s) to read.
    labeled : accepted for interface compatibility; not used by this function.
    ordered : when False, deterministic ordering is switched off in the
        dataset options (trading element order for speed); when True the
        options are left at their defaults.
    """
    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False

    return (
        tf.data.TFRecordDataset(fileids, num_parallel_reads=tf.data.AUTOTUNE)
        .with_options(options)
        .map(read_labeled_tfrecord, num_parallel_calls=tf.data.AUTOTUNE)
    )


## Main function 
def get_training_dataset_tf_rec(file_ist, repeat=True, order=False, drop_remainder=True):
    """Build the batched input pipeline from TFRecord files.

    Parameters
    ----------
    file_ist : TFRecord file name(s), forwarded to ``load_dataset``.
    repeat : repeat the data indefinitely (needed when training for several
        epochs with ``steps_per_epoch``); pass False for a single pass.
    order : keep the records in file order. When True, reading is
        deterministic and no shuffling is applied; when False (default) the
        records are read in any order and shuffled.
    drop_remainder : drop the last batch if it is smaller than
        ``CFG.BATCH_SIZE``.

    Returns
    -------
    A ``tf.data.Dataset`` yielding ``(image_batch, target_batch)``.
    """
    dataset = load_dataset(file_ist, labeled=True, ordered=order)

    # Cache a single pass over the parsed records. Caching must come before
    # repeat(): caching an endlessly repeated dataset never completes and
    # keeps growing. NOTE: this holds one decoded copy of the data in memory.
    dataset = dataset.cache()

    if not order:
        # Shuffle after the cache so every pass sees a fresh order.
        dataset = dataset.shuffle(20, seed=CFG.SEED)
    if repeat:
        dataset = dataset.repeat()

    dataset = dataset.batch(CFG.BATCH_SIZE, drop_remainder=drop_remainder)
    # Prefetch last so the next batch is prepared while the current one is used.
    return dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
#train_data_gen = get_training_dataset_tf_rec ( train_files ,repeat = True, order = False , drop_remainder= False   )

In [None]:
def read_jpeg(image_path):
    """Load one JPEG file as a 3-channel image resized and divided by 255.

    The file is read, decoded with three channels, resized to
    ``CFG.IMG_HEIGHT`` x ``CFG.IMG_WIDTH`` and divided by 255.
    """
    encoded = tf.io.read_file(image_path)
    decoded = tf.io.decode_jpeg(encoded, channels=3)
    resized = tf.image.resize(images=decoded, size=[CFG.IMG_HEIGHT, CFG.IMG_WIDTH])
    return tf.divide(resized, 255)
    
    
    
def transform_jepg():
    """Return a mapper that turns ``(image_path, score)`` into ``(image, score)``.

    The returned function loads the image with ``read_jpeg`` and passes the
    score through unchanged.
    """

    def _load_pair(image_path, score):
        image = read_jpeg(image_path)
        return image, score

    return _load_pair

def data_generator(image_files, score):
    """Build an endlessly repeating, batched dataset from JPEG paths and scores.

    Parameters
    ----------
    image_files : sequence of image file paths.
    score : sequence of target values, aligned with ``image_files``.

    Returns
    -------
    A ``tf.data.Dataset`` yielding ``(image_batch, score_batch)`` with
    ``CFG.BATCH_SIZE`` elements per batch (the last batch of each pass may be
    smaller).
    """
    read_transform_jpeg = transform_jepg()
    AUTO_TUNE = tf.data.AUTOTUNE

    data_gen = tf.data.Dataset.from_tensor_slices((image_files, score))

    # Shuffle the (path, score) pairs *before* decoding: the shuffle buffer
    # then holds short strings and floats instead of full decoded images.
    data_gen = data_gen.shuffle(
        buffer_size=CFG.BUFFER_SIZE, seed=CFG.SEED, reshuffle_each_iteration=True
    )
    data_gen = data_gen.map(map_func=read_transform_jpeg, num_parallel_calls=AUTO_TUNE)
    data_gen = data_gen.batch(batch_size=CFG.BATCH_SIZE, drop_remainder=False)

    # repeat() without a count repeats indefinitely; the earlier
    # ``repeat(True)`` passed a boolean as the repetition count instead.
    data_gen = data_gen.repeat()

    # A single prefetch at the very end, sized automatically. Prefetching
    # CFG.BUFFER_SIZE *batches* would try to hold that many batches in memory.
    data_gen = data_gen.prefetch(buffer_size=AUTO_TUNE)
    return data_gen

#train_data_gen = data_generator( image_files = train_split_df["img_path"].values[:10], score = train_split_df["popularity_transformed"].values[:10] )

In [None]:
def mobile_net_model(pre_trained_model_trainable=False):
    """Build a MobileNetV2-based regression model with a single output.

    The ImageNet-pretrained MobileNetV2 (without its classification top) is
    attached directly to the input layer; its output is globally
    average-pooled, flattened and passed through two dense layers (100 units,
    then 1 unit).

    Parameters
    ----------
    pre_trained_model_trainable : whether the MobileNetV2 layers are trainable.

    Returns
    -------
    An uncompiled ``tf.keras.Model`` taking images of shape
    ``(CFG.IMG_HEIGHT, CFG.IMG_WIDTH, 3)``.

    Notes
    -----
    An earlier version created GaussianNoise / RandomCrop / RandomFlip /
    RandomZoom / RandomContrast layers on the input but never fed their output
    to the backbone, so they were not part of the returned model. That dead
    code is removed here; the resulting graph is unchanged. To really add
    augmentation, the augmented tensor has to be the backbone's input (and a
    30x30 RandomCrop would not match the 300x300 backbone input).
    """
    image_shape = (CFG.IMG_HEIGHT, CFG.IMG_WIDTH, 3)
    input_layer = tf.keras.layers.Input(shape=image_shape, name="input_layer")

    # ``pooling`` accepts None, "avg" or "max". The previous value ``True`` is
    # none of these; None is used explicitly and pooling is done below.
    backbone = tf.keras.applications.MobileNetV2(
        input_shape=image_shape,
        input_tensor=input_layer,
        include_top=False,
        weights="imagenet",
        pooling=None,
    )
    backbone.trainable = pre_trained_model_trainable

    layer_00 = backbone.output
    layer_01 = tf.keras.layers.GlobalAvgPool2D()(layer_00)
    # Kept so the layer stack (and its layer names) stays as before.
    layer_02 = tf.keras.layers.Flatten(name="flatten_layer")(layer_01)

    # NOTE(review): LeakyReLU with alpha = 1 has slope 1 on both sides, i.e.
    # it acts as a linear activation -- confirm this is intended.
    layer_03 = tf.keras.layers.Dense(
        units=100,
        activation=tf.keras.layers.LeakyReLU(alpha=1),
        use_bias=True,
        kernel_initializer='glorot_uniform',
    )(layer_02)
    layer_04 = tf.keras.layers.Dense(
        units=1,
        activation=tf.keras.layers.LeakyReLU(alpha=1),
        use_bias=True,
        kernel_initializer='glorot_uniform',
    )(layer_03)

    model = tf.keras.Model(inputs=input_layer, outputs=layer_04)

    return model

In [None]:

# Resolve the GCS location of the input data: the pre-built TFRecord dataset
# when more than one replica is in use, the competition dataset otherwise.
gcs_path = KaggleDatasets().get_gcs_path(
    "pet-finder-tf-record-300x300-rev-01"
    if strategy.num_replicas_in_sync != 1
    else "petfinder-pawpularity-score"
)

# TFRecord files of the three splits (each a one-element list of paths).
train_file_name = [f"{gcs_path}/Pet_Finder_Score_300x300_tf_record_train_img_count-6343.tfrec"]
val_file_name = [f"{gcs_path}/Pet_Finder_Score_300x300_tf_record_val_img_count-1983.tfrec"]
test_file_name = [f"{gcs_path}/Pet_Finder_Score_300x300_tf_record_test_img_count-1586.tfrec"]



# Pick the distribution strategy: a TPU strategy when a TPU can be reached,
# otherwise TensorFlow's current default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Non-experimental name of the TPU strategy class.
    strategy = tf.distribute.TPUStrategy(tpu)
except Exception as tpu_error:
    # Best-effort detection: keep running without a TPU, but report why the
    # fallback happened instead of hiding the reason. Unlike a bare
    # ``except:``, this lets KeyboardInterrupt / SystemExit propagate.
    print('TPU not available, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

class CFG:
    """Central place for the notebook's tunable settings (training cell)."""

    # --- reproducibility / data split ---
    SEED = 100            # random seed shared by splitting and shuffling
    # NOTE(review): 0.3 here, while the TFRecord file names used in this cell
    # carry fixed image counts -- confirm the data-frame splits built with this
    # value are meant to differ from the records.
    TEST_SIZE = 0.3       # fraction held out by each train_test_split call
    SHUFFLE_DATA = True   # shuffle rows before splitting

    # --- image geometry ---
    IMG_HEIGHT = 300      # image height in pixels
    IMG_WIDTH = 300       # image width in pixels

    # --- input pipeline ---
    BUFFER_SIZE = 1024    # buffer size used when shuffling
    BATCH_SIZE = 100      # number of images in each batch

# With more than one replica, use 70 images per replica as the batch size.
if strategy.num_replicas_in_sync != 1:
    CFG.BATCH_SIZE = 70 * strategy.num_replicas_in_sync
CFG.BATCH_SIZE

In [None]:
# Create and compile the model inside the strategy scope: Adam optimiser,
# mean-squared-error loss, and mean squared error reported as a metric.
with strategy.scope():
    model_1 = mobile_net_model(pre_trained_model_trainable=True)
    optimizer1 = tf.keras.optimizers.Adam(learning_rate=0.001)
    model_1.compile(
        optimizer=optimizer1,
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.MeanSquaredError()],
    )

# Rebuild the data-frame splits and keep the fitted target scaler this time.
train_split_df, val_split_df, test_split_df, min_max_scalar_obj = create_test_val_train_split()

if False:
    # Input pipeline fed from the JPEG files listed in the split data frames.
    train_data_gen = data_generator(
        image_files=train_split_df["img_path"].values,
        score=train_split_df["popularity_transformed"].values,
    )
    val_data_gen = data_generator(
        image_files=val_split_df["img_path"].values,
        score=val_split_df["popularity_transformed"].values,
    )
    # Steps per pass = ceil(row count / batch size), via negated floor division.
    val_steps = -(-val_split_df.shape[0] // CFG.BATCH_SIZE)
    train_steps = -(-train_split_df.shape[0] // CFG.BATCH_SIZE)

else:
    # Input pipeline fed from the TFRecord files.
    train_data_gen = get_training_dataset_tf_rec(
        train_file_name, repeat=True, order=False, drop_remainder=False
    )
    val_data_gen = get_training_dataset_tf_rec(
        val_file_name, repeat=True, order=False, drop_remainder=False
    )
    # Steps per pass = ceil(record count / batch size); the counts are the
    # image counts that appear in the TFRecord file names.
    val_steps = -(-1983 // CFG.BATCH_SIZE)
    train_steps = -(-6343 // CFG.BATCH_SIZE)



# Multiply the learning rate by 0.45 when val_loss has not improved by at least
# 0.02 for 38 epochs (3 epochs cooldown, never below 1e-6).
lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='auto',
    factor=0.45,
    patience=38,
    min_delta=0.02,
    cooldown=3,
    min_lr=0.000001,
    verbose=1,
)

# Keep only the weights of the epoch with the lowest val_loss.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    './model1.h5', monitor='val_loss', mode='min', save_best_only=True
)

# Train for 100 epochs; the step counts bound each pass over the repeating
# training and validation datasets.
model_history = model_1.fit(
    train_data_gen,
    epochs=100,
    steps_per_epoch=train_steps,
    validation_data=val_data_gen,
    validation_steps=val_steps,
    callbacks=[checkpoint, lr_reducer],
)


In [None]:
#val_steps,train_steps

In [None]:
if False:
    # Rebuild the network and load the checkpointed weights.
    model_1 = mobile_net_model( pre_trained_model_trainable= True )
    model_1.load_weights("./model1.h5")
    #test_data_gen = data_generator( image_files = test_split_df["img_path"].values,  score = test_split_df["popularity_transformed"].values , test_data = True )
    # repeat=False: predict() consumes the dataset until it is exhausted, so a
    # repeating dataset would never let it return.
    test_data_gen   = get_training_dataset_tf_rec ( ["./test_300x300_tf_record.tfrec"],repeat = False, order = True , drop_remainder= False   )
    # The model has a single output unit; flatten (N, 1) predictions to (N,).
    # NOTE(review): this assumes the TFRecord holds exactly the rows of
    # test_split_df, in the same order -- verify before trusting the column.
    test_split_df["model_predict"] = model_1.predict( test_data_gen ).ravel()

In [None]:
#sns.scatterplot( data = test_split_df, x = "model_predict", y ="popularity_transformed")