In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import re
import math
import os
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.preprocessing import LabelEncoder
import cudf
import cuml
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from joblib import dump, load
import gc


# Number of folds; one TFRecord file is written per fold
FOLDS = 15
# Random seed for the shuffled stratified fold split
SEED = 123
# Target size handed to cv2.resize for every training image
IMAGE_SIZE = (512, 512)

In [None]:
# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Each entry is split on whitespace and turned into a set, so repeated
    ids inside one entry count once. Rows are paired by position.

    Args:
        y_true: pandas Series of ground-truth id strings.
        y_pred: pandas Series of predicted id strings.

    Returns:
        numpy array with one F1 value per row:
        ``2 * |true & pred| / (|pred| + |true|)``.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    # Size of the overlap for every (truth, prediction) pair
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    n_pred = pred_sets.apply(len).values
    n_true = true_sets.apply(len).values
    return 2 * overlap / (n_pred + n_true)

# Function to get our text title embeddings (we also use pca to reduce the dimension)
def get_text_embeddings(df_cu, max_features = 15000, n_components = 5000):
    """Embed the ``title`` column with binary TF-IDF followed by PCA.

    Both fitted models are dumped to the working directory
    (``TfidfVectorizer.joblib`` and ``PCA.joblib``) so they can be reused
    at inference time.

    Args:
        df_cu: frame exposing a ``title`` column (a cudf DataFrame at the
            call site in this file).
        max_features: forwarded to ``TfidfVectorizer``.
        n_components: forwarded to ``PCA``.

    Returns:
        The PCA-reduced embeddings after ``.get()`` (presumably a host
        numpy array -- confirm against the cuml/cupy versions in use).
    """
    vectorizer = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    dense_tfidf = vectorizer.fit_transform(df_cu['title']).toarray()
    # Persist the fitted vectorizer for inference
    dump(vectorizer, 'TfidfVectorizer.joblib')
    # Sanity check: the file just written must load back
    vectorizer = load('TfidfVectorizer.joblib')
    # Reduce dimensionality and persist the fitted PCA for inference
    reducer = PCA(n_components = n_components)
    text_embeddings = reducer.fit_transform(dense_tfidf).get()
    dump(reducer, 'PCA.joblib')
    print(f'Our title text embedding shape is {text_embeddings.shape}')
    # Drop the models and the dense TF-IDF matrix before collecting garbage
    del vectorizer, reducer, dense_tfidf
    gc.collect()
    return text_embeddings

# Function to read and preprocess our data
def preprocess():
    """Load the training csv and derive labels, ground truth and embeddings.

    Steps performed:
      * read ``train.csv`` (``test.csv`` is not needed here and is not read);
      * remap ``label_group`` to consecutive integers ``0..n_groups-1``;
      * add a ``matches`` column holding the space-separated posting ids
        that share the row's label group;
      * compute title embeddings through ``get_text_embeddings``;
      * add an ``f1`` column with the score obtained when each row predicts
        only its own posting id, and print the mean of that column.

    Returns:
        tuple: ``(train, text_embeddings)`` -- the enriched pandas
        DataFrame and whatever ``get_text_embeddings`` returns for it.
    """
    train = pd.read_csv('../input/shopee-product-matching/train.csv')
    # Encode label groups as consecutive integers; unique() is computed once
    label_groups = train['label_group'].unique()
    label_mapper = dict(zip(label_groups, np.arange(len(label_groups))))
    train['label_group'] = train['label_group'].map(label_mapper)
    # Get ground truth labels format: all posting ids of the row's group
    group_to_postings = train.groupby(['label_group'])['posting_id'].unique().to_dict()
    train['matches'] = train['label_group'].map(group_to_postings)
    train['matches'] = train['matches'].apply(lambda x: ' '.join(x))
    # Calculate title features with tfidf (on a cudf copy of the frame)
    train_cu = cudf.DataFrame(train)
    text_embeddings = get_text_embeddings(train_cu)
    # Calculate naive score using self-post
    train['f1'] = f1_score(train['matches'], train['posting_id'])
    score = train['f1'].mean()
    print(f'Using the same posting id as prediction our f1 score is {score}')
    return train, text_embeddings

# Build the labelled training frame and the title embeddings once
train, text_embeddings = preprocess()

# Put every row into exactly one of FOLDS validation folds, stratified on label_group
kfold = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train['label_group'])):
    # val_ind is used as index labels here; assumes train still has its
    # default 0..n-1 index so labels and positions coincide -- TODO confirm
    train.loc[val_ind, 'fold'] = fold
# Store the fold id as an integer column
train['fold'] = train['fold'].astype(int)

# Save train together with its fold assignment
train.to_csv('train_folds.csv', index = False)

def _bytes_feature(value):
    """Wrap a single string / bytes value in a ``tf.train.Feature`` (bytes_list).

    A value of the same type as ``tf.constant(0)`` is first converted with
    ``.numpy()``, since ``BytesList`` is fed the raw value.
    """
    tensor_type = type(tf.constant(0))
    if isinstance(value, tensor_type):
        value = value.numpy()
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a single float / double in a ``tf.train.Feature`` (float_list)."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` (int64_list)."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def serialize_array(array):
    """Convert ``array`` to a tensor and return its ``tf.io.serialize_tensor`` encoding."""
    return tf.io.serialize_tensor(tf.convert_to_tensor(array))

def serialize_example(posting_id, image, title, label_group, matches):
    """Pack one training row into a serialized ``tf.train.Example``.

    Args:
        posting_id: value stored as a bytes feature under ``posting_id``.
        image: value stored as a bytes feature under ``image``.
        title: value stored as a bytes feature under ``title``.
        label_group: value stored as an int64 feature under ``label_group``.
        matches: value stored as a bytes feature under ``matches``.

    Returns:
        The result of ``SerializeToString()`` on the assembled Example.
    """
    features = tf.train.Features(feature={
        'posting_id': _bytes_feature(posting_id),
        'image': _bytes_feature(image),
        'title': _bytes_feature(title),
        'label_group': _int64_feature(label_group),
        'matches': _bytes_feature(matches),
    })
    return tf.train.Example(features=features).SerializeToString()


# Write one TFRecord file per fold. Each record holds the re-encoded image,
# the serialized title embedding, the posting id, the label group and the
# ground-truth matches of one training row.
for fold in range(FOLDS):
    print('\n')
    print('-'*50)
    print(f'Writing TFRecord {fold} of {FOLDS - 1}...')
    train_ = train[train['fold'] == fold]
    # Get indices to slice our text features (index labels are used as row
    # positions of text_embeddings -- assumes train has a 0..n-1 index)
    text_embeddings_ = text_embeddings[train_.index]
    # File name carries the fold number and the number of records inside
    with tf.io.TFRecordWriter('train%.2i-%i.tfrec'%(fold, train_.shape[0])) as writer:
        for k in range(train_.shape[0]):
            row = train_.iloc[k]
            image_path = '../input/shopee-product-matching/train_images/' + row['image']
            image = cv2.imread(image_path)
            # cv2.imread signals a missing/unreadable file by returning None
            # instead of raising; fail here with the offending path rather
            # than with an opaque error inside cv2.resize.
            if image is None:
                raise FileNotFoundError(f'Could not read image: {image_path}')
            image = cv2.resize(image, IMAGE_SIZE)
            # Re-encode the resized image as JPEG bytes at quality 100
            image = cv2.imencode('.jpg', image, (cv2.IMWRITE_JPEG_QUALITY, 100))[1].tobytes()
            # Row k of the fold's embeddings, stored as a serialized float64 tensor
            title = text_embeddings_[k].astype(np.float64)
            title = serialize_array(title)
            posting_id = row['posting_id']
            label_group = row['label_group']
            matches = row['matches']
            example = serialize_example(str.encode(posting_id),
                                        image,
                                        title,
                                        label_group,
                                        str.encode(matches))
            writer.write(example)
            # Lightweight progress output every 100 records
            if k%100==0: print(k,', ',end='')


# Save csv
train.to_csv('train_folds.csv', index = False)