In [3]:
import numpy as np
import os
import joblib
import torch
from sklearn import preprocessing
from word2vec_util import TextCorpusProcess
from skip_gram_neg import SkipGramNeg
import time
import copy
from torch import Tensor, nn


In [1]:
# text-data directory when running locally: <working directory>/text/
cwd_local = os.getcwd()
dir = cwd_local + '/text/'

In [None]:
# text-data directory when running on Colab (path under drive/MyDrive)
# NOTE: this assigns the same `dir` variable as the local cell above, so
# run only the cell that matches the current environment.
cwd_colab = os.getcwd()
dir = cwd_colab + '/drive/MyDrive/fb_marketplace/data/text/'

In [4]:

# locate and load the pickled train / val text labels
train_pklname = dir + 'train_txt_labels.pkl'
val_pklname = dir + 'val_txt_labels.pkl'
train_txt_labels = joblib.load(train_pklname)
val_txt_labels = joblib.load(val_pklname)

# reshape both label lists into single-column 2-D arrays for the encoder
label_train = np.array(train_txt_labels).reshape(-1, 1)
label_val = np.array(val_txt_labels).reshape(-1, 1)

# fit the ordinal encoder on the train labels only, then encode both
# splits with it and flatten the results back to 1-D target vectors
target_enc = preprocessing.OrdinalEncoder()
target_train = target_enc.fit(label_train).transform(label_train).reshape(-1)
target_val = target_enc.transform(label_val).reshape(-1)

In [4]:
# read the pickled, already-processed text corpus back from the data directory
text_file = 'text_processed.pkl'
text_path = dir + text_file
textCorpus = joblib.load(text_path)

# word-index lists stored on the corpus object; used below as the train descriptions
train_descs_idx = textCorpus.prod_word_idx_lists

In [5]:
# pull the validation descriptions out of the prepared image data and cache
# them in the text directory so they can be loaded on their own afterwards
data_path = os.getcwd() + '/data/images/' + 'img_prepared'
val_pklname = data_path + '_val.pkl'
val_data = joblib.load(val_pklname)
val_text_descs = val_data['desc']

# kept as the last bare expression so the cell shows the written file name(s)
val_descs_out = dir + 'val_text_descs.plk'
joblib.dump(val_text_descs, val_descs_out)

['/home/shbz/Dropbox/Workspace/Aicore/VSworkspace/fb_marketplace_reco/text/val_text_descs.plk']

In [5]:
# reload the cached validation descriptions from the text directory
val_descs_path = dir + 'val_text_descs.plk'
val_text_descs = joblib.load(val_descs_path)

In [6]:
# load the pre-trained word2vec (skip-gram, negative sampling) model

# hyperparams
# load_state_dict below needs parameter shapes that match the checkpoint, so
# these presumably have to equal the values used when the model was trained.
EMBED_DIM = 128
BATCH_SIZE = 16
NEG_SAMPLE_SIZE = 5

# instantiating the model (on CPU; it is never moved to another device here)
model = SkipGramNeg(
    vocab_size=textCorpus.vocab_size,
    embed_dim=EMBED_DIM,
    neg_sample_size=NEG_SAMPLE_SIZE,
    batch_size=BATCH_SIZE,
)

# load saved word2vec weights
current_path = os.getcwd()
save_dir_path = current_path + '/save/'
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on a
# machine without CUDA; the model itself lives on the CPU, so nothing is lost.
model.load_state_dict(torch.load(
    save_dir_path + 'best_word2vec_ebedding_model.pt', map_location='cpu'))

# input-side embedding layer of the skip-gram model, used to embed descriptions
embeddings = model.in_embed

In [7]:
def _embed_descriptions(descs_idx, embedding_layer):
    """Turn each list of word indices into a (num_words, embed_dim) tensor.

    Args:
        descs_idx: iterable of per-description word-index sequences.
        embedding_layer: callable mapping a 1-D LongTensor of indices to one
            embedding row per index (presumably an ``nn.Embedding``).

    Returns:
        list of 2-D tensors, one per description, detached from autograd.
    """
    embedded = []
    # no_grad: the embeddings are used as fixed features. Tensors that carry
    # an autograd graph cannot be handed back by DataLoader worker processes,
    # and backpropagating through the same stored graph every epoch would fail.
    with torch.no_grad():
        for desc_idxs in descs_idx:
            # 1-D index tensor -> output is always (num_words, embed_dim),
            # also for one-word descriptions (a blanket squeeze() would
            # collapse those to 1-D and break the padding in the Dataset).
            words_idx = torch.as_tensor(
                np.asarray(desc_idxs), dtype=torch.long).reshape(-1)
            embedded.append(embedding_layer(words_idx))
    return embedded


# prepare embedded train data
train_words_emb = _embed_descriptions(train_descs_idx, embeddings)

# prepare embedded val data: map every word to its vocabulary index and
# silently drop words the corpus vocabulary does not know
# (assumes each description is already a sequence of words -- TODO confirm)
val_descs_idx = []
for desc in val_text_descs:
    looked_up = (textCorpus.word_to_idx.get(word) for word in desc)
    desc_words = [idx for idx in looked_up if idx is not None]
    if not desc_words:
        # dropping the sample instead would misalign val_descs_idx and target_val
        raise ValueError('Empty description!')
    val_descs_idx.append(desc_words)

val_words_emb = _embed_descriptions(val_descs_idx, embeddings)

In [8]:
# Dataset that yields each description as a sequence of word embeddings, zero-padded to max_word_len

class TextEmbedDataset(torch.utils.data.Dataset):
    """Map-style dataset of embedded descriptions padded to a fixed length.

    Each item is a ``(words_padded, label)`` pair where ``words_padded`` has
    shape ``(max_word_len, embed_dim)``: the description's word embeddings
    with zero rows added on both sides.

    Args:
        inputs: indexable collection of 2-D tensors ``(num_words, embed_dim)``.
        labels: indexable collection of ordinal-encoded class labels, aligned
            with ``inputs``.
        max_word_len: number of rows every returned item is padded to.
    """

    def __init__(self, inputs, labels, max_word_len):
        # expects word embeddings
        self.inputs = inputs
        # expects ordinal encoded classes
        self.labels = labels
        self.max_word_len = max_word_len

    def __getitem__(self, index):
        """Return the zero-padded embeddings and the label at ``index``.

        Raises:
            AssertionError: if the description is longer than ``max_word_len``.
        """
        words = self.inputs[index]
        pad_len = self.max_word_len - len(words)
        assert pad_len >= 0, (
            f'description {index} has {len(words)} words, '
            f'more than max_word_len={self.max_word_len}')
        # split the padding between both sides; the left side gets the extra
        # row when pad_len is odd
        pad_len_r = pad_len // 2
        pad_len_l = pad_len - pad_len_r
        embed_dim = words.shape[1]
        # new_zeros keeps the padding on the same dtype/device as the data
        pad_l = words.new_zeros(pad_len_l, embed_dim)
        pad_r = words.new_zeros(pad_len_r, embed_dim)
        words_padded = torch.cat([pad_l, words, pad_r], dim=0)
        return words_padded, self.labels[index]

    def __len__(self):
        # number of samples (the attribute is `labels`, not `label`)
        return len(self.labels)


In [None]:

# find max description word length
max_train_desc_len = max(map(len, train_descs_idx))
max_val_desc_len = max(map(len, val_descs_idx))
# every val description must fit into the padded length taken from train
assert max_train_desc_len >= max_val_desc_len, (
    f'val description of {max_val_desc_len} words exceeds '
    f'train maximum of {max_train_desc_len}')

# Pad BOTH splits to the train maximum so train and val samples have the
# same sequence length (padding val to its own maximum would hand the model
# differently sized inputs in the two phases).
trainEmbedDataset = TextEmbedDataset(
    train_words_emb, target_train, max_train_desc_len)
valEmbedDataset = TextEmbedDataset(
    val_words_emb, target_val, max_train_desc_len)

text_datasets = {'train': trainEmbedDataset, 'val': valEmbedDataset}

# batch size for the classifier (replaces the value set in the word2vec cell)
BATCH_SIZE = 32

# create a dict of train and val dataloaders
dataloaders = {x: torch.utils.data.DataLoader(text_datasets[x], batch_size=BATCH_SIZE,
                                              shuffle=True, num_workers=4)
               for x in ['train', 'val']}

# sample counts per split, used to average loss / accuracy per epoch
dataset_sizes = {x: len(text_datasets[x]) for x in ['train', 'val']}

# train on the first GPU when available, otherwise on the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
def train_model(model, criterion, optimizer, scheduler, writer, num_epochs=50, save_path=None):
    """Train ``model`` and return it with the best validation weights loaded.

    Every epoch runs a training pass followed by a validation pass. Loss and
    accuracy of both passes are printed and logged to ``writer``. Whenever
    the validation accuracy improves, the weights are remembered (and written
    to disk if ``save_path`` is given).

    Reads the notebook-level globals ``dataloaders``, ``dataset_sizes`` and
    ``device``.

    Args:
        model: network to train; called as ``model(inputs)``.
        criterion: loss callable ``criterion(outputs, labels)``.
        optimizer: optimizer over the model parameters.
        scheduler: learning-rate scheduler, stepped once per epoch after the
            training pass.
        writer: object with ``add_scalar(tag, value, step)`` and ``flush()``
            (e.g. a TensorBoard ``SummaryWriter``).
        num_epochs: number of epochs to run.
        save_path: prefix that ``'best_model.pt'`` is appended to when saving
            the best weights; ``None`` (default) disables saving.

    Returns:
        ``model`` with the best-validation-accuracy weights loaded.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            # train(True) == train mode, train(False) == eval mode
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                # the ordinal-encoded targets arrive as floats; the loss
                # needs integer class indices
                labels = labels.long().to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; track history only in the training phase
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # statistics, kept as plain Python numbers so they work with
                # an empty loader and never pin device memory
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels).item()
            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            writer.add_scalar(f'Loss/{phase}', epoch_loss, epoch)
            writer.add_scalar(f'Accuracy/{phase}', epoch_acc, epoch)

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                # only touch the disk when a destination was given; with the
                # default save_path=None the string concatenation would raise
                if save_path is not None:
                    torch.save(best_model_wts, save_path + 'best_model.pt')

        print()
        writer.flush()

    time_elapsed = time.time() - since
    print(
        f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:.4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model


In [None]:
class TextClassification(nn.Module):
    """1-D convolutional text classifier (work in progress).

    Only the convolutional front end is defined so far; no ``forward`` is
    implemented yet.

    Args:
        word_embd_dim: size of each word embedding, used as the number of
            input channels of the first convolution.
    """

    def __init__(self, word_embd_dim):
        super().__init__()
        self.word_embd_dim = word_embd_dim

        # Conv1d treats the embedding dimension as the channel axis, i.e. it
        # expects input shaped (batch, word_embd_dim, seq_len).
        # TODO: TextEmbedDataset yields (seq_len, embed_dim) items, so the
        # batch has to be transposed before it reaches these layers.
        self.cnn_layers = nn.Sequential(
            nn.Conv1d(in_channels=self.word_embd_dim,
                      out_channels=4,
                      kernel_size=2,
                      stride=1,
                      padding=1),
        )
            


In [None]:
# word-embedding size; same value as the EMBED_DIM set in the word2vec loading cell
EMBED_DIM = 128

class Net(nn.Module):
    """Small 2-D CNN: two conv/batch-norm/ReLU/max-pool stages and a linear head.

    The linear layer expects ``4 * 7 * 7`` features, i.e. a single-channel
    input whose spatial size is 7x7 after the two 2x2 poolings (for example
    28x28). The output has 10 units per sample.
    """

    def __init__(self):
        super().__init__()

        # layer classes are taken from `nn`, the only torch namespace this
        # notebook imports (the bare names were never imported)
        self.cnn_layers = nn.Sequential(
            # Defining a 2D convolution layer
            nn.Conv2d(1, 4, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Defining another 2D convolution layer
            nn.Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.linear_layers = nn.Sequential(
            nn.Linear(4 * 7 * 7, 10)
        )

    # Defining the forward pass
    def forward(self, x):
        """Return the 10 output scores for a batch ``x`` of shape (N, 1, H, W)."""
        x = self.cnn_layers(x)
        # flatten everything except the batch dimension
        x = x.flatten(1)
        x = self.linear_layers(x)
        return x


class NaturalSceneClassification(ImageClassificationBase):
    """CNN of three double-convolution stages followed by a three-layer MLP.

    Each stage is conv -> ReLU -> conv -> ReLU -> 2x2 max-pool with 3x3
    kernels and padding 1. The first linear layer expects 82944 features
    (= 256 * 18 * 18), so a 3-channel input must pool down to 18x18 after the
    three stages. The last layer has 6 output units.
    """

    def __init__(self):
        super().__init__()

        # (input, intermediate, output) channel counts of the three stages
        stage_channels = [(3, 32, 64), (64, 128, 128), (128, 256, 256)]

        layers = []
        for c_in, c_mid, c_out in stage_channels:
            layers += [
                nn.Conv2d(c_in, c_mid, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.Conv2d(c_mid, c_out, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2, 2),
            ]

        # classifier head on the flattened feature map
        layers += [
            nn.Flatten(),
            nn.Linear(82944, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 6),
        ]

        self.network = nn.Sequential(*layers)

    def forward(self, xb):
        """Run the batch ``xb`` through the whole network."""
        out = self.network(xb)
        return out
