In [1]:
!pip install tensorflow-text



In [2]:
import csv
import numpy as np
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tqdm.auto import tqdm

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
class DataProcessor:

    def __init__(self) -> None:
        self.data = []
        self.labels = []
        self.label_encoding = {}
        self.dataset = None
        pass

    def load_data(self, fp:str) -> None:

        for root, subdirs, files in os.walk(fp):
            i = 0
            for file in tqdm(files):
                path = os.path.join(root, file)
                with open(path, 'r', encoding='utf8') as f:
                    reader = csv.DictReader(f)
                    for row in reader:
                        skills = row['skills'].strip('[]').replace("'", '').split(',')
                        text_label = file.split('.')[0]
                        if i not in self.label_encoding.keys():
                            self.label_encoding[i] = text_label
                        self.labels.extend([i]*len(skills))
                        self.data.extend(skills)
                    i += 1


    def process_data(self) -> tf.data.Dataset:
        self.dataset = tf.data.Dataset.from_tensor_slices((self.data, self.labels))
        self.data = None


    def check_tensors(self):
        lengths = {}
        for row in tqdm(self.data):
            lngth = len(row[0])
            if not lngth in lengths.keys():
                lengths[lngth] = 1
                continue
            else:
                lengths[lngth] += 1
        print(lengths)
        print(max(lengths.keys()))

    def test_train_split(self, train_split=0.8, val_split=0.1, test_split=0.1, shuffle=True):
        assert (train_split + test_split + val_split) == 1

        ds_size = len(self.labels)
        self.labels = None

        if shuffle:
            # Specify seed to always have the same split distribution between runs
            ds = self.dataset.shuffle(self.dataset.cardinality(), seed=12)
        else:
            ds = self.dataset
        train_size = int(train_split * ds_size)
        val_size = int(val_split * ds_size)

        train_ds = ds.take(train_size)
        val_ds = ds.skip(train_size).take(val_size)
        test_ds = ds.skip(train_size).skip(val_size)

        return train_ds, val_ds, test_ds

In [4]:
processor = DataProcessor()
processor.load_data('drive/MyDrive/Colab Notebooks/JobsData/data/')
processor.process_data()
train, val, test = processor.test_train_split()

  0%|          | 0/187 [00:00<?, ?it/s]

In [5]:
train = train.batch(64, drop_remainder=True).cache().prefetch(10000)
test = test.batch(64, drop_remainder=True).cache().prefetch(10000)
val = val.batch(64, drop_remainder=True).cache().prefetch(10000)

In [6]:
encoder = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", trainable=True)

model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(len(processor.label_encoding.keys()))
])

model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics = [tf.keras.metrics.SparseCategoricalAccuracy(),
                         tf.keras.metrics.SparseTopKCategoricalAccuracy(k=4)],
              run_eagerly=True
             )

In [7]:
history = model.fit(
    train,
    epochs=50,
    validation_data=val,
)

Epoch 1/50


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


  101/41998 [..............................] - ETA: 143:09:20 - loss: 5.2081 - sparse_categorical_accuracy: 0.0147 - sparse_top_k_categorical_accuracy: 0.0435

KeyboardInterrupt: ignored