In [422]:
# Mount Google Drive inside the Colab VM so that files under /content/drive
# (including the model directory referenced by use_model_path) are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [423]:
import os

# Switch tf.keras to the legacy Keras implementation (the training output
# below shows tf_keras objects); this is necessary for tensorflowjswizard.
# NOTE(review): presumably this has to be set before TensorFlow is imported
# to take effect, which is why it sits above the import cell - confirm.
os.environ["TF_USE_LEGACY_KERAS"] = "1"

# Import & Model load

In [424]:
# Directory on the mounted Drive that holds the Universal Sentence Encoder Lite
# model. It is passed to hub.load, and its assets/ sub-folder provides the
# SentencePiece tokenizer model.
use_model_path = '/content/drive/MyDrive/ndev-task-tracker/universal-sentence-encoder-tensorflow1-lite-v2'

In [425]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text  # required for USE Lite
import numpy as np
import sentencepiece as spm
from sklearn.model_selection import train_test_split

In [426]:
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.18.0


In [427]:

# 1. Load the Universal Sentence Encoder Lite from the Drive directory
embed = hub.load(use_model_path)

# 2. Load the SentencePiece tokenizer model directly from the model's assets
#    folder. sp.load(...) is the last expression of the cell, so its return
#    value is what shows up as the cell output.
sp = spm.SentencePieceProcessor()
sp.load(f"{use_model_path}/assets/universal_encoder_8k_spm.model")

True

In [428]:
# Callable "default" signature of the loaded encoder. It is invoked later with
# the values / indices / dense_shape keyword arguments built by to_sparse.
embed_fn = embed.signatures["default"]

# Data Preparation

In [429]:
# USE Lite is designed to be smaller and mobile/web-friendly, so it does not
# contain its own tokenizer: text must be tokenized with SentencePiece first.
# The embed_fn signature expects the token ids in sparse form (values, indices,
# dense_shape); a dense tensor or a plain list of ids would likely be rejected.

def to_sparse(sentences):
    """Tokenize sentences and pack the token ids as sparse-tensor components.

    Args:
        sentences: Iterable of strings (list, tuple, NumPy array of str, ...).
            A single bare string is treated as a one-sentence batch instead of
            being iterated character by character.

    Returns:
        dict of int64 tensors, ready to be splatted into ``embed_fn(**...)``:
            "values":      flattened token ids of all sentences,
            "indices":     one [sentence_row, token_position] pair per token,
                           always of shape (n_tokens, 2),
            "dense_shape": [number of sentences, longest token sequence].
    """
    if isinstance(sentences, str):
        sentences = [sentences]

    # Encode sentences to lists of token ids
    ids = [sp.encode(s) for s in sentences]

    # Create values and indices for the sparse representation
    values = [token for sent in ids for token in sent]
    indices = [[row, col] for row, sent in enumerate(ids) for col in range(len(sent))]
    # default=0 keeps an empty batch from raising ValueError inside max()
    dense_shape = [len(ids), max((len(sent) for sent in ids), default=0)]

    # Convert to required tensors
    return {
        "values": tf.constant(values, dtype=tf.int64),
        # reshape(-1, 2) keeps the indices rank-2 (shape (0, 2)) even when
        # there are no tokens at all; a bare empty list would become shape (0,)
        "indices": tf.constant(np.asarray(indices, dtype=np.int64).reshape(-1, 2)),
        "dense_shape": tf.constant(dense_shape, dtype=tf.int64),
    }


In [None]:
# Hand-written example phrases, one Python list per class. The integer label
# of each list is assigned where raw_data is built: valid_title=0,
# background_task=1, meetings=2, general_tasks=3, general_activities=4,
# project_tasks=5. Note the lists are declared here in a different order.
valid_title = [
    "Project Sprint 1",
    "Project Sprint 5",
    "Project Sprint 1.1",
    "Project Sprint 5.1",
    "TaskHive Dev",
    "Ndev task tracker",
    "Spenicle v1",
    "Zenventory",
    "FlowNest"
]

meetings = [
    "planning sprint",
    "designing system architecture",
    "project discussion",
    "soda",
    "1:1 with manager",
    "attending a meeting",
    "developer catchup",
    "daily standup",
    "internal meeting",
    "english class",
    "running daily standup",
    "meeting with mr colleague",
    "meet with mrs jane"
]

background_task = [
    "setting up CI/CD",
    "configuring docker",
    "writing API spec",
    "deploying app",
    "refactoring code",
    "writing documentation",
    "resolving merge conflicts",
    "pushing to GitHub",
    "committing changes",
    "optimizing database",
    "managing backlog",
]

general_tasks = [
    "fixing bugs",
    "reviewing PRs",
    "reviewing code",
    "fixing linter issues",
    "updating documentation",
    "pair programming",
    "resolving bugs",
    "preparing presentation",
    "presenting updates",
    "discussing roadmap",
    "collaborating on design",
    "writing proposal",
]

general_activities = [
    "watching a movie",
    "playing games",
    "eating out",
    "scrolling TikTok",
    "cooking dinner",
    "taking a nap",
    "chatting with friends",
    "binge-watching Netflix",
    "reading a novel",
    "doing laundry",
    "shopping online",
    "going to the mall",
    "napping",
    "watching YouTube",
    "cleaning room",
    "going for a walk",
    "checking social media",
    "ordering food",
    "scrolling Instagram",
    "taking a break",
]

project_tasks = [
    "writing code",
    "writing unit tests",
    "debugging memory leak",
    "benchmarking app",
]


In [431]:
# Single source of truth for the label space: the insertion order of this
# mapping defines the integer label of every class, so class_names and the
# labels stored in raw_data cannot drift apart when a class is added or moved.
samples_by_class = {
    "valid_title": valid_title,
    "background_task": background_task,
    "meetings": meetings,
    "general_tasks": general_tasks,
    "general_activities": general_activities,
    "project_tasks": project_tasks,
}

# for evaluation: class_names[label] gives the readable class name
class_names = list(samples_by_class)

# Combine all with labels: a list of (phrase, integer label) pairs, grouped by
# class in label order
raw_data = [
    (item, label_id)
    for label_id, name in enumerate(class_names)
    for item in samples_by_class[name]
]

print('You have ', len(raw_data), 'data points')

You have  67 data points


## Preprocessing

In [432]:
# Lower-case every phrase and split the (phrase, label) pairs into two
# parallel NumPy arrays.
sentences = np.array([text.lower() for text, _ in raw_data])
labels = np.array([class_id for _, class_id in raw_data])

# Display the first few elements to verify
print(f"First 5 sentences: {sentences[:5]}")
print(f"First 5 labels: {labels[:5]}")

First 5 sentences: ['project sprint 1' 'project sprint 5' 'project sprint 1.1'
 'project sprint 5.1' 'reapit dev']
First 5 labels: [0 0 0 0 0]


In [433]:
# Tokenize all training sentences and run them through the encoder in a single
# batch; the "default" entry of the signature's output is kept as the
# embeddings tensor.
sparse_input = to_sparse(sentences)
embeddings = embed_fn(**sparse_input)['default']

## Data splitting

In [None]:
# Hold out 9% of the samples for validation. The split is stratified on the
# labels and uses a fixed random_state so it is the same on every run.
features = embeddings.numpy()
x_train, x_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.09,
    stratify=labels,
    random_state=5,
)

# No separate test split for now, because the data is small.

print(len(x_train), len(x_val))

60 7


## Production setup (train on all data)

In [435]:
# Production setup: fit the final model on every labelled example.
# The training set is rebuilt from the full embedding matrix instead of
# appending x_val to x_train, so re-running this cell is idempotent and never
# duplicates the validation rows.
# WARNING: from here on x_val is part of the training data, so any validation
# metric computed after this cell is not a held-out estimate. Skip this cell
# when an honest validation score is needed.
x_train = embeddings.numpy()
y_train = labels
len(x_train)

67

# Training

In [436]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable across "Restart & Run All".
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Small classification head on top of the frozen sentence embeddings.
model = tf.keras.Sequential([
    # One input feature per embedding dimension (512 for this encoder)
    tf.keras.layers.Input(shape=(x_train.shape[1],)),
    tf.keras.layers.Dense(50, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    # One softmax output per class, kept in sync with class_names
    tf.keras.layers.Dense(len(class_names), activation="softmax"),
])

# Labels are plain integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.fit(x_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tf_keras.src.callbacks.History at 0x7d8118d12950>

## Evaluation

In [437]:

# Evaluate on the validation rows.
# NOTE(review): the "prod setup" cell merges x_val / y_val into the training
# data before model.fit runs, so this score is measured on examples the model
# has already been trained on. It is not a held-out estimate.
loss, acc = model.evaluate(x_val, y_val, verbose=0)
print(f"Val Accuracy: {acc:.2%}")


Val Accuracy: 100.00%


In [438]:
# Smoke-test the classifier on a few unseen phrases.
new_sentences = ["writing code", "eating a little pizza for a minute and wrote a code", "catchup with mr x"]
# Apply the same lower-casing that was applied to the training sentences, so
# inference sees text normalised exactly like the training data.
new_input = to_sparse([sentence.lower() for sentence in new_sentences])
new_embeddings = embed_fn(**new_input)['default']

# predictions has one row per sentence and one softmax probability per entry
# of class_names.
predictions = model.predict(new_embeddings)
# Highest class probability per sentence
confidences = predictions.max(axis=1)
print(confidences)
# Index of the most probable class, mapped back to its readable name
predicted_labels = predictions.argmax(axis=1)
predicted_class_names = [class_names[label] for label in predicted_labels]
print(predicted_class_names)


[0.2849344  0.52572596 0.43856803]
['general_tasks', 'general_activities', 'meetings']


# Save Model

In [382]:
# Write the trained model to saved_model.h5. The path is relative, so the file
# lands in the current working directory rather than on the mounted Drive.
# NOTE(review): presumably the .h5 (HDF5) format is kept for the TensorFlow.js
# conversion mentioned at the top of the notebook - confirm.
model.save('saved_model.h5')

  saving_api.save_model(
