In [4]:
from numpy import *
import DataParser
# from joblib.numpy_pickle_utils import xrange
import numpy as np
import pickle


# from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

from sklearn.preprocessing        import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection      import train_test_split
from tensorflow.keras             import Sequential
from tensorflow.keras.layers      import Dense, Dropout, InputLayer
from tensorflow.keras.utils       import to_categorical
import tensorflow as tf
import re


from tensorflow.keras.models import load_model

import matplotlib.pyplot as plt


# ANN Training Phase

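Load the dictionary that maps each business name to its categories, then multi-hot encode the categories (one name can carry several categories, so each row may have more than one 1).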

In [5]:
# Fetch the name -> categories mapping from the project's DataParser module.
data_dict = DataParser.get_input_data()

# Split the mapping into two parallel lists. Both follow the mapping's
# iteration order, so names[i] corresponds to categories[i].
names      = [key for key in data_dict.keys()]
categories = [value for value in data_dict.values()]


KeyboardInterrupt: 

In [None]:
# Ask the project's DataParser module for the collection of distinct category labels.
unq_categories = DataParser.get_unique_categories()
print(f"Found {len(unq_categories)} distinct categories")  # sanity check: the author expected roughly 1311


KeyboardInterrupt: 

In [None]:
# Freeze the label order up front: the binarizer is given an explicit,
# sorted class list, so the label columns follow sorted_cats rather than
# the order in which categories happen to appear in the data.
sorted_cats = list(unq_categories)
sorted_cats.sort()
mlb = MultiLabelBinarizer(classes=sorted_cats)

def split_cats(entry):
    """Split a comma-separated category string into a list of clean labels.

    Parameters
    ----------
    entry : object
        Raw category field. Only ``str`` values are parsed.

    Returns
    -------
    list of str
        The comma-separated pieces with surrounding whitespace removed and
        empty pieces dropped. Any non-string ``entry`` yields ``[]``.
    """
    if isinstance(entry, str):
        stripped = (piece.strip() for piece in entry.split(','))
        return [piece for piece in stripped if piece]
    return []

# Parse every raw category entry into a label list, then binarize:
# one row per entry in `categories`, one column per class given to `mlb`.
Y = mlb.fit_transform(list(map(split_cats, categories)))



In [None]:
# vectorizer = TfidfVectorizer(
#     analyzer='char_wb',    # good for short text
#     ngram_range=(2,4),
#     max_features=5000
# )

# X = vectorizer.fit_transform(names)  # scipy sparse matrix, shape = (150k, 5000)


New Transformer:
Instead of TF-IDF, I'm using Sentence Transformers (SBERT) to vectorize the names. Ideally, this should work better than character n-grams.
"all-MiniLM-L6-v2" is small and fast, and produces 384-dim embeddings.


In [None]:
# Pretrained sentence-embedding model, loaded by its model name.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every name in one call; the flags request a NumPy array as output
# and a progress bar while encoding.
X = embedder.encode(
    names,
    convert_to_numpy=True,
    show_progress_bar=True,
)

In [None]:
# Hold out 20% of the samples as the final test set; the fixed seed makes
# the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

In [None]:
# Feed-forward multi-label classifier over the name feature vectors.
#
# The features come from embedder.encode(..., convert_to_numpy=True), i.e. a
# dense NumPy array, so the input layer is declared as a regular dense input.
# `sparse=True` was a leftover from the TF-IDF (SciPy sparse) pipeline and
# does not describe these features; do not re-add it.
model = Sequential([
    InputLayer(input_shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64,  activation='relu'),
    Dropout(0.3),
    # One sigmoid unit per label column of Y: each category is scored
    # independently, so several categories can fire for the same name.
    Dense(Y.shape[1], activation='sigmoid')
])

# model = load_model("business_ann_model.h5")

# with open("vectorizer.pkl", "rb") as f:
#     vectorizer = pickle.load(f)

# with open("mlb.pkl", "rb") as f:
#     mlb = pickle.load(f)
    
# sorted_cats = mlb.classes_.tolist()



In [None]:
def to_tf_sparse(csr):
    """Convert a SciPy sparse matrix into a reordered ``tf.sparse.SparseTensor``.

    Parameters
    ----------
    csr : object exposing ``.tocoo()``
        The sparse matrix to convert. Dense NumPy arrays are not accepted,
        because they have no ``tocoo`` method.

    Returns
    -------
    tf.sparse.SparseTensor
        Tensor with the same shape as ``csr``, values cast to float32, passed
        through ``tf.sparse.reorder`` before being returned.
    """
    as_coo = csr.tocoo()
    # One (row, col) coordinate pair per stored entry.
    coords = np.stack((as_coo.row, as_coo.col), axis=1)
    values = as_coo.data.astype(np.float32)
    sparse = tf.sparse.SparseTensor(coords, values, as_coo.shape)
    return tf.sparse.reorder(sparse)

def predict_business(name, threshold=0.5):
    """Predict the categories of a single business name.

    The name is featurised with the module-level ``embedder`` (the same
    sentence-embedding model that produced the training features ``X``) and
    the dense embedding is passed straight to ``model``. The previous
    implementation called ``vectorizer.transform`` and converted the result
    to a sparse tensor, but no ``vectorizer`` is created any more, so it
    raised ``NameError``.

    Parameters
    ----------
    name : str
        Business name to classify.
    threshold : float, default 0.5
        Minimum predicted probability for a category to be reported.

    Returns
    -------
    hits : list
        Entries of ``sorted_cats`` whose probability is >= ``threshold``.
    probs : array-like
        The model's output row for this name, aligned with ``sorted_cats``.
    """
    # 1) Embed the name; encode() takes a batch, so wrap it in a 1-item list.
    features = embedder.encode([name], convert_to_numpy=True)
    # 2) Predict and keep the single output row.
    probs = model.predict(features)[0]
    # 3) Threshold
    hits = [cat for cat, p in zip(sorted_cats, probs) if p >= threshold]
    return hits, probs

In [None]:
# Binary cross-entropy treats every output unit as its own yes/no decision,
# which matches the per-category sigmoid output layer.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['binary_accuracy'],
)


In [None]:
# Carve a validation set (10% of the training data) out of the training
# split; the fixed seed keeps it reproducible.
X_train_sub, X_val, Y_train_sub, Y_val = train_test_split(
    X_train, Y_train, test_size=0.1, random_state=42
)

# The features are dense NumPy arrays (output of embedder.encode with
# convert_to_numpy=True), so they are passed to Keras as-is. to_tf_sparse()
# must not be used here: it calls .tocoo(), which only SciPy sparse matrices
# provide, and raises AttributeError on an ndarray.
history = model.fit(
    X_train_sub,
    Y_train_sub,
    validation_data=(X_val, Y_val),
    epochs=10,
    batch_size=256
)



In [None]:
# Overlay the per-epoch training and validation loss recorded by model.fit.
for history_key, legend_text in (('loss', 'train loss'), ('val_loss', 'val loss')):
    plt.plot(history.history[history_key], label=legend_text)

plt.ylabel('Binary cross-entropy')
plt.xlabel('Epoch')
plt.legend()
plt.show()

In [None]:
# Final evaluation on the held-out test split. X_test is a dense NumPy array,
# so it goes to Keras directly (to_tf_sparse() only accepts SciPy sparse input).
loss, acc = model.evaluate(X_test, Y_test, verbose=0)
print(f"Test accuracy: {acc:.3f}")

model.save("business_ann_model.h5")

# Persist the fitted label binarizer so predicted columns can be mapped back
# to category names later.
# No vectorizer.pkl is written: names are featurised by the pretrained
# sentence embedder, which is re-created by model name rather than fitted
# here, and no `vectorizer` object exists to pickle (dumping it raised
# NameError before mlb.pkl could be written).
with open("mlb.pkl", "wb") as f:
    pickle.dump(mlb, f)


Sample

This model predicts the following kinds of businesses really well:
- Chain restaurants, pizza, or food (in general)
- Home improvement stores (Home Depot, Ace Hardware, ~Best Buy)

In [None]:
# Reload the saved artifacts so the sample predictions can run without
# retraining.
model = load_model("business_ann_model.h5")

# NOTE(security): pickle.load can execute arbitrary code contained in the
# file. Only load an mlb.pkl that this notebook wrote itself.
with open("mlb.pkl", "rb") as f:
    mlb = pickle.load(f)

# Names are featurised with the pretrained sentence embedder, which is
# re-created by model name. There is no fitted vectorizer to unpickle: the
# SBERT pipeline never produces a usable vectorizer.pkl.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

sorted_cats = mlb.classes_.tolist()   # the same order you trained with


In [None]:
# Smoke-test the classifier on one business name with a 0.5 probability cut-off.
cats, probs = predict_business(name="Verve Coffee Roasters", threshold=0.5)
print("Predicted categories:", cats)