# Inference Models

## Import Required Libraries

In [1]:
import os
import pickle
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence

## Load Models

In [2]:
# Locations of the serialized artifacts: the two classical models and their
# feature extractors first, then the CNN bundle (network, tokenizer, encoder).
svm_model_path = "artifacts/svm_model.pkl"
rf_model_path = "artifacts/random_forest.pkl"
tfidf_path = "artifacts/tfidf_vectorizer.pkl"
w2v_path = "artifacts/word2vec_model.pkl"
cnn_model_path = "artifacts/cnn/cnn_model.keras"
tokenizer_path = "artifacts/cnn/tokenizer.pkl"
label_encoder_path = "artifacts/cnn/label_encoder.pkl"


def _read_pickle(path):
    """Deserialize and return the object stored in the pickle file at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE: joblib and pickle execute arbitrary code while deserializing, so these
# files must come from a trusted source.
svm_model = joblib.load(svm_model_path)
rf_model = joblib.load(rf_model_path)
tfidf = joblib.load(tfidf_path)
w2v_model = joblib.load(w2v_path)

cnn_model = tf.keras.models.load_model(cnn_model_path)
tokenizer = _read_pickle(tokenizer_path)
le = _read_pickle(label_encoder_path)

# Sequence length fed to the CNN: taken from the second entry of the model's
# input shape, falling back to 100 when that entry is missing/falsy (e.g. None)
# or when the shape cannot be read at all.
try:
    maxlen = cnn_model.input_shape[1] or 100
except Exception:
    maxlen = 100

# The loaded word2vec object may expose its vectors through a `.wv` attribute
# or be the vector container itself; remember which form it is.
has_wv = hasattr(w2v_model, "wv")

# Embedding dimensionality, read from whichever object carries the vectors.
# Falls back to 300 when `vector_size` cannot be read.
try:
    vec_size = (w2v_model.wv if has_wv else w2v_model).vector_size
except Exception:
    vec_size = 300

## Inference Process

In [3]:
def text_to_avg_w2v(texts):
    """Embed each text as the mean of its in-vocabulary word vectors.

    Every text is split with ``text_to_word_sequence``; tokens that are found
    in the word2vec vocabulary are looked up and averaged.

    Args:
        texts: Sequence of strings to embed.

    Returns:
        A ``float32`` array of shape ``(len(texts), vec_size)``. A row is left
        as all zeros when its text yields no tokens or none of its tokens are
        in the vocabulary.
    """
    # Vectors live either under `.wv` or directly on the loaded object.
    lookup = w2v_model.wv if has_wv else w2v_model

    features = np.zeros((len(texts), vec_size), dtype="float32")
    for row, text in enumerate(texts):
        found = [lookup[word] for word in text_to_word_sequence(text) if word in lookup]
        if found:
            features[row] = np.mean(found, axis=0)
    return features

def model_predict_with_confidence(model, X):
    """Predict with *model* and attach a per-sample confidence score.

    The strategy depends on what the model exposes, checked in this order:

    1. ``predict_proba``: prediction is the arg-max column index, confidence
       is the maximum probability, and the full matrix is returned as well.
    2. ``decision_function``: for a single score per sample the prediction is
       ``score >= 0`` (as 0/1) and the confidence is ``sigmoid(|score|)``;
       for several scores per sample a soft-max over the scores is used.
    3. ``predict`` only: confidence is filled with NaN.

    Args:
        model: Estimator-like object exposing at least one of the methods above.
        X: Feature matrix, or ``None``.

    Returns:
        Tuple ``(preds, conf, proba)``. ``proba`` is only non-``None`` on the
        ``predict_proba`` path. All three are ``None`` when *X* is ``None``.
    """
    if X is None:
        return None, None, None

    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X)
        return probs.argmax(axis=1), probs.max(axis=1), probs

    if not hasattr(model, "decision_function"):
        # Nothing to derive a confidence from: report NaN for every sample.
        labels = model.predict(X)
        return labels, np.full(len(labels), np.nan), None

    scores = model.decision_function(X)
    rank = np.ndim(scores)

    if rank == 1 or (rank == 2 and scores.shape[1] == 1):
        # One signed score per sample: the sign picks the class, the
        # magnitude is squashed through a sigmoid to land in [0.5, 1).
        margins = np.ravel(scores)
        labels = (margins >= 0).astype(int)
        certainty = 1.0 / (1.0 + np.exp(-np.abs(margins)))
        return labels, certainty, None

    # One score per class: soft-max, shifted by the row maximum so the
    # exponentials cannot overflow.
    scores = np.asarray(scores)
    exps = np.exp(scores - scores.max(axis=1, keepdims=True))
    softmax = exps / exps.sum(axis=1, keepdims=True)
    return softmax.argmax(axis=1), softmax.max(axis=1), None

def choose_feature_space_and_predict(model, texts, X_tfidf, X_w2v):
    """Run *model* on whichever feature matrix it appears to expect.

    TF-IDF features are tried before averaged word2vec features. In the first
    pass a matrix is only used when the model has no ``n_features_in_``
    attribute or the matrix width equals it. If that pass produces nothing,
    both matrices are tried again without the width check.

    Args:
        model: Estimator passed on to ``model_predict_with_confidence``.
        texts: Raw input texts (not used by this function).
        X_tfidf: TF-IDF feature matrix, or ``None``.
        X_w2v: Averaged word2vec feature matrix, or ``None``.

    Returns:
        Tuple ``(preds, conf, feature_name)`` where ``feature_name`` is
        ``"tfidf"`` or ``"w2v"``; ``(None, None, "unknown")`` when no attempt
        yields predictions.
    """
    expected_width = getattr(model, "n_features_in_", None)
    candidates = (("tfidf", X_tfidf), ("w2v", X_w2v))

    # Pass 1: only matrices whose width is compatible with the model.
    for feature_name, features in candidates:
        if features is None:
            continue
        if expected_width is not None and features.shape[1] != expected_width:
            continue
        labels, confidence, _ = model_predict_with_confidence(model, features)
        if labels is not None:
            return labels, confidence, feature_name

    # Pass 2: last resort, try each matrix regardless of its width.
    for feature_name, features in candidates:
        labels, confidence, _ = model_predict_with_confidence(model, features)
        if labels is not None:
            return labels, confidence, feature_name

    return None, None, "unknown"

def map_to_label_names(y_pred):
    """Translate integer class ids into label names via the label encoder.

    The translation is applied only when the predictions have an integer
    dtype, the encoder ``le`` has ``classes_``, and the largest id fits within
    the number of encoder classes. In every other case, including any error
    raised along the way, the predictions are returned unchanged.

    Args:
        y_pred: Predictions (array-like), or ``None``.

    Returns:
        Decoded label names, the untouched *y_pred*, or ``None`` when *y_pred*
        is ``None``.
    """
    if y_pred is None:
        return None

    try:
        codes = np.array(y_pred)
        if not np.issubdtype(codes.dtype, np.integer):
            return y_pred
        if not hasattr(le, "classes_"):
            return y_pred
        if np.max(codes) + 1 > len(le.classes_):
            return y_pred
        return le.inverse_transform(codes)
    except Exception:
        # Best effort: fall back to the raw predictions rather than failing.
        return y_pred

def safe_round(x, n=4):
    """Round *x* to *n* decimals with ``np.round``, passing ``None`` through."""
    if x is None:
        return None
    return np.round(x, n)

# Sample inputs to run through every model.
texts = [
    "Filmnya bagus banget, aktingnya memukau dan ending-nya memuaskan!",
    "Pelayanannya buruk, pesanan terlambat dan rasanya mengecewakan.",
    "Saya baru selesai menonton film itu, akan coba baca ulasannya nanti.",
]

# Build both candidate feature spaces once; each classical model then picks
# the one it is compatible with.
X_tfidf = tfidf.transform(texts)
X_w2v = text_to_avg_w2v(texts)

(svm_preds_raw, svm_conf, svm_used), (rf_preds_raw, rf_conf, rf_used) = (
    choose_feature_space_and_predict(classifier, texts, X_tfidf, X_w2v)
    for classifier in (svm_model, rf_model)
)

# Turn integer class ids into label names where possible.
svm_labels, rf_labels = map(map_to_label_names, (svm_preds_raw, rf_preds_raw))

# Encode the texts with the CNN's tokenizer and pad each sequence at the end
# (padding="post") up to the input length the network expects.
seqs = tokenizer.texts_to_sequences(texts)
pads = pad_sequences(seqs, maxlen=maxlen, padding="post")
cnn_raw = cnn_model.predict(pads, verbose=0)

if cnn_raw.ndim == 1 or (cnn_raw.ndim == 2 and cnn_raw.shape[1] == 1):
    # Single output per sample, thresholded at 0.5.
    # NOTE(review): this treats the output as P(class 1), i.e. assumes a
    # sigmoid head — confirm against the trained model.
    cnn_pos_prob = cnn_raw.ravel()
    cnn_preds = (cnn_pos_prob >= 0.5).astype(int)
    # Report confidence in the *predicted* class: a class-0 prediction with
    # P(class 1) = 0.1 has confidence 0.9, not 0.1. This keeps the column
    # comparable with the multi-class branch below.
    cnn_conf = np.where(cnn_preds == 1, cnn_pos_prob, 1.0 - cnn_pos_prob)
else:
    # One score per class: predicted class is the arg-max, confidence its score.
    cnn_conf = cnn_raw.max(axis=1)
    cnn_preds = cnn_raw.argmax(axis=1)

# Map integer class ids back to label names.
cnn_labels = le.inverse_transform(cnn_preds)

## Inference Result

In [4]:
# Assemble one row per input text with each model's label and confidence.
# A classical model that produced no prediction is shown as "<failed>".
n_texts = len(texts)
svm_column = svm_labels if svm_labels is not None else ["<failed>"] * n_texts
rf_column = rf_labels if rf_labels is not None else ["<failed>"] * n_texts

result_columns = {
    "text": texts,
    "svm_pred": svm_column,
    "svm_conf": safe_round(svm_conf),
    "svm_feat": [svm_used] * n_texts,
    "rf_pred": rf_column,
    "rf_conf": safe_round(rf_conf),
    "rf_feat": [rf_used] * n_texts,
    "cnn_pred": cnn_labels,
    "cnn_conf": safe_round(cnn_conf),
}
df = pd.DataFrame(result_columns)

pd.set_option("display.max_colwidth", 200)
print(df.to_string(index=False))

                                                                text svm_pred  svm_conf svm_feat  rf_pred  rf_conf rf_feat cnn_pred  cnn_conf
   Filmnya bagus banget, aktingnya memukau dan ending-nya memuaskan! positive    0.7836    tfidf positive   0.9547     w2v positive     1.000
     Pelayanannya buruk, pesanan terlambat dan rasanya mengecewakan. negative    0.7566    tfidf negative   0.4840     w2v negative     1.000
Saya baru selesai menonton film itu, akan coba baca ulasannya nanti. positive    0.7696    tfidf positive   0.6330     w2v positive     0.999
