In [None]:
# app.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from pathlib import Path
import json, joblib, numpy as np
import re
import os

# Transformer imports (optional; only needed when transformer is present)
try:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    import torch
except Exception:
    AutoTokenizer = None
    AutoModelForSequenceClassification = None
    torch = None

# FastAPI application object; the @app.get / @app.post decorators further down
# register their handlers on it.
app = FastAPI(title="Reply Classification API", version="1.0")

# Request schema
# Body accepted by POST /predict: a single `text` field holding the reply to
# classify. Emptiness is checked inside the handler, not by this schema.
class PredictRequest(BaseModel):
    text: str

# Basic cleaning function (same as training)
def clean_text(s: str) -> str:
    """Normalize raw reply text into lowercase alphabetic words.

    The input is lowercased, then URLs, e-mail addresses, digit runs and every
    character outside ``a-z``/whitespace are replaced by a space; finally
    whitespace runs are collapsed and the ends trimmed. ``None`` yields ``""``.
    """
    if s is None:
        return ""

    # Applied in this order: URLs and e-mail addresses have to be blanked
    # before the character filter would split them into ordinary-looking words.
    noise_patterns = (
        r'http\S+|www\.\S+',
        r'\S+@\S+',
        r'\d+',
        r'[^a-z\s]',
    )

    normalized = str(s).lower()
    for pattern in noise_patterns:
        normalized = re.sub(pattern, ' ', normalized)
    return re.sub(r'\s+', ' ', normalized).strip()

# Load metadata (label mapping)
BASE = Path(".")   # current working directory
metadata_path = BASE / "metadata.json"


def build_inv_label_mapping(meta):
    """Build a ``{class_id: label_name}`` dict from parsed metadata.

    Two layouts are understood:

    * ``{"label_mapping": {"positive": 2, ...}}`` -- label name -> class id;
    * ``{"0": "negative", "1": "neutral", ...}`` -- flat class id -> label name.

    Returns ``None`` when ``meta`` matches neither layout (including an empty
    or missing ``label_mapping``), so callers fall back to numeric ids.
    """
    if not isinstance(meta, dict):
        return None
    forward = meta.get("label_mapping")
    try:
        if isinstance(forward, dict) and forward:
            return {int(class_id): name for name, class_id in forward.items()}
        # No usable "label_mapping" key: try the flat id -> name layout.
        flat = {int(class_id): name for class_id, name in meta.items()}
    except (TypeError, ValueError):
        # Some key/value is not an integer id, so this is not a label table.
        return None
    if flat and all(isinstance(name, str) for name in flat.values()):
        return flat
    return None


# Always define these names so later code can rely on them even when the
# metadata file is missing or unreadable.
metadata = None
label_mapping = None
inv_label_mapping = None

if metadata_path.exists():
    try:
        with open(metadata_path, "r", encoding="utf-8") as f:
            metadata = json.load(f)
    except (OSError, ValueError) as e:
        # A broken metadata file should not stop the API from starting;
        # predictions simply use numeric class ids.
        print("Failed to read metadata.json:", e)
    else:
        if isinstance(metadata, dict):
            label_mapping = metadata.get("label_mapping", None)
        inv_label_mapping = build_inv_label_mapping(metadata)
        if inv_label_mapping is None:
            print("metadata.json has no usable label mapping; predictions will use numeric ids.")

# Try to load transformer
# Handles for whichever backend ends up loaded; each one stays None until its
# artifact has been read successfully.
transformer_model = None
tokenizer = None
use_transformer = False
tfidf = None
clf = None
lgb_booster = None

# Preferred backend: the fine-tuned DistilBERT checkpoint directory. It is
# only attempted when the optional transformers import succeeded.
_transformer_dir = BASE / "distilbert_finetuned"
if _transformer_dir.exists() and AutoTokenizer is not None:
    try:
        tokenizer = AutoTokenizer.from_pretrained(str(_transformer_dir))
        transformer_model = AutoModelForSequenceClassification.from_pretrained(str(_transformer_dir))
        # Run on the GPU whenever CUDA is usable.
        if torch is not None and torch.cuda.is_available():
            transformer_model.cuda()
        use_transformer = True
        print("Using transformer model: distilbert_finetuned")
    except Exception as e:
        print("Failed to load transformer:", e)
        use_transformer = False

# Fallback backends share one TF-IDF vectorizer; the joblib classifier is
# preferred over the LightGBM booster when both files are present.
if not use_transformer:
    _vectorizer_file = BASE / "tfidf_vectorizer.joblib"
    _sklearn_clf_file = BASE / "logistic_regression_tfidf.joblib"
    _lightgbm_file = BASE / "lightgbm_tfidf.txt"

    if _vectorizer_file.exists() and _sklearn_clf_file.exists():
        tfidf = joblib.load(str(_vectorizer_file))
        clf = joblib.load(str(_sklearn_clf_file))
        print("Using TF-IDF + classifier model")
    elif _vectorizer_file.exists() and _lightgbm_file.exists():
        tfidf = joblib.load(str(_vectorizer_file))
        # lightgbm is imported lazily so the app still starts without it.
        try:
            import lightgbm as lgb
            lgb_booster = lgb.Booster(model_file=str(_lightgbm_file))
            print("Using TF-IDF + LightGBM model")
        except Exception as e:
            print("LightGBM not available:", e)
    else:
        print("No model artifacts found. Please place distilbert_finetuned/ or tfidf_vectorizer.joblib + classifier in the app folder.")

@app.get("/health")
def health():
    """Report liveness and which model backend is serving predictions.

    Returns:
        A dict with ``status`` (always ``"ok"``) and ``model``, one of
        ``"distilbert"``, ``"tfidf_lr"``, ``"tfidf_lgbm"`` or ``"none"``.
    """
    if use_transformer:
        model_name = "distilbert"
    elif clf is not None:
        model_name = "tfidf_lr"
    elif tfidf is not None and lgb_booster is not None:
        # The LightGBM fallback can serve /predict, so it must not be
        # reported as "none".
        model_name = "tfidf_lgbm"
    else:
        model_name = "none"
    return {"status": "ok", "model": model_name}

MAX_TEXT_CHARS = 5000   # raw input longer than this is truncated
MAX_TOKENS = 128        # tokenizer truncation length on the transformer path


def _label_for(class_value):
    """Translate a model class value into the label string sent to clients."""
    try:
        class_id = int(class_value)
    except (TypeError, ValueError):
        # The classifier was trained on non-numeric labels: use them verbatim.
        return str(class_value)
    if inv_label_mapping:
        return inv_label_mapping.get(class_id, str(class_id))
    return str(class_id)


def _as_distribution(raw):
    """Return a model's per-sample output as a 1-D probability vector."""
    probs = np.asarray(raw, dtype=float)
    if probs.ndim == 0:
        # Binary models emit a single positive-class probability; expand it so
        # argmax can pick either class.
        positive = float(probs)
        return np.array([1.0 - positive, positive])
    return probs


def _respond(probs, classes=None):
    """Build the response dict from a probability vector.

    ``classes`` optionally maps vector positions to the model's own class
    values (e.g. sklearn's ``classes_``); without it the position is the id.
    """
    best = int(np.argmax(probs))
    if classes is not None and best < len(classes):
        class_value = classes[best]
    else:
        class_value = best
    return {"label": _label_for(class_value), "confidence": round(float(probs[best]), 4)}


def _sklearn_probabilities(X):
    """Return class probabilities from ``clf`` for one sample, or None."""
    if hasattr(clf, "predict_proba"):
        return _as_distribution(clf.predict_proba(X)[0])
    if not hasattr(clf, "decision_function"):
        return None
    try:
        scores = np.asarray(clf.decision_function(X)[0], dtype=float)
    except Exception as e:
        # Best effort only: the caller falls back to clf.predict().
        print("decision_function failed; falling back to predict():", e)
        return None
    if scores.ndim == 0:
        # Binary estimators return one signed margin per sample; squash it
        # with a sigmoid instead of a (degenerate) one-element softmax.
        positive = 1.0 / (1.0 + np.exp(-float(scores)))
        return np.array([1.0 - positive, positive])
    exp = np.exp(scores - np.max(scores))   # shift for numerical stability
    return exp / exp.sum()


@app.post("/predict")
def predict(req: PredictRequest):
    """Classify a reply with whichever model backend was loaded at startup.

    Backends are tried in order: transformer, TF-IDF + sklearn classifier,
    TF-IDF + LightGBM booster.

    Returns:
        ``{"label": <str>, "confidence": <float rounded to 4 places>}``.

    Raises:
        HTTPException: 400 when ``text`` is empty or whitespace only;
            500 when no model backend is loaded.
    """
    text = req.text
    if not isinstance(text, str) or not text.strip():
        raise HTTPException(status_code=400, detail="`text` is required and must be non-empty string.")
    text = text[:MAX_TEXT_CHARS]

    # NOTE: cleaning can leave an empty string (e.g. digits/punctuation only);
    # the models below still return a prediction for it.
    cleaned = clean_text(text)

    # Transformer path
    if use_transformer and tokenizer is not None and transformer_model is not None:
        inputs = tokenizer(cleaned, truncation=True, padding=True, max_length=MAX_TOKENS, return_tensors="pt")
        # Follow the model's actual device instead of re-running .cuda() on
        # the model for every request.
        device = next(transformer_model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        transformer_model.eval()
        with torch.no_grad():
            logits = transformer_model(**inputs).logits
            probs = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()[0]
        return _respond(probs)

    # TF-IDF + sklearn classifier
    if tfidf is not None and clf is not None:
        X = tfidf.transform([cleaned])
        probs = _sklearn_probabilities(X)
        if probs is not None:
            # Probability columns follow clf.classes_, which need not be 0..n-1.
            return _respond(probs, getattr(clf, "classes_", None))
        # No score available: report the hard prediction. Confidence stays at
        # the historical placeholder 1.0 to keep the response shape stable.
        return {"label": _label_for(clf.predict(X)[0]), "confidence": 1.0}

    # TF-IDF + LightGBM booster
    if tfidf is not None and lgb_booster is not None:
        X = tfidf.transform([cleaned])
        return _respond(_as_distribution(lgb_booster.predict(X)[0]))

    raise HTTPException(status_code=500, detail="No model loaded on server.")


No model artifacts found. Please place distilbert_finetuned/ or tfidf_vectorizer.joblib + classifier in the app folder.


In [None]:
import json
import os

# Create the app folder up front; exist_ok makes re-running this cell harmless.
os.makedirs("/content/app", exist_ok=True)

# Class id (as a string key) -> label name.
labels = dict(zip(("0", "1", "2"), ("negative", "neutral", "positive")))

# Serialize the mapping into the app folder.
with open("/content/app/metadata.json", "w") as f:
    f.write(json.dumps(labels))


In [None]:
# Read the class-id -> label table back from the app folder.
with open("/content/app/metadata.json", "r") as f:
    id2label = json.loads(f.read())

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint folder, given relative to the current working directory.
model_path = "./distilbert_finetuned"  # make sure this folder exists
# The tokenizer and the classification model are both loaded from that folder.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import json

# Load model + tokenizer
# MODEL_PATH is relative, so it is resolved against the working directory.
MODEL_PATH = "distilbert_finetuned"
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_PATH)

# Load label metadata
# /predict indexes id2label with str(class id), so the JSON keys are expected
# to be strings such as "0", "1", "2".
# NOTE(review): an earlier cell writes /content/app/metadata.json, while this
# relative path only matches it when the working directory is /content/app -- confirm.
with open("metadata.json", "r") as f:
    id2label = json.load(f)

# Application object the /predict route below is registered on.
app = FastAPI()

# Request body for POST /predict: a single `text` field.
class InputText(BaseModel):
    text: str

@app.post("/predict")
def predict(input: InputText):
    """Classify ``input.text`` with the loaded DistilBERT model.

    Returns:
        A dict with ``label`` (the name from ``id2label``, or the class id as
        a string when the metadata has no entry for it) and ``confidence``
        (softmax probability of the predicted class).
    """
    inputs = tokenizer(input.text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: without no_grad every request would build an autograd
    # graph that is never used.
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    conf, pred_id = torch.max(probs, dim=1)
    class_key = str(pred_id.item())
    return {
        # Fall back to the raw id rather than failing with KeyError (HTTP 500)
        # when the metadata does not list this class.
        "label": id2label.get(class_key, class_key),
        "confidence": float(conf.item()),
    }

# New Section