In [1]:
import os, re, json, glob, argparse, sys, types
from datetime import datetime
import numpy as np
import pandas as pd
import joblib
import librosa, soundfile as sf

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# ---------- Defaults ----------
DEFAULT_MODEL = r"C:\Users\sagni\Downloads\Poly Glot AI\polyglot_model.pkl"
DEFAULT_AUDIO_DIR = r"C:\Users\sagni\Downloads\Poly Glot AI\archive\audio"
DEFAULT_TEXT_DIR  = r"C:\Users\sagni\Downloads\Poly Glot AI\archive\text"
DEFAULT_CSV_PATH  = r"C:\Users\sagni\Downloads\Poly Glot AI\archive\British English Speech Recognition.csv"
DEFAULT_OUT_DIR   = r"C:\Users\sagni\Downloads\Poly Glot AI"
POSSIBLE_TEXT_COLS = ["Text","text","Transcript","transcript","utterance","sentence","phrase"]

# ---------- Robust I/O helpers ----------
def list_basenames(root, exts=("*.wav","*.WAV")):
    out = {}
    for pat in exts:
        for path in glob.glob(os.path.join(root, pat)):
            out[os.path.splitext(os.path.basename(path))[0]] = path
    return out

def try_read_csv(path):
    try:
        return pd.read_csv(path)
    except Exception:
        return pd.read_csv(path, encoding="latin-1")

def safe_read_txt(path):
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read().strip()
    except Exception:
        return ""

def robust_load_wav(path, sr=16000):
    try:
        y, r = sf.read(path, always_2d=False)
        if isinstance(y, np.ndarray):
            if y.ndim > 1:
                y = np.mean(y, axis=1)
            if r != sr:
                y = librosa.resample(y.astype(float), orig_sr=r, target_sr=sr)
                r = sr
        return y.astype(float), r
    except Exception:
        try:
            y, r = librosa.load(path, sr=sr, mono=True)
            return y, r
        except Exception:
            return None, None

# ---------- Re-register custom classes so unpickling works ----------
# The model was pickled with custom transformers (Squeeze1D, TextStatExtractor, AudioFeatureExtractor)
# likely under module 'polyglot_train_and_export'. We create a dummy module of that name and
# register identical classes into it so joblib.load can resolve them.
poly_mod_name = "polyglot_train_and_export"
poly_mod = types.ModuleType(poly_mod_name)
sys.modules[poly_mod_name] = poly_mod

class Squeeze1D(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X): return np.asarray(X, dtype=object).ravel()
Squeeze1D.__module__ = poly_mod_name
setattr(poly_mod, "Squeeze1D", Squeeze1D)

class TextStatExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.feature_names_ = np.array(["char_len","word_count","avg_word_len"])
        return self
    def transform(self, X):
        X = np.asarray(X, dtype=object).ravel().tolist()
        rows = []
        for t in X:
            t = t if isinstance(t, str) else ""
            words = re.findall(r"\w+", t, flags=re.UNICODE)
            rows.append([len(t), len(words), (sum(len(w) for w in words)/len(words) if words else 0.0)])
        return np.array(rows, dtype=float)
    def get_feature_names_out(self, input_features=None):
        return np.array(["char_len","word_count","avg_word_len"])
TextStatExtractor.__module__ = poly_mod_name
setattr(poly_mod, "TextStatExtractor", TextStatExtractor)

class AudioFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, sr=16000, n_mfcc=13):
        self.sr = sr; self.n_mfcc = n_mfcc
        self.feature_names_ = None
    def fit(self, X, y=None):
        names = ["dur_sec","rms_mean","rms_std","zcr_mean","spec_cent_mean","spec_cent_std"]
        names += [f"mfcc{i+1}_mean" for i in range(self.n_mfcc)]
        names += [f"mfcc{i+1}_std" for i in range(self.n_mfcc)]
        self.feature_names_ = names
        return self
    def transform(self, X):
        X = np.asarray(X, dtype=object).ravel().tolist()
        feats = []
        for p in X:
            feats.append(self._feat_one(p))
        return np.array(feats)
    def _feat_one(self, path):
        try:
            y, sr = robust_load_wav(path, sr=self.sr)
            if y is None or len(y)==0:
                raise RuntimeError
            dur = len(y) / float(sr)
            rms = librosa.feature.rms(y=y).flatten()
            zcr = librosa.feature.zero_crossing_rate(y).flatten()
            sc  = librosa.feature.spectral_centroid(y=y, sr=sr).flatten()
            mf  = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc)
            v = [
                dur,
                float(np.mean(rms)) if rms.size else 0.0,
                float(np.std(rms))  if rms.size else 0.0,
                float(np.mean(zcr)) if zcr.size else 0.0,
                float(np.mean(sc))  if sc.size else 0.0,
                float(np.std(sc))   if sc.size else 0.0,
            ]
            if mf.size:
                v += list(np.mean(mf, axis=1)); v += list(np.std(mf, axis=1))
            else:
                v += [0.0]*self.n_mfcc + [0.0]*self.n_mfcc
            return v
        except Exception:
            return [0.0]*(6 + self.n_mfcc*2)
    def get_feature_names_out(self, input_features=None):
        return np.array(self.feature_names_ or [])
AudioFeatureExtractor.__module__ = poly_mod_name
setattr(poly_mod, "AudioFeatureExtractor", AudioFeatureExtractor)

# ---------- Build inputs ----------
def build_dataframe(audio_dir, text_dir, csv_path):
    audio_map = list_basenames(audio_dir, ("*.wav","*.WAV"))
    text_map  = {}
    for pat in ("*.txt","*.TXT"):
        for p in glob.glob(os.path.join(text_dir, pat)):
            text_map[os.path.splitext(os.path.basename(p))[0]] = p

    df_csv = try_read_csv(csv_path) if os.path.isfile(csv_path) else pd.DataFrame()
    if not df_csv.empty:
        fname_col = next((c for c in ["filename","file","path","wav","audio","fname","id","ID","Name","name","Audio"] if c in df_csv.columns), None)
        if fname_col:
            df_csv["basename"] = df_csv[fname_col].astype(str).apply(lambda p: os.path.splitext(os.path.basename(p))[0])
        elif "basename" not in df_csv.columns:
            df_csv["basename"] = df_csv.index.astype(str)
    else:
        union = sorted(set(audio_map.keys()))
        df_csv = pd.DataFrame({"basename": union})

    # attach paths + text
    df_csv["audio_path"] = df_csv["basename"].map(audio_map)
    # text value: prefer CSV text column, else .txt file
    text_col = next((c for c in POSSIBLE_TEXT_COLS if c in df_csv.columns), None)
    if text_col:
        df_csv["text_value"] = df_csv[text_col].fillna("").astype(str)
    else:
        df_csv["text_value"] = ""

    def fill_from_txt(row):
        if row["text_value"]:
            return row["text_value"]
        tp = text_map.get(row["basename"])
        return safe_read_txt(tp) if tp else ""
    df_csv["text_value"] = df_csv.apply(fill_from_txt, axis=1)

    # keep rows with audio_path
    df = df_csv[df_csv["audio_path"].notna()].reset_index(drop=True)

    # X_frame for model
    X_frame = pd.DataFrame({
        "audio_path": df["audio_path"].astype(str).values,
        "text_value": df["text_value"].astype(str).values
    })
    return df, X_frame

# ---------- Prediction core ----------
def run_prediction(model_path, audio_dir, text_dir, csv_path, out_dir):
    os.makedirs(out_dir, exist_ok=True)

    # load model
    model = joblib.load(model_path)

    # prepare data
    df_meta, X_frame = build_dataframe(audio_dir, text_dir, csv_path)
    if X_frame.empty:
        raise SystemExit("No audio found to predict on.")

    # predict
    preds = None
    proba = None
    details = {}

    # determine if supervised ('model' step) or unsupervised ('cluster' step)
    final_step_name, final_step = list(model.named_steps.items())[-1]
    if final_step_name == "model":
        preds = model.predict(X_frame)
        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(X_frame)  # shape (n, n_classes)
            classes_ = getattr(final_step, "classes_", None)
            details["classes_"] = classes_.tolist() if classes_ is not None else None
    elif final_step_name == "cluster":
        preds = model.predict(X_frame)  # cluster index
        # optional distances if available
        if hasattr(final_step, "transform"):
            distances = final_step.transform(model.named_steps["features"].transform(X_frame))
            details["cluster_distances_shape"] = list(distances.shape)
    else:
        # generic fallback
        preds = model.predict(X_frame)

    # assemble output table
    out = df_meta.copy()
    out.insert(len(out.columns), "prediction", preds)

    # add probabilities per class if available (supervised)
    if proba is not None:
        classes = details.get("classes_")
        if classes is None:
            classes = list(range(proba.shape[1]))
        for i, cls in enumerate(classes):
            col = f"proba_{cls}"
            out[col] = proba[:, i]

    # save CSV
    csv_out = os.path.join(out_dir, "polyglot_predictions.csv")
    out.to_csv(csv_out, index=False, encoding="utf-8")

    # summarize
    summary = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "model_path": model_path,
        "audio_dir": audio_dir,
        "text_dir": text_dir,
        "csv_path": csv_path if os.path.isfile(csv_path) else None,
        "n_rows": int(out.shape[0]),
        "prediction_column": "prediction",
        "unique_predictions": out["prediction"].value_counts(dropna=False).to_dict(),
        "probabilities_included": proba is not None,
        "details": details
    }
    js_path = os.path.join(out_dir, "polyglot_summary.json")
    with open(js_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    # print a tiny preview
    print("\n=== Prediction complete ===")
    print("Predictions CSV ->", csv_out)
    print("Summary JSON    ->", js_path)
    print("Head:")
    print(out.head(10).to_string(index=False))

def main():
    ap = argparse.ArgumentParser(description="PolyGlotAI batch prediction")
    ap.add_argument("--model", default=DEFAULT_MODEL, help="Path to polyglot_model.pkl")
    ap.add_argument("--audio", default=DEFAULT_AUDIO_DIR, help="Audio dir with .wav")
    ap.add_argument("--text",  default=DEFAULT_TEXT_DIR,  help="Text dir with .txt (optional)")
    ap.add_argument("--csv",   default=DEFAULT_CSV_PATH,  help="CSV metadata (optional but recommended)")
    ap.add_argument("--out",   default=DEFAULT_OUT_DIR,   help="Output directory")
    args, _ = ap.parse_known_args()

    run_prediction(args.model, args.audio, args.text, args.csv, args.out)

if __name__ == "__main__":
    main()



=== Prediction complete ===
Predictions CSV -> C:\Users\sagni\Downloads\Poly Glot AI\polyglot_predictions.csv
Summary JSON    -> C:\Users\sagni\Downloads\Poly Glot AI\polyglot_summary.json
Head:
 ID       Audio       Text basename                                                audio_path text_value  prediction
  1 audio/1.wav text/1.txt        1 C:\Users\sagni\Downloads\Poly Glot AI\archive\audio\1.wav text/1.txt           0
  2 audio/2.wav text/2.txt        2 C:\Users\sagni\Downloads\Poly Glot AI\archive\audio\2.wav text/2.txt           1
  3 audio/3.wav text/3.txt        3 C:\Users\sagni\Downloads\Poly Glot AI\archive\audio\3.wav text/3.txt           3
  4 audio/4.wav text/4.txt        4 C:\Users\sagni\Downloads\Poly Glot AI\archive\audio\4.wav text/4.txt           2
  5 audio/5.wav text/5.txt        5 C:\Users\sagni\Downloads\Poly Glot AI\archive\audio\5.wav text/5.txt           4
