In [5]:
import os
import re
import json
import glob
import argparse
from datetime import datetime

import numpy as np
import pandas as pd
import joblib
import yaml
import h5py

# Audio feature extraction
import librosa
import soundfile as sf

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import MiniBatchKMeans


# =========================
# Paths (defaults)
# =========================
# Fallback locations used by main() when the matching command-line option
# (--audio / --text / --csv / --out) is not supplied.
DEFAULT_AUDIO_DIR = r"C:\Users\sagni\Downloads\Poly Glot AI\archive\audio"
DEFAULT_TEXT_DIR  = r"C:\Users\sagni\Downloads\Poly Glot AI\archive\text"
DEFAULT_CSV_PATH  = r"C:\Users\sagni\Downloads\Poly Glot AI\archive\British English Speech Recognition.csv"
DEFAULT_OUT_DIR   = r"C:\Users\sagni\Downloads\Poly Glot AI"

# Candidate CSV column names for the supervised target. main() takes the first
# entry that exists in the CSV, so the order of this list is a priority order.
POSSIBLE_LABELS = [
    "label", "Label", "target", "Target", "accent", "Accent",
    "speaker_id", "Speaker", "class", "Class"
]
# Candidate CSV column names for the transcript text; the first match wins as well.
POSSIBLE_TEXT_COLS = ["transcript", "text", "utterance", "sentence", "phrase", "Transcript", "Text"]


# =========================
# Utilities
# =========================
def list_basenames(root, exts=("*.wav","*.WAV")):
    """Return dict: basename (without ext) -> full path for first match.

    Args:
        root: Directory to search (not recursive).
        exts: Glob patterns, tried in the order given.

    Returns:
        A dict mapping each file's name without extension to its path. When
        several files share a basename, the first match is kept: patterns are
        tried in order, and paths within one pattern in sorted order.
    """
    mapping = {}
    # Escape the directory part so glob metacharacters in it ("[", "*", "?")
    # are treated literally; only `pat` is meant to act as a pattern.
    safe_root = glob.escape(root)
    for pat in exts:
        # glob's own ordering is filesystem dependent; sort for reproducibility.
        for path in sorted(glob.glob(os.path.join(safe_root, pat))):
            base = os.path.splitext(os.path.basename(path))[0]
            # setdefault keeps the first match (plain assignment kept the last).
            mapping.setdefault(base, path)
    return mapping

def list_basenames_txt(root, exts=("*.txt","*.TXT")):
    """Index the text files directly under `root` by name without extension.

    Each glob pattern in `exts` is applied in turn; if two files share a
    basename, the one found later replaces the earlier entry.
    Returns a dict of basename -> full path.
    """
    found = {}
    for pattern in exts:
        matches = glob.glob(os.path.join(root, pattern))
        found.update(
            (os.path.splitext(os.path.basename(match))[0], match)
            for match in matches
        )
    return found

def safe_read_txt(path):
    """Read a text file best-effort and return its stripped contents.

    Args:
        path: Path of the text file to read.

    Returns:
        The file contents with surrounding whitespace removed, or "" when the
        file cannot be opened or read. Undecodable bytes are skipped.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops a
        # leading byte-order mark, which str.strip() would otherwise leave in
        # the transcript as a stray U+FEFF character.
        with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
            return f.read().strip()
    except (OSError, TypeError, ValueError):
        # Missing/unreadable file, or a path value open() rejects: stay best-effort.
        return ""

def save_h5_from_pickle(pkl_path: str, h5_path: str, meta: dict):
    """Wrap an existing pickle file in an HDF5 container.

    The raw bytes of `pkl_path` are stored in a dataset named "model_pickle"
    inside a newly written `h5_path`; every item of `meta` becomes an
    attribute of that dataset. Strings and numbers are stored directly, other
    values as JSON text, and anything that cannot be stored either way as
    its str() form.
    """
    plain_types = (str, int, float, np.integer, np.floating)

    with open(pkl_path, "rb") as source:
        payload = source.read()

    with h5py.File(h5_path, "w") as container:
        blob = container.create_dataset("model_pickle", data=np.void(payload))
        for key, value in meta.items():
            try:
                blob.attrs[key] = value if isinstance(value, plain_types) else json.dumps(value)
            except Exception:
                # Last resort so one awkward metadata value cannot abort the export.
                blob.attrs[key] = str(value)

def robust_load_wav(path, sr=16000):
    """
    Load an audio file as a mono float signal at sample rate `sr`.

    soundfile is tried first (multi-channel data is averaged down to mono and
    resampled with librosa when the file's rate differs from `sr`); if that
    attempt raises, librosa.load is used instead.
    Returns (samples, rate), or (None, None) when both attempts fail.
    """
    # Attempt 1: soundfile, with manual down-mix and resampling.
    try:
        samples, rate = sf.read(path, always_2d=False)
        if isinstance(samples, np.ndarray):
            if samples.ndim > 1:
                samples = np.mean(samples, axis=1)
            if rate != sr:
                samples = librosa.resample(samples.astype(float), orig_sr=rate, target_sr=sr)
                rate = sr
        return samples.astype(float), rate
    except Exception:
        pass

    # Attempt 2: let librosa decode, down-mix and resample in one call.
    try:
        samples, rate = librosa.load(path, sr=sr, mono=True)
    except Exception:
        return None, None
    return samples, rate


# =========================
# Pickle-safe Transformers
# =========================
class Squeeze1D(BaseEstimator, TransformerMixin):
    """Flatten the incoming selection into a 1-D object array (n_samples,)."""

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X):
        as_objects = np.asarray(X, dtype=object)
        return as_objects.ravel()

class TextStatExtractor(BaseEstimator, TransformerMixin):
    """Simple numeric text stats: char_len, word_count, avg_word_len."""

    def fit(self, X, y=None):
        """Record the output feature names; nothing is learned from X."""
        self.feature_names_ = np.array(["char_len", "word_count", "avg_word_len"])
        return self

    def transform(self, X):
        """Compute the three statistics for every text in X.

        X may be any array-like of texts; it is flattened first. Non-string
        entries are treated as empty text.

        Returns:
            Float array of shape (n_samples, 3), including (0, 3) for empty
            input so the result can always be stacked next to other features.
        """
        texts = np.asarray(X, dtype=object).ravel().tolist()
        rows = []
        for t in texts:
            t = t if isinstance(t, str) else ""
            chars = len(t)
            words = re.findall(r"\w+", t, flags=re.UNICODE)
            wcnt = len(words)
            # Guard the division: a text without any word has average length 0.
            avgw = (sum(len(w) for w in words) / wcnt) if wcnt else 0.0
            rows.append([chars, wcnt, avgw])
        # reshape keeps the result 2-D even when `rows` is empty.
        return np.array(rows, dtype=float).reshape(-1, 3)

    def get_feature_names_out(self, input_features=None):
        """Return the feature names set by fit()."""
        return self.feature_names_

class AudioFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Extract compact audio features from file paths:
      - duration, rms mean/std, zcr mean, spectral centroid mean/std,
        MFCC n_mfcc means + n_mfcc stds  (6 + 2*n_mfcc dims; 32 with the
        default n_mfcc=13)

    Parameters:
      - sr: sample rate every file is loaded at.
      - n_mfcc: number of MFCC coefficients to summarise.
    """
    def __init__(self, sr=16000, n_mfcc=13):
        # Only hyper-parameters are stored here. Fitted state (feature_names_)
        # is created in fit(), following the scikit-learn estimator convention.
        self.sr = sr
        self.n_mfcc = n_mfcc

    def fit(self, X, y=None):
        """Record the output feature names; nothing is learned from X."""
        names = ["dur_sec", "rms_mean", "rms_std", "zcr_mean", "spec_cent_mean", "spec_cent_std"]
        names += [f"mfcc{i+1}_mean" for i in range(self.n_mfcc)]
        names += [f"mfcc{i+1}_std" for i in range(self.n_mfcc)]
        self.feature_names_ = names
        return self

    def transform(self, X):
        """Turn audio file paths into a float matrix of shape (n_samples, 6 + 2*n_mfcc).

        A file that cannot be loaded or analysed yields an all-zero row, so
        the output always lines up with the input; one RuntimeWarning reports
        how many files were affected.
        """
        import warnings

        paths = np.asarray(X, dtype=object).ravel().tolist()
        n_dims = 6 + self.n_mfcc * 2
        feats = []
        failed = 0
        for wav_path in paths:
            try:
                y, sr = robust_load_wav(wav_path, sr=self.sr)
                if y is None or sr is None or len(y) == 0:
                    raise RuntimeError("Audio load failed")

                dur = len(y) / float(sr)

                rms = librosa.feature.rms(y=y).flatten()
                zcr = librosa.feature.zero_crossing_rate(y).flatten()
                spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr).flatten()
                mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc)

                vec = [
                    dur,
                    float(np.mean(rms)) if rms.size else 0.0,
                    float(np.std(rms))  if rms.size else 0.0,
                    float(np.mean(zcr)) if zcr.size else 0.0,
                    float(np.mean(spec_cent)) if spec_cent.size else 0.0,
                    float(np.std(spec_cent))  if spec_cent.size else 0.0,
                ]
                if mfcc.size:
                    vec += list(np.mean(mfcc, axis=1))
                    vec += list(np.std(mfcc, axis=1))
                else:
                    vec += [0.0]*self.n_mfcc + [0.0]*self.n_mfcc
            except Exception:
                # Best-effort: keep the row (as zeros) but remember the failure.
                failed += 1
                vec = [0.0]*n_dims
            feats.append(vec)

        if failed:
            warnings.warn(
                f"AudioFeatureExtractor: {failed} of {len(paths)} audio file(s) could not "
                "be processed; all-zero feature rows were used for them.",
                RuntimeWarning,
            )
        # reshape keeps the result 2-D even when no paths were given.
        return np.array(feats, dtype=float).reshape(-1, n_dims)

    def get_feature_names_out(self, input_features=None):
        """Return the feature names set by fit() (empty array before fitting)."""
        return np.array(getattr(self, "feature_names_", None) or [])


# =========================
# Main
# =========================
def main():
    """Train a PolyGlotAI model from audio/text/CSV inputs and export artifacts.

    Steps:
      1. Index .wav and .txt files by basename and optionally load the CSV.
      2. Build one row per sample with its audio path and transcript (taken
         from the CSV text column, else from the matching .txt file); rows
         without an audio file are dropped.
      3. If a label column with at least two classes exists, fit the candidate
         classifiers on a 75/25 split and keep the best one (ROC AUC when it
         can be computed, otherwise accuracy). Otherwise fit MiniBatchKMeans
         on all rows.
      4. Write the pickle, an HDF5 container of that pickle, a YAML config
         and a JSON report into the output directory.

    Raises:
        ValueError: if no row has a matching audio file.
    """
    parser = argparse.ArgumentParser(description="PolyGlotAI train + export (pickle-safe)")
    parser.add_argument("--audio", default=DEFAULT_AUDIO_DIR, help="Audio directory with .wav")
    parser.add_argument("--text",  default=DEFAULT_TEXT_DIR,  help="Text directory with .txt")
    parser.add_argument("--csv",   default=DEFAULT_CSV_PATH,  help="CSV with metadata (optional)")
    parser.add_argument("--out",   default=DEFAULT_OUT_DIR,   help="Output directory for artifacts")
    # parse_known_args ignores unrecognised arguments instead of exiting.
    args, _ = parser.parse_known_args()

    os.makedirs(args.out, exist_ok=True)

    # -------- Collect file lists --------
    audio_map = list_basenames(args.audio, exts=("*.wav","*.WAV"))
    text_map  = list_basenames_txt(args.text, exts=("*.txt","*.TXT"))

    df_csv = None
    if os.path.isfile(args.csv):
        try:
            df_csv = pd.read_csv(args.csv)
        except UnicodeDecodeError:
            # Only a decoding failure can be helped by retrying with latin-1.
            df_csv = pd.read_csv(args.csv, encoding="latin-1")

    # normalize csv to have 'basename' and maybe 'transcript'/'label'
    if df_csv is not None:
        df_csv = df_csv.copy()
        fname_col = next((c for c in ["filename","file","path","wav","audio","fname","id","ID","Name","name"] if c in df_csv.columns), None)
        if fname_col is not None:
            df_csv["basename"] = df_csv[fname_col].astype(str).apply(lambda p: os.path.splitext(os.path.basename(p))[0])
        else:
            df_csv["basename"] = df_csv.index.astype(str)

        text_col = next((c for c in POSSIBLE_TEXT_COLS if c in df_csv.columns), None)
        label_col = next((c for c in POSSIBLE_LABELS if c in df_csv.columns), None)
    else:
        # No CSV: one row per basename seen in either directory.
        union_basenames = sorted(set(audio_map.keys()) | set(text_map.keys()))
        df_csv = pd.DataFrame({"basename": union_basenames})
        text_col = None
        label_col = None

    # Attach resolved paths and text/transcript
    df_csv["audio_path"] = df_csv["basename"].map(audio_map)
    if 'text_value' not in df_csv.columns:
        df_csv["text_value"] = ""
    if text_col:
        df_csv["text_value"] = df_csv[text_col].fillna("").astype(str)

    # fill text_value from text files if empty
    def _fill(row):
        if row["text_value"]:
            return row["text_value"]
        p = text_map.get(row["basename"])
        return safe_read_txt(p) if p else ""
    df_csv["text_value"] = df_csv.apply(_fill, axis=1)

    # Drop rows without audio
    df = df_csv[df_csv["audio_path"].notna()].reset_index(drop=True)
    if df.empty:
        # Fail early with an actionable message instead of an obscure
        # estimator error about zero samples.
        raise ValueError(
            f"No rows with a matching .wav file were found (audio dir: {args.audio}, csv: {args.csv})."
        )

    # Determine y (label) if available
    y = None
    label_used = None
    label_mapping = None
    if label_col and label_col in df.columns:
        y_series = df[label_col]
        if y_series.dtype == object:
            # Encode string labels as integers in sorted order.
            cats = {v: i for i, v in enumerate(sorted(y_series.dropna().unique()))}
            y = y_series.map(cats)
            label_mapping = {str(k): int(v) for k, v in cats.items()}
        else:
            y = y_series.copy()
        label_used = label_col

    # Frame for transformers
    X_frame = pd.DataFrame({
        "audio_path": df["audio_path"].astype(str).values,
        "text_value": df["text_value"].astype(str).values
    })

    # --------------------------
    # Build feature + model pipeline(s) (no lambdas)
    # --------------------------
    def build_features():
        """Return a fresh, unfitted audio+text feature transformer."""
        audio_pipe = Pipeline([
            ("select", Squeeze1D()),
            ("afe", AudioFeatureExtractor(sr=16000, n_mfcc=13)),
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ])

        text_stats = TextStatExtractor()
        text_tfidf = TfidfVectorizer(analyzer="char", ngram_range=(2,4), max_features=800)

        # Two branches on the same source column "text_value"
        text_features = ColumnTransformer(
            transformers=[
                ("tfidf", Pipeline([("sel", Squeeze1D()), ("tfidf", text_tfidf)]), "text_value"),
                ("tstats", Pipeline([("sel", Squeeze1D()), ("tstat", text_stats)]), "text_value"),
            ],
            remainder="drop"
        )

        return ColumnTransformer(
            transformers=[
                ("audio", audio_pipe, ["audio_path"]),
                ("text",  text_features, ["text_value"])
            ],
            remainder="drop"
        )

    # Supervised path
    supervised_results = None
    best_pipe = None
    chosen_model = None
    mode = "unsupervised"
    n_clusters = 5

    if y is not None and pd.Series(y).nunique() >= 2:
        mode = "supervised"

        # Rows without a label cannot be used for training or evaluation.
        labelled = pd.Series(y).notna().to_numpy()
        X_sup = X_frame[labelled]
        y_sup = pd.Series(y)[labelled]
        if label_mapping is not None:
            # map() yields floats when some labels were missing; restore ints.
            y_sup = y_sup.astype(int)

        models = {
            "logreg": LogisticRegression(max_iter=2000),
            "rf": RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1)
        }

        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X_sup, y_sup, test_size=0.25, random_state=42, stratify=y_sup
            )
        except ValueError:
            # Stratification is impossible (e.g. a class with a single sample):
            # fall back to a plain random split.
            X_train, X_test, y_train, y_test = train_test_split(
                X_sup, y_sup, test_size=0.25, random_state=42
            )

        supervised_results = {}
        best_name, best_score = None, -np.inf

        for name, est in models.items():
            # Each candidate gets its own feature transformer so the fitted
            # pipelines do not share (and refit) the same objects.
            pipe = Pipeline([("features", build_features()), ("model", est)])
            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)
            acc = accuracy_score(y_test, y_pred)

            auc = None
            try:
                # AUC only for binary test sets with non-constant probabilities.
                if len(np.unique(y_test)) == 2 and hasattr(pipe, "predict_proba"):
                    probs = pipe.predict_proba(X_test)[:, 1]
                    if not np.allclose(np.min(probs), np.max(probs)):
                        auc = roc_auc_score(y_test, probs)
            except Exception:
                auc = None

            score = auc if auc is not None else acc
            supervised_results[name] = {
                "accuracy": float(acc),
                "roc_auc": (None if auc is None else float(auc)),
                "chosen_score": float(score)
            }
            if score > best_score:
                best_score = score
                best_name = name
                best_pipe = pipe

        chosen_model = best_name

    # Unsupervised path (if no label)
    if best_pipe is None:
        mode = "unsupervised"
        # KMeans cannot form more clusters than there are samples.
        n_clusters = min(5, len(X_frame))
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=128, n_init="auto")
        best_pipe = Pipeline([("features", build_features()), ("cluster", kmeans)])
        best_pipe.fit(X_frame)

        clusters = best_pipe.predict(X_frame)
        unique, counts = np.unique(clusters, return_counts=True)
        supervised_results = {"clusters": {int(k): int(v) for k, v in zip(unique, counts)}}
        chosen_model = f"MiniBatchKMeans(k={n_clusters})"

    # --------------------------
    # Exports
    # --------------------------
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    pkl_path  = os.path.join(args.out, "polyglot_model.pkl")
    h5_path   = os.path.join(args.out, "polyglot_model.h5")
    yml_path  = os.path.join(args.out, "polyglot_config.yaml")
    jsn_path  = os.path.join(args.out, "polyglot_report.json")

    joblib.dump(best_pipe, pkl_path)

    cfg = {
        "run": {
            "timestamp": timestamp,
            "mode": mode,
            "audio_dir": args.audio,
            "text_dir": args.text,
            "csv_path": args.csv,
            "out_dir": args.out
        },
        "features": {
            "audio": {"sr": 16000, "n_mfcc": 13},
            "text":  {"tfidf": {"analyzer": "char", "ngram_range": [2,4], "max_features": 800},
                      "stats": ["char_len", "word_count", "avg_word_len"]}
        },
        "supervised": {
            "label_column": (label_used if mode == "supervised" else None),
            "label_mapping": (label_mapping if mode == "supervised" else None),
            "candidates": (["logreg", "rf"] if mode == "supervised" else None),
            "chosen_model": chosen_model if mode == "supervised" else None
        },
        # Records the cluster count actually used (5 unless fewer rows exist).
        "unsupervised": {"algorithm": "MiniBatchKMeans", "n_clusters": n_clusters}
    }
    with open(yml_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(cfg, f, sort_keys=False, allow_unicode=True)

    # Dataset summary (audio durations)
    dur_stats = duration_stats(df["audio_path"].tolist())

    report = {
        "timestamp": timestamp,
        "mode": mode,
        "rows": int(df.shape[0]),
        "columns": int(df.shape[1]),
        "durations_sec": dur_stats,
        "results": supervised_results,
        "chosen_model": chosen_model
    }
    with open(jsn_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)

    # H5 (embed pickle)
    h5_meta = {"artifact_type": "pickle_container", "mode": mode, "chosen_model": chosen_model, "timestamp": timestamp}
    save_h5_from_pickle(pkl_path, h5_path, h5_meta)

    print("\n=== PolyGlotAI Artifacts Ready ===")
    print("PKL  ->", pkl_path)
    print("H5   ->", h5_path)
    print("YAML ->", yml_path)
    print("JSON ->", jsn_path)
    print(f"Rows used: {df.shape[0]} | Mode: {mode}")


def duration_stats(paths):
    """Summarise clip lengths (in seconds) for the given audio files.

    Each path is loaded at 16 kHz; files that fail to load or are empty are
    skipped. Returns None when no file yields a duration, otherwise a dict
    with the count and the mean, median, min and max duration.
    """
    durations = []
    for wav in paths:
        try:
            samples, rate = robust_load_wav(wav, sr=16000)
            if samples is not None and rate is not None and len(samples) != 0:
                durations.append(len(samples) / float(rate))
        except Exception:
            continue

    if len(durations) == 0:
        return None

    series = pd.Series(durations)
    summary = {"count": int(series.size)}
    for stat in ("mean", "median", "min", "max"):
        summary[f"{stat}_sec"] = float(getattr(series, stat)())
    return summary


# Run the training/export pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



=== PolyGlotAI Artifacts Ready ===
PKL  -> C:\Users\sagni\Downloads\Poly Glot AI\polyglot_model.pkl
H5   -> C:\Users\sagni\Downloads\Poly Glot AI\polyglot_model.h5
YAML -> C:\Users\sagni\Downloads\Poly Glot AI\polyglot_config.yaml
JSON -> C:\Users\sagni\Downloads\Poly Glot AI\polyglot_report.json
Rows used: 5 | Mode: unsupervised
