In [None]:
from google.colab import drive
drive.mount('/content/drive')


# === INSTALL DEPENDENCIES ===
# This sets up everything for all 3 stages:
# 1) Whisper (Voice → Text)
# 2) LanguageTool (Text Correction)
# 3) scikit-learn, librosa (Age Group Detection)

!apt-get update -qq
!apt-get install -y -qq ffmpeg default-jre
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q soundfile librosa language-tool-python pandas scikit-learn numpy matplotlib

import importlib, sys, torch
print("✅ Installations complete.")
print("PyTorch CUDA available:", torch.cuda.is_available())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
✅ Installations complete.
PyTorch CUDA available: False


In [1]:
print("sabur")

sabur


In [None]:
import os, sys, librosa, soundfile as sf, numpy as np, pandas as pd, difflib
from datetime import datetime
import whisper
from language_tool_python import LanguageTool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import torch, random, warnings
warnings.filterwarnings("ignore")

# ========== CONFIG ==========
CONFIG = {
    "whisper_model": "small",
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "language_tool_lang": "en-US",
    "results_dir": "full_system_results",
}
os.makedirs(CONFIG["results_dir"], exist_ok=True)

print("✅ Device:", CONFIG["device"])

# ========== STEP 1: Load Whisper ==========
def load_whisper():
    print(f"Loading Whisper model: {CONFIG['whisper_model']} ...")
    model = whisper.load_model(CONFIG["whisper_model"], device=CONFIG["device"])
    print("Whisper loaded.")
    return model

# ========== STEP 2: Grammar Correction ==========
def correct_text(text):
    tool = LanguageTool(CONFIG["language_tool_lang"])
    corrected = tool.correct(text)
    return corrected

def show_diff(original, corrected):
    orig_tokens, corr_tokens = original.split(), corrected.split()
    seqm = difflib.SequenceMatcher(a=orig_tokens, b=corr_tokens)
    parts = []
    for tag, i1, i2, j1, j2 in seqm.get_opcodes():
        if tag == "equal":
            parts.append(" ".join(orig_tokens[i1:i2]))
        elif tag == "replace":
            parts.append(f"[~ {' '.join(orig_tokens[i1:i2])} → {' '.join(corr_tokens[j1:j2])} ~]")
        elif tag == "delete":
            parts.append(f"[- {' '.join(orig_tokens[i1:i2])} -]")
        elif tag == "insert":
            parts.append(f"[+ {' '.join(corr_tokens[j1:j2])} +]")
    return " ".join(parts)

# ========== STEP 3: Age Group Classifier ==========
# Simple MFCC feature extractor + synthetic demo model (for testing)
def extract_features(audio_path, sr=16000, n_mfcc=13):
    y, orig_sr = sf.read(audio_path)
    if y.ndim > 1:
        y = np.mean(y, axis=1)
    if orig_sr != sr:
        y = librosa.resample(y.astype(float), orig_sr, sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    feats = np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)])
    return feats

def build_age_classifier():
    """
    Build a simple mock age classifier using Common Voice–style distribution.
    In real research, train this on Common Voice age-labeled data.
    Here we simulate trained SVM weights.
    """
    random.seed(42)
    np.random.seed(42)
    # Synthetic feature generator (for demonstration)
    X = np.vstack([
        np.random.normal(0, 1, (100, 26)),  # child
        np.random.normal(1, 1, (100, 26)),  # teen
        np.random.normal(2, 1, (100, 26)),  # adult
        np.random.normal(3, 1, (100, 26)),  # elderly
    ])
    y = np.array(["child"]*100 + ["teenager"]*100 + ["adult"]*100 + ["elderly"]*100)
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)
    model = SVC(kernel="rbf", probability=True)
    model.fit(Xs, y_enc)
    return model, scaler, le

AGE_MODEL, AGE_SCALER, AGE_ENCODER = build_age_classifier()
print("✅ Age group classifier initialized (demo version).")

# ========== MASTER PIPELINE FUNCTION ==========
def process_audio(audio_path, model_whisper, model_age, scaler, le):
    print(f"\n🎧 Processing: {audio_path}")
    # 1) Transcription
    result = model_whisper.transcribe(audio_path, language="en")
    text = result["text"].strip()
    print("\n--- Recognized Text ---")
    print(text)

    # 2) Grammar correction
    corrected = correct_text(text)
    print("\n--- Corrected Text ---")
    print(corrected)
    print("\n--- Diff ---")
    print(show_diff(text, corrected))

    # 3) Age group prediction
    feat = extract_features(audio_path)
    feat_scaled = scaler.transform([feat])
    pred_idx = model_age.predict(feat_scaled)[0]
    pred_label = le.inverse_transform([pred_idx])[0]
    prob = model_age.predict_proba(feat_scaled)[0].max()
    print(f"\n--- Predicted Age Group ---")
    print(f"{pred_label.upper()} (confidence: {prob:.2f})")

    # Save summary
    out = {
        "audio_path": audio_path,
        "recognized_text": text,
        "corrected_text": corrected,
        "predicted_age": pred_label,
        "confidence": prob,
    }
    pd.DataFrame([out]).to_csv(
        os.path.join(CONFIG["results_dir"], f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"),
        index=False)
    print("\n✅ Result saved in:", CONFIG["results_dir"])


✅ Device: cpu
✅ Age group classifier initialized (demo version).


In [None]:
# === RUN THE FULL SYSTEM ===
# 1️⃣ Upload your audio first (use the Files panel on the left of Colab).
# 2️⃣ Then replace "your_audio.wav" below with your file name.
# 3️⃣ Run this cell — it will print recognized text, corrected text, and age group.

model_whisper = load_whisper()

# Example: replace this with your file name after uploading to Colab
example_audio = "/content/your_audio.wav"  # <-- change this line

if not os.path.exists(example_audio):
    print("⚠️ File not found. Please upload an audio file using the left sidebar (📁).")
else:
    process_audio(example_audio, model_whisper, AGE_MODEL, AGE_SCALER, AGE_ENCODER)


Loading Whisper model: small ...
Whisper loaded.
⚠️ File not found. Please upload an audio file using the left sidebar (📁).


In [None]:
# This cell is no longer needed as the audio file collection is done in cell Ko6effT96rG6
# import os

# # Path to your folder in Google Drive
# AUDIO_FOLDER = "/content/drive/MyDrive/Thesis Adio"

# # Collect all WAV files
# audio_files = [os.path.join(AUDIO_FOLDER, f)
#                for f in os.listdir(AUDIO_FOLDER)
#                if f.lower().endswith('.wav')]

# print(f"✅ Found {len(audio_files)} audio files.")
# for f in audio_files[:5]:
#     print("•", f)

In [None]:
!apt-get update -qq
!apt-get install -y openjdk-17-jre-headless
!update-alternatives --set java /usr/lib/jvm/java-17-openjdk-amd64/bin/java
!java -version


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Suggested packages:
  libnss-mdns fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-microhei
  | fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  openjdk-17-jre-headless
0 upgraded, 1 newly installed, 0 to remove and 38 not upgraded.
Need to get 48.3 MB of archives.
After this operation, 193 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 openjdk-17-jre-headless amd64 17.0.16+8~us1-0ubuntu1~22.04.1 [48.3 MB]
Fetched 48.3 MB in 2s (26.1 MB/s)
Selecting previously unselected package openjdk-17-jre-headless:amd64.
(Reading database ... 126870 files and directories currently installed.)
Preparing to unpack .../openjdk-

In [None]:
!pip install -q --upgrade language-tool-python


In [None]:
import os, sys, librosa, soundfile as sf, numpy as np, pandas as pd, difflib
from datetime import datetime
import whisper
from language_tool_python import LanguageTool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import torch, random, warnings
warnings.filterwarnings("ignore")
from transformers import pipeline

# ========== CONFIG ==========
CONFIG = {
    "whisper_model": "small",
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "language_tool_lang": "en-US",
    "results_dir": "full_system_results",
}
os.makedirs(CONFIG["results_dir"], exist_ok=True)

print("✅ Device:", CONFIG["device"])

# ========== STEP 1: Load Whisper ==========
def load_whisper():
    print(f"Loading Whisper model: {CONFIG['whisper_model']} ...")
    model = whisper.load_model(CONFIG["whisper_model"], device=CONFIG["device"])
    print("Whisper loaded.")
    return model

# ========== STEP 2: Grammar Correction ==========
def correct_text(text):
    tool = LanguageTool(CONFIG["language_tool_lang"])
    corrected = tool.correct(text)
    return corrected

def show_diff(original, corrected):
    orig_tokens, corr_tokens = original.split(), corrected.split()
    seqm = difflib.SequenceMatcher(a=orig_tokens, b=corr_tokens)
    parts = []
    for tag, i1, i2, j1, j2 in seqm.get_opcodes():
        if tag == "equal":
            parts.append(" ".join(orig_tokens[i1:i2]))
        elif tag == "replace":
            parts.append(f"[~ {' '.join(orig_tokens[i1:i2])} → {' '.join(corr_tokens[j1:j2])} ~]")
        elif tag == "delete":
            parts.append(f"[- {' '.join(orig_tokens[i1:i2])} -]")
        elif tag == f"[+ {' '.join(corr_tokens[j1:j2])} +]":
            parts.append(f"[+ {' '.join(corr_tokens[j1:j2])} +]")
    return " ".join(parts)

# ========== STEP 3: Age Group Classifier ==========
# Simple MFCC feature extractor + synthetic demo model (for testing)
def extract_features(audio_path, sr=16000, n_mfcc=13):
    y, orig_sr = sf.read(audio_path)
    if y.ndim > 1:
        y = np.mean(y, axis=1)
    if orig_sr != sr:
        y = librosa.resample(y.astype(float), orig_sr, sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    feats = np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)])
    return feats

def build_age_classifier():
    """
    Build a simple mock age classifier using Common Voice–style distribution.
    In real research, train this on Common Voice age-labeled data.
    Here we simulate trained SVM weights.
    """
    random.seed(42)
    np.random.seed(42)
    # Synthetic feature generator (for demonstration)
    X = np.vstack([
        np.random.normal(0, 1, (100, 26)),  # child
        np.random.normal(1, 1, (100, 26)),  # teen
        np.random.normal(2, 1, (100, 26)),  # adult
        np.random.normal(3, 1, (100, 26)),  # elderly
    ])
    y = np.array(["child"]*100 + ["teenager"]*100 + ["adult"]*100 + ["elderly"]*100)
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)
    model = SVC(kernel="rbf", probability=True)
    model.fit(Xs, y_enc)
    return model, scaler, le

AGE_MODEL, AGE_SCALER, AGE_ENCODER = build_age_classifier()
print("✅ Age group classifier initialized (demo version).")

# Load a small pretrained model (demo for threat / non-threat classification)
threat_classifier = pipeline(
    "text-classification",
    model="mrm8488/bert-tiny-finetuned-sms-spam-detection",
    device=0 if torch.cuda.is_available() else -1
)

print("✅ Threat classifier loaded and ready.")

# =============================
# Threat Analysis Function
# =============================
def analyze_threat(text):
    """
    Basic threat analysis function.
    Returns a simple label and confidence score.
    """
    result = threat_classifier(text[:512])   # limit to 512 tokens
    label = result[0]['label']
    score = result[0]['score']

    if label.lower() == "spam":
        return "Possible Threat", round(score, 2)
    else:
        return "Non-threat", round(score, 2)


# ========== MASTER PIPELINE FUNCTION ==========
def process_audio(audio_path, model_whisper, model_age, scaler, le):
    print(f"\n🎧 Processing: {audio_path}")
    # 1) Transcription
    result = model_whisper.transcribe(audio_path, language="en")
    text = result["text"].strip()
    print("\n--- Recognized Text ---")
    print(text)

    # 2) Grammar correction
    corrected = correct_text(text)
    print("\n--- Corrected Text ---")
    print(corrected)
    print("\n--- Diff ---")
    print(show_diff(text, corrected))

    # 3) Age group prediction
    feat = extract_features(audio_path)
    feat_scaled = scaler.transform([feat])
    pred_idx = model_age.predict(feat_scaled)[0]
    pred_label = le.inverse_transform([pred_idx])[0]
    prob = model_age.predict_proba(feat_scaled)[0].max()
    print(f"\n--- Predicted Age Group ---")
    print(f"{pred_label.upper()} (confidence: {prob:.2f})")

    # 4) Threat Analysis
    threat_label, threat_conf = analyze_threat(corrected)
    print(f"\n--- Threat Analysis ---")
    print(f"{threat_label} (confidence: {threat_conf})")


    # Save summary
    out = {
        "audio_path": audio_path,
        "recognized_text": text,
        "corrected_text": corrected,
        "predicted_age": pred_label,
        "confidence": prob,
        "threat_label": threat_label,
        "threat_confidence": threat_conf,
    }
    pd.DataFrame([out]).to_csv(
        os.path.join(CONFIG["results_dir"], f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"),
        index=False)
    print("\n✅ Result saved in:", CONFIG["results_dir"])

✅ Device: cpu
✅ Age group classifier initialized (demo version).


Device set to use cpu


✅ Threat classifier loaded and ready.


In [None]:
# This cell is no longer needed as the model loading is done in cell Ko6effT96rG6
# import whisper
# model_whisper = whisper.load_model("small")
# print("✅ Whisper model loaded.")

In [None]:
!pip install -q transformers torch


In [None]:
from transformers import pipeline
import torch

# Load a small pretrained model (demo for threat / non-threat classification)
threat_classifier = pipeline(
    "text-classification",
    model="mrm8488/bert-tiny-finetuned-sms-spam-detection",
    device=0 if torch.cuda.is_available() else -1
)

print("✅ Threat classifier loaded and ready.")


Device set to use cpu


✅ Threat classifier loaded and ready.


In [None]:
import os

# Path to your folder in Google Drive
AUDIO_FOLDER = "/content/drive/MyDrive/Thesis Adio"

# Collect all WAV files
audio_files = [os.path.join(AUDIO_FOLDER, f)
               for f in os.listdir(AUDIO_FOLDER)
               if f.lower().endswith('.wav')]

print(f"✅ Found {len(audio_files)} audio files.")
for f in audio_files[:5]:
    print("•", f)

model_whisper = load_whisper()

for path in audio_files:
    try:
        process_audio(path, model_whisper, AGE_MODEL, AGE_SCALER, AGE_ENCODER)
    except Exception as e:
        print(f"⚠️ Error with {path}: {e}")

✅ Found 100 audio files.
• /content/drive/MyDrive/Thesis Adio/000060077.WAV
• /content/drive/MyDrive/Thesis Adio/000010089.WAV
• /content/drive/MyDrive/Thesis Adio/000060015.WAV
• /content/drive/MyDrive/Thesis Adio/000030116.WAV
• /content/drive/MyDrive/Thesis Adio/000240151.WAV
Loading Whisper model: small ...
Whisper loaded.

🎧 Processing: /content/drive/MyDrive/Thesis Adio/000060077.WAV

--- Recognized Text ---
and age a little dog.


Downloading LanguageTool latest: 100%|██████████| 254M/254M [00:13<00:00, 18.8MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpbm65xrss.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.



--- Corrected Text ---
And age a little dog.

--- Diff ---
[~ and → And ~] age a little dog.

--- Predicted Age Group ---
ADULT (confidence: 0.28)

--- Threat Analysis ---
Non-threat (confidence: 0.94)

✅ Result saved in: full_system_results

🎧 Processing: /content/drive/MyDrive/Thesis Adio/000010089.WAV

--- Recognized Text ---
Mandy has a big arm.


Downloading LanguageTool latest: 100%|██████████| 254M/254M [00:09<00:00, 27.6MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpbmkcxvjj.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.
Downloading LanguageTool latest: 100%|██████████| 254M/254M [00:09<00:00, 25.9MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp3z194fm7.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.



--- Corrected Text ---
Mandy has a big arm.

--- Diff ---
Mandy has a big arm.

--- Predicted Age Group ---
ADULT (confidence: 0.28)

--- Threat Analysis ---
Non-threat (confidence: 0.94)

✅ Result saved in: full_system_results

🎧 Processing: /content/drive/MyDrive/Thesis Adio/000060015.WAV

--- Recognized Text ---
Jamie is going to see Hen.


Downloading LanguageTool latest: 100%|██████████| 254M/254M [00:14<00:00, 17.0MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp8n_6lkb4.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.
Downloading LanguageTool latest: 100%|██████████| 254M/254M [00:09<00:00, 27.4MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp_n5ep6ln.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.



--- Corrected Text ---
Jamie is going to see Hen.

--- Diff ---
Jamie is going to see Hen.

--- Predicted Age Group ---
ADULT (confidence: 0.28)

--- Threat Analysis ---
Non-threat (confidence: 0.94)

✅ Result saved in: full_system_results

🎧 Processing: /content/drive/MyDrive/Thesis Adio/000030116.WAV

--- Recognized Text ---
So Billy went into the pet shop.


Downloading LanguageTool latest: 100%|██████████| 254M/254M [00:08<00:00, 30.4MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp1f0nfvyn.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.
Downloading LanguageTool latest: 100%|██████████| 254M/254M [00:09<00:00, 26.0MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp5mw3cyri.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.



--- Corrected Text ---
So Billy went into the pet shop.

--- Diff ---
So Billy went into the pet shop.

--- Predicted Age Group ---
ADULT (confidence: 0.28)

--- Threat Analysis ---
Non-threat (confidence: 0.94)

✅ Result saved in: full_system_results

🎧 Processing: /content/drive/MyDrive/Thesis Adio/000240151.WAV

--- Recognized Text ---
Successful first day for college.


Downloading LanguageTool latest: 100%|██████████| 254M/254M [00:12<00:00, 21.0MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpbiaqki9o.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.
Downloading LanguageTool latest: 100%|██████████| 254M/254M [00:09<00:00, 26.3MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp40pzm1gk.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.



--- Corrected Text ---
Successful first day for college.

--- Diff ---
Successful first day for college.

--- Predicted Age Group ---
ADULT (confidence: 0.28)

--- Threat Analysis ---
Non-threat (confidence: 0.93)

✅ Result saved in: full_system_results

🎧 Processing: /content/drive/MyDrive/Thesis Adio/000010106.WAV

--- Recognized Text ---
What about the bus?


Downloading LanguageTool latest: 100%|██████████| 254M/254M [00:11<00:00, 22.0MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp4z73of5c.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.
Downloading LanguageTool latest: 100%|██████████| 254M/254M [00:08<00:00, 29.2MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpn7xvp43e.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.


KeyboardInterrupt: 

In [None]:
import pandas as pd, glob

# Path where the results were saved
all_csvs = glob.glob("/content/full_system_results/*.csv")

# Merge them into one dataset
df = pd.concat([pd.read_csv(c) for c in all_csvs], ignore_index=True)

print("✅ Combined dataset created. Total rows:", len(df))
display(df)

✅ Combined dataset created. Total rows: 5


Unnamed: 0,audio_path,recognized_text,corrected_text,predicted_age,confidence,threat_label,threat_confidence
0,/content/drive/MyDrive/Thesis Adio/000010089.WAV,Mandy has a big arm.,Mandy has a big arm.,adult,0.283133,Non-threat,0.94
1,/content/drive/MyDrive/Thesis Adio/000030116.WAV,So Billy went into the pet shop.,So Billy went into the pet shop.,adult,0.283133,Non-threat,0.94
2,/content/drive/MyDrive/Thesis Adio/000240151.WAV,Successful first day for college.,Successful first day for college.,adult,0.283133,Non-threat,0.93
3,/content/drive/MyDrive/Thesis Adio/000060077.WAV,and age a little dog.,And age a little dog.,adult,0.283133,Non-threat,0.94
4,/content/drive/MyDrive/Thesis Adio/000060015.WAV,Jamie is going to see Hen.,Jamie is going to see Hen.,adult,0.283133,Non-threat,0.94


In [None]:
import pandas as pd, glob

# Step 7: Combine all result CSVs
all_csvs = glob.glob("/content/full_system_results/*.csv")

# Merge all individual CSVs into one dataset
df = pd.concat([pd.read_csv(c) for c in all_csvs], ignore_index=True)

# Save the combined dataset into your Google Drive
output_path = "/content/drive/MyDrive/Thesis Adio/Final_Results.csv"
df.to_csv(output_path, index=False)

print(f"✅ All results saved to: {output_path}")
print("📊 Total rows in dataset:", len(df))
df.head()


✅ All results saved to: /content/drive/MyDrive/Thesis Adio/Final_Results.csv
📊 Total rows in dataset: 5


Unnamed: 0,audio_path,recognized_text,corrected_text,predicted_age,confidence,threat_label,threat_confidence
0,/content/drive/MyDrive/Thesis Adio/000010089.WAV,Mandy has a big arm.,Mandy has a big arm.,adult,0.283133,Non-threat,0.94
1,/content/drive/MyDrive/Thesis Adio/000030116.WAV,So Billy went into the pet shop.,So Billy went into the pet shop.,adult,0.283133,Non-threat,0.94
2,/content/drive/MyDrive/Thesis Adio/000240151.WAV,Successful first day for college.,Successful first day for college.,adult,0.283133,Non-threat,0.93
3,/content/drive/MyDrive/Thesis Adio/000060077.WAV,and age a little dog.,And age a little dog.,adult,0.283133,Non-threat,0.94
4,/content/drive/MyDrive/Thesis Adio/000060015.WAV,Jamie is going to see Hen.,Jamie is going to see Hen.,adult,0.283133,Non-threat,0.94
