In [47]:
# !pip install ipywidgets  # one-time installation
# !jupyter nbextension enable --py widgetsnbextension
# !pip install scikit-learn


In [48]:
import json
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics.pairwise import cosine_similarity
from ipywidgets import Widget, Text, Button, VBox, Label


In [49]:
# Locate the corpus file relative to wherever the kernel was started.
candidate_paths = [
    Path("corpus_json/corpus.json"),
    Path("../corpus_json/corpus.json"),
    Path("../../corpus_json/corpus.json"),
]
# Take the first existing candidate; fail loudly instead of leaving
# `corpus_path` unbound (fresh kernel) or stale (re-run in a dirty kernel).
corpus_path = next((p for p in candidate_paths if p.exists()), None)
if corpus_path is None:
    tried = ", ".join(str(p) for p in candidate_paths)
    raise FileNotFoundError(f"corpus.json not found; tried: {tried}")

with corpus_path.open("r", encoding="utf-8") as f:
    corpus = json.load(f)

# Parallel lists: position i in each list refers to the same corpus entry.
doc_ids = [doc["id"] for doc in corpus]
doc_titles = [doc.get("title", "") for doc in corpus]
doc_texts = [doc["text"] for doc in corpus]

# Train document-level SVM classifier
doc_vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),   # unigrams + bigrams
    max_df=0.9,
    min_df=1
)

# Each document id is used as its own class label.
# NOTE(review): assuming ids are unique, every class has a single training
# example, so the calibrated probabilities are likely to be near-uniform
# (the recorded sample query scores ~0.05 for its top hits with 20 documents)
# -- confirm this is the intended ranking signal.
doc_model = make_pipeline(
    doc_vectorizer,
    SVC(C=1.0, kernel="sigmoid", probability=True, random_state=42)
)
doc_model.fit(doc_texts, doc_ids)

print(f"Loaded {len(corpus)} documents from {corpus_path}")
print(f"Document-level SVM model trained on {len(doc_texts)} documents")


Loaded 20 documents from ../../corpus_json/corpus.json
Document-level SVM model trained on 20 documents


In [50]:
# Inspect the vocabulary learned by the document-level vectorizer
feature_names = doc_vectorizer.get_feature_names_out()
print("Number of features:", len(feature_names))
print("Sample features:", feature_names[:10])

# Dense TF-IDF row for the first document and the positions of its non-zero terms
first_doc_vector = doc_vectorizer.transform([doc_texts[0]]).toarray()[0]
nonzero_idx = first_doc_vector.nonzero()[0]

# One (term, weight) row per non-zero feature, heaviest terms first
term_weight_rows = [(feature_names[i], first_doc_vector[i]) for i in nonzero_idx]
df = pd.DataFrame(term_weight_rows, columns=["term", "tfidf"])
df = df.sort_values("tfidf", ascending=False)

print("\nTop 10 TF-IDF terms in Document 0:")
print(df.head(10))


Number of features: 50406
Sample features: ['00' '00 00' '00 15' '00 20' '00 40' '00 75' '00 95' '00 99' '00 aigc'
 '00 cm']

Top 10 TF-IDF terms in Document 0:
                   term     tfidf
1406              crime  0.671330
3568           mobility  0.209394
986              cities  0.138597
1484             crimes  0.134266
2412        forecasting  0.129443
3175                 lb  0.125604
3756                 nn  0.106497
1437  crime forecasting  0.090954
863                cell  0.088116
4189                poi  0.086623


In [51]:
# Document-level retrieval function using SVM
def retrieve_svm_docs(query: str, k: int = 5):
    """
    Return the top-k documents ranked by the document-level SVM.

    Parameters
    ----------
    query : str
        Free-text question. A blank or whitespace-only query yields no results.
    k : int
        Maximum number of documents to return. Values <= 0 yield no results
        (a negative k previously sliced "all but the last |k|" classes).

    Returns
    -------
    list of dict
        One dict per hit, best first, with keys "rank" (1-based), "score"
        (class probability from ``doc_model.predict_proba``), "id", "title"
        and "text".
    """
    if k <= 0 or not query.strip():
        return []

    # Probability of every document class for this single query
    proba = doc_model.predict_proba([query])[0]
    classes = doc_model.classes_

    # Class indices ordered by descending probability, truncated to k
    topk_idx = np.argsort(proba)[::-1][:k]

    results = []
    for rank, idx in enumerate(topk_idx, start=1):
        # classes_ is in the model's own order, so map the label back to its
        # position in the parallel doc_* lists
        doc_idx = doc_ids.index(classes[idx])
        results.append({
            "rank": rank,
            "score": float(proba[idx]),
            "id": doc_ids[doc_idx],
            "title": doc_titles[doc_idx],
            "text": doc_texts[doc_idx]
        })
    return results


In [52]:
# Default question so the cells below work under "Restart & Run All" without
# anyone having to click Submit first; the button still overrides it.
DEFAULT_QUERY = "How can we detect sarcasm via deep learning?"
query = DEFAULT_QUERY

label = Label("Enter your question:")
txt = Text(value=query, placeholder="Type your question here...")
btn = Button(description="Submit")

def on_click(b):
    """Copy the current text box content into the global `query`."""
    global query
    query = txt.value


btn.on_click(on_click)

VBox([label, txt, btn])


VBox(children=(Label(value='Enter your question:'), Text(value='', placeholder='Type your question here...'), …

In [53]:
# Echo the current value of the global `query` (assigned by the Submit button's on_click handler)
print(query)


How can we detect sarcasm via deep learning?


In [54]:
# Rank whole papers for the current question and print a short preview of each
results = retrieve_svm_docs(query, k=3)

rule = "-" * 80
print(f"Question: {query}", rule, sep="\n")

for r in results:
    header = f"[{r['rank']}] {r['id']}  (SVM probability={r['score']:.4f})"
    preview = r["text"][:500]  # first 500 characters only
    print(header)
    print(f"Paper title: {r['title']}")
    print(rule)
    print(f"{preview} ...")
    print()




Question: How can we detect sarcasm via deep learning?
--------------------------------------------------------------------------------
[1] 2510.05163v1  (SVM probability=0.0507)
Paper title: Deep Learning-Based Multi-Factor Authentication: A Survey of Biometric and Smart Card Integration Approaches
--------------------------------------------------------------------------------
Deep Learning-Based Multi-Factor Authentication: A Survey of Biometric and Smart Card Integration Approaches. In the era of pervasive cyber threats and exponential growth in digital services the inadequacy of single-factor authentication has become increasingly evident. Multi-Factor Authentication MFA which combines knowledge-based factors passwords PINs possessionbased factors smart cards tokens and inherence-based factors biometric traits has emerged as a robust defense mechanism. Recent break ...

[2] 2510.05736v1  (SVM probability=0.0507)
Paper title: Convolution and Graph-based Deep Learning Approaches for

In [55]:
def chunk_text(text, chunk_size=220, overlap=40):
    """
    Split text into overlapping chunks of whitespace-separated words.

    Parameters
    ----------
    text : str
        Text to split; words are obtained with ``str.split()``.
    chunk_size : int
        Target words per chunk (the last chunk may be shorter). Must be >= 1.
    overlap : int
        How many words consecutive chunks share. Must be smaller than
        ``chunk_size``; a negative value leaves a gap between chunks.

    Returns
    -------
    list of str
        Chunks in document order, each joined with single spaces.
        Empty or whitespace-only text gives an empty list.

    Raises
    ------
    ValueError
        If ``chunk_size < 1`` or ``overlap >= chunk_size``. With such values
        the window start never advances and the loop would not terminate.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    if not words:
        return []

    chunks = []
    start = 0
    n = len(words)

    while start < n:
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))

        # The window already reached the end of the text: do not emit a
        # trailing chunk that would consist only of overlap words.
        if end >= n:
            break

        # Step back by `overlap` so the next chunk repeats the tail of this one
        start = end - overlap

    return chunks


In [56]:
# Chunking parameters, defined once so the word offsets recorded below cannot
# drift out of sync with the values handed to chunk_text.
CHUNK_SIZE = 220
CHUNK_OVERLAP = 40
CHUNK_STRIDE = CHUNK_SIZE - CHUNK_OVERLAP  # distance between consecutive chunk starts

passage_texts = []   # chunk strings, one entry per passage
passage_meta = []    # metadata dicts, parallel to passage_texts

for doc in corpus:
    doc_id = doc["id"]
    title = doc.get("title", "")
    text = doc["text"]

    chunks = chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)

    for i, chunk in enumerate(chunks):
        # chunk_text starts chunk i at word i * (chunk_size - overlap)
        start_word = i * CHUNK_STRIDE
        end_word = start_word + len(chunk.split())
        passage_texts.append(chunk)
        passage_meta.append({
            "doc_id": doc_id,
            "title": title,
            "chunk_id": f"{doc_id}_chunk_{i}",
            "start_word": start_word,
            "end_word": end_word,
        })


In [57]:
# Sanity check: corpus size versus number of generated chunks
for caption, count in (
    ("Number of documents: ", len(corpus)),
    ("Number of passages:  ", len(passage_texts)),
):
    print(f"{caption}{count}")


Number of documents: 20
Number of passages:  491


In [58]:
# Passage-level SVM: every chunk is a training example labelled with the id
# of the paper it was cut from.
passage_labels = [entry["doc_id"] for entry in passage_meta]

passage_vectorizer = TfidfVectorizer(
    stop_words="english", ngram_range=(1, 2), max_df=0.9, min_df=1
)
passage_classifier = SVC(C=1.0, kernel="linear", probability=True, random_state=42)

# TF-IDF features feed straight into the SVM
passage_model = make_pipeline(passage_vectorizer, passage_classifier)
passage_model.fit(passage_texts, passage_labels)

print(f"Passage-level SVM model trained on {len(passage_texts)} passages")


Passage-level SVM model trained on 491 passages


In [59]:
def retrieve_svm_chunks(query: str, k: int = 3):
    """
    Retrieve one best-matching passage (chunk) from each of the top-k documents.

    The passage-level SVM ranks documents by class probability; within each
    selected document the passage with the highest TF-IDF cosine similarity
    to the query is returned.

    Parameters
    ----------
    query : str
        Free-text question. A blank or whitespace-only query yields no results.
    k : int
        Maximum number of documents (hence passages) to return. Values <= 0
        yield no results (a negative k previously sliced "all but the last
        |k|" classes).

    Returns
    -------
    list of dict
        One dict per hit, best document first, with keys "rank", "score",
        "text", "doc_id", "title", "chunk_id", "start_word" and "end_word".
        "score" is the document-class probability, not the passage similarity.
    """
    if k <= 0 or not query.strip():
        return []

    # Probability of every document class for this single query
    proba = passage_model.predict_proba([query])[0]
    classes = passage_model.classes_

    # Class indices ordered by descending probability, truncated to k
    topk_idx = np.argsort(proba)[::-1][:k]

    query_vec = passage_vectorizer.transform([query])
    results = []

    for rank, idx in enumerate(topk_idx, start=1):
        doc_id = classes[idx]

        # Positions of all passages that were cut from this document
        doc_passage_indices = [i for i, label in enumerate(passage_labels) if label == doc_id]
        if not doc_passage_indices:
            continue

        # Pick the passage closest to the query in TF-IDF space
        doc_passage_vectors = passage_vectorizer.transform(
            [passage_texts[i] for i in doc_passage_indices]
        )
        sims = cosine_similarity(query_vec, doc_passage_vectors)[0]
        best_passage_idx = doc_passage_indices[int(np.argmax(sims))]

        meta = passage_meta[best_passage_idx]
        results.append({
            "rank": rank,
            # idx already addresses this class in proba; no label lookup needed
            "score": float(proba[idx]),
            "text": passage_texts[best_passage_idx],
            "doc_id": meta["doc_id"],
            "title": meta["title"],
            "chunk_id": meta["chunk_id"],
            "start_word": meta["start_word"],
            "end_word": meta["end_word"],
        })

    return results


In [60]:
# Rank passages for the current question and print a short preview of each
results = retrieve_svm_chunks(query, k=3)

rule = "-" * 80
print(f"Question: {query}")
print(rule)

for r in results:
    report_lines = [
        f"[{r['rank']}] SVM probability={r['score']:.4f}",
        f"Paper: {r['doc_id']} — {r['title']}",
        f"Chunk: {r['chunk_id']} (words {r['start_word']}–{r['end_word']})",
        rule,
        f"{r['text'][:500]} ...",  # first 500 characters only
        "",                         # blank separator line between hits
    ]
    print("\n".join(report_lines))


Question: How can we detect sarcasm via deep learning?
--------------------------------------------------------------------------------
[1] SVM probability=0.4198
Paper: 2510.10729v1 — Sarcasm Detection Using Deep Convolutional Neural Networks: A Modular Deep Learning
Chunk: 2510.10729v1_chunk_2 (words 360–580)
--------------------------------------------------------------------------------
text using BERT for text embeddings and Dense Net for visual features. Sarcasm detection is vital for enhancing the interpretability of automated systems like sentiment analyzers chatbots and recommendation engines. While humans rely on context tone and expressions machines must infer sarcasm from textual patterns alone. This paper explores a conceptual solution using DCNNs combined with contextual embedding models to understand sarcasm s complex indicators such as irony sentiment contradiction  ...

[2] SVM probability=0.0682
Paper: 2509.20913v1 — Deep Learning for Crime Forecasting: The Role of Mo