In [96]:
# !pip install ipywidgets  #1 time installation
# !jupyter nbextension enable --py widgetsnbextension
# !pip install scikit-learn


In [97]:
import json
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from ipywidgets import Widget, Text, Button, VBox, Label
import warnings
warnings.filterwarnings('ignore')


In [98]:
# Locations probed, in order, for the corpus file (presumably so the notebook
# works from several working directories — confirm against the repo layout).
candidate_paths = [
    Path("corpus_json/corpus.json"),
    Path("../corpus_json/corpus.json"),
    Path("../../corpus_json/corpus.json"),
]

# Pick the first existing candidate. Failing loudly here avoids a NameError
# further down, and avoids silently reusing a stale `corpus_path` left over
# from an earlier run of this cell.
corpus_path = next((p for p in candidate_paths if p.exists()), None)
if corpus_path is None:
    tried = ", ".join(str(p) for p in candidate_paths)
    raise FileNotFoundError(f"corpus.json not found; looked in: {tried}")

with corpus_path.open("r", encoding="utf-8") as f:
    corpus = json.load(f)

# Parallel lists, index-aligned with `corpus`. "id" and "text" are required
# keys of every record; "title" is optional and defaults to "".
doc_ids = [doc["id"] for doc in corpus]
doc_titles = [doc.get("title", "") for doc in corpus]
doc_texts = [doc["text"] for doc in corpus]

print(f"Loaded {len(corpus)} documents from {corpus_path}")

# ============================================================================
# DOCUMENT-LEVEL: plain TF-IDF + cosine similarity (no grid search involved)
# ============================================================================
# This stage only ranks whole documents against a query; nothing is classified,
# so one fixed vectorizer configuration is used.
banner = "=" * 80
print("\n" + banner)
print("DOCUMENT-LEVEL: Simple TF-IDF Vectorizer")
print(banner)

doc_tfidf_settings = {
    "stop_words": "english",
    "ngram_range": (1, 2),   # unigrams and bigrams
    "min_df": 2,             # drop terms found in fewer than 2 documents
    "max_df": 0.8,           # drop terms found in more than 80% of documents
    "max_features": 5000,    # cap on the vocabulary size
}
doc_vectorizer = TfidfVectorizer(**doc_tfidf_settings)

# Learn the vocabulary and vectorise every document in a single pass.
doc_vectors = doc_vectorizer.fit_transform(doc_texts)
print(f"Document-level TF-IDF vectorizer fitted on {len(doc_texts)} documents")
print(f"Number of features: {len(doc_vectorizer.get_feature_names_out()):,}")
print(banner)

Loaded 20 documents from ../../corpus_json/corpus.json

DOCUMENT-LEVEL: Simple TF-IDF Vectorizer
Document-level TF-IDF vectorizer fitted on 20 documents
Number of features: 5,000


In [99]:
# Document-level retrieval function using TF-IDF + Cosine Similarity
def retrieve_docs_tfidf(query: str, k: int = 5):
    """
    Rank the corpus documents against `query` with TF-IDF + cosine similarity.

    Parameters
    ----------
    query : free-text question; a blank/whitespace-only query yields [].
    k     : maximum number of documents to return; k <= 0 yields [].

    Returns
    -------
    A list of at most k dicts, best match first, each with the keys
    "rank" (1-based), "score" (cosine similarity as float), "id", "title"
    and "text".
    """
    # Guard clauses run before any model access. Without the k check a
    # negative k would slice "all but the last |k|" hits instead of none.
    if k <= 0 or not query.strip():
        return []

    # Project the query into the vocabulary fitted on the documents.
    query_vec = doc_vectorizer.transform([query])

    # One similarity per document, in corpus order.
    similarities = cosine_similarity(query_vec, doc_vectors)[0]

    # Indices of the k highest similarities, best first.
    topk_idx = np.argsort(similarities)[::-1][:k]

    results = []
    for rank, idx in enumerate(topk_idx, start=1):
        # Read the fields from `corpus` (row-aligned with `doc_vectors`)
        # rather than from the module-level `doc_texts` list: that name is
        # rebound by the train/validation/test split cell further down, which
        # would make this function return the wrong text afterwards.
        doc = corpus[idx]
        results.append({
            "rank": rank,
            "score": float(similarities[idx]),
            "id": doc["id"],
            "title": doc.get("title", ""),
            "text": doc["text"],
        })
    return results

In [100]:
# Summarise the fitted document-level vectorizer and peek at document 0.
feature_names = doc_vectorizer.get_feature_names_out()
rule = "=" * 80

print(rule)
print("DOCUMENT-LEVEL MODEL INFORMATION")
print(rule)
print(f"Number of features: {len(feature_names):,}")
# NOTE(review): the 50,000 baseline below is a hard-coded constant, not a
# vocabulary size measured on this corpus — confirm before quoting the ratio.
print(f"Feature reduction: From ~50,000+ to {len(feature_names):,} features")
print(f"Reduction ratio: {(1 - len(feature_names)/50000)*100:.1f}%")
print(f"\nSample features: {feature_names[:10]}")

# Dense TF-IDF row of the first document and the positions of its non-zero terms.
first_doc_vector = doc_vectorizer.transform([doc_texts[0]]).toarray()[0]
nonzero_idx = np.flatnonzero(first_doc_vector)

# Term/weight table, heaviest terms first.
df = pd.DataFrame(
    {
        "term": feature_names[nonzero_idx],
        "tfidf": first_doc_vector[nonzero_idx],
    }
)
df = df.sort_values("tfidf", ascending=False)

print("\nTop 10 TF-IDF terms in Document 0:")
print(df.head(10))

print("\n✓ Document-level uses simple TF-IDF + cosine similarity")
print(rule)


DOCUMENT-LEVEL MODEL INFORMATION
Number of features: 5,000
Feature reduction: From ~50,000+ to 5,000 features
Reduction ratio: 90.0%

Sample features: ['00' '00 00' '000' '000 images' '001' '006' '007' '009' '01' '010']

Top 10 TF-IDF terms in Document 0:
             term     tfidf
961      mobility  0.420206
645   forecasting  0.259764
1013           nn  0.213715
254          cell  0.176828
1206       recall  0.162278
277          city  0.160442
901          lstm  0.158118
1384      spatial  0.149042
375          conv  0.132621
1132    precision  0.125659

✓ Document-level uses simple TF-IDF + cosine similarity


In [101]:
# Default question. Defining `query` here means a fresh "Restart & Run All"
# has a value for the cells below even if nobody clicks Submit; previously
# `query` only came into existence inside the click handler.
query = "how can we detect sarcasm using deep learning?"

label = Label("Enter your question:")
# Pre-fill the box with the default so the widget shows what will be used.
txt = Text(value=query, placeholder="Type your question here...")
btn = Button(description="Submit")

def on_click(b):
    """Store the current text-box content in the module-level `query`."""
    global query
    query = txt.value


btn.on_click(on_click)

# Last expression of the cell: renders the form.
VBox([label, txt, btn])


VBox(children=(Label(value='Enter your question:'), Text(value='', placeholder='Type your question here...'), …

In [102]:
# Echo the question currently stored in the module-level `query`.
print(query)


how can we detect sarcasm using deep learning?


In [103]:
# Document-level retrieval: list the three best-matching papers for the question.
results = retrieve_docs_tfidf(query, k=3)
divider = "-" * 80

print(f"Question: {query}")
print(divider)

for r in results:
    # Header line, paper title, then the first 500 characters of the text.
    print(f"[{r['rank']}] {r['id']}  (similarity={r['score']:.4f})")
    print(f"Paper title: {r['title']}")
    print(divider)
    preview = r["text"][:500]
    print(preview, "...")
    print()




Question: how can we detect sarcasm using deep learning?
--------------------------------------------------------------------------------
[1] 2510.10729v1  (similarity=0.0342)
Paper title: Sarcasm Detection Using Deep Convolutional Neural Networks: A Modular Deep Learning
--------------------------------------------------------------------------------
Sarcasm is a nuanced and often misinterpreted form of communication especially in text where tone and body language are absent. This paper presents a proposed modular deep learning framework for sarcasm detection leveraging Deep Convolutional Neural Networks DCNNs and contextual models like BERT to analyze linguistic emotional and contextual cues. The system is conceptually designed to integrate sentiment analysis contextual embeddings linguistic feature extraction and emotion detection through  ...

[2] 2510.08770v1  (similarity=0.0255)
Paper title: Detecting spills using thermal imaging, pretrained deep learning models, and a robotic pl

In [104]:
def chunk_text(text, chunk_size=220, overlap=0):
    """
    Split text into consecutive word chunks, optionally overlapping.

    Words are whitespace-delimited (`str.split()`), and each chunk is the
    words re-joined with single spaces.

    chunk_size: target words per chunk (must be >= 1); the last chunk may
                be shorter
    overlap:    how many words consecutive chunks share
                (must satisfy 0 <= overlap < chunk_size)

    Returns a list of chunk strings; [] when the text contains no words.
    Raises ValueError for an invalid chunk_size/overlap combination.
    """
    # Without these checks the loop below never advances (overlap >= chunk_size
    # or chunk_size <= 0 -> infinite loop) or silently skips words (overlap < 0).
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    if not words:
        return []

    chunks = []
    start = 0
    n = len(words)

    while start < n:
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))

        # Stop once this chunk reached the end of the text, so no trailing
        # chunk made up only of already-covered overlap words is emitted.
        if end >= n:
            break

        # Step back by `overlap` words so the next chunk shares them.
        start = end - overlap

    return chunks


In [105]:
# Chunking configuration. The overlap is needed both to cut the chunks and to
# reconstruct their word offsets, so it is defined once here instead of being
# repeated as a literal in two places that could drift apart.
CHUNK_SIZE = 220     # words per chunk
CHUNK_OVERLAP = 40   # words shared by consecutive chunks

passage_texts = []   # chunk strings, across all documents
passage_meta = []    # one metadata dict per chunk, index-aligned with passage_texts

for doc in corpus:
    doc_id = doc["id"]
    title = doc.get("title", "")
    text = doc["text"]

    chunks = chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)

    for i, chunk in enumerate(chunks):
        # chunk_text advances by (chunk_size - overlap) words per chunk, so
        # the i-th chunk starts at i times that stride.
        start_word = i * (CHUNK_SIZE - CHUNK_OVERLAP)
        end_word = start_word + len(chunk.split())
        passage_texts.append(chunk)
        passage_meta.append({
            "doc_id": doc_id,
            "title": title,
            "chunk_id": f"{doc_id}_chunk_{i}",
            "start_word": start_word,
            "end_word": end_word,
        })


In [106]:
# Sanity check: corpus size and the number of chunks it was cut into.
print(
    f"Number of documents: {len(corpus)}",
    f"Number of passages:  {len(passage_texts)}",
    sep="\n",
)


Number of documents: 20
Number of passages:  491


In [107]:
# Train / validation / test split for the passage-level SVM classifier.
# Every passage is labelled with the id of the document it was cut from.
passage_labels = [meta["doc_id"] for meta in passage_meta]

X_train_pass, X_val_pass, X_test_pass = [], [], []
y_train_pass, y_val_pass, y_test_pass = [], [], []

# Group the passages per document in a single pass. A dict keeps first-seen
# order, so the documents are processed in the same order on every run
# (iterating a set of strings is not reproducible across kernel restarts).
passages_by_doc = {}
for label, passage in zip(passage_labels, passage_texts):
    passages_by_doc.setdefault(label, []).append(passage)

unique_docs = list(passages_by_doc)

for doc_id in unique_docs:
    # Deliberately NOT named `doc_texts`: that module-level list holds the
    # full document texts and is read by retrieve_docs_tfidf; rebinding it
    # here would corrupt document-level retrieval after this cell has run.
    doc_passages = passages_by_doc[doc_id]
    doc_labels = [doc_id] * len(doc_passages)

    # Too few passages to split three ways: use them for training only.
    if len(doc_passages) < 3:
        X_train_pass.extend(doc_passages)
        y_train_pass.extend(doc_labels)
        continue

    # Hold out the last 20% of the document's passages as the test set
    # (shuffle=False keeps the passages in reading order).
    texts_temp, texts_test, labels_temp, labels_test = train_test_split(
        doc_passages, doc_labels, test_size=0.20, shuffle=False
    )

    # Of the remaining 80%, the last quarter (nominally 20% of the document)
    # becomes the validation set.
    texts_train, texts_val, labels_train, labels_val = train_test_split(
        texts_temp, labels_temp, test_size=0.25, shuffle=False
    )

    X_train_pass.extend(texts_train)
    y_train_pass.extend(labels_train)

    X_val_pass.extend(texts_val)
    y_val_pass.extend(labels_val)

    X_test_pass.extend(texts_test)
    y_test_pass.extend(labels_test)

# Check the split. The percentages in the messages are the nominal targets;
# per-document rounding makes the realised shares differ slightly.
print("="*60)
print(f"Neuer Split mit sklearn (shuffle=False):")
print(f"Training:   {len(X_train_pass)} Chunks (60%)")
print(f"Validation: {len(X_val_pass)} Chunks (20%)")
print(f"Test:       {len(X_test_pass)} Chunks (20%)")
print("="*60)

Neuer Split mit sklearn (shuffle=False):
Training:   281 Chunks (60%)
Validation: 103 Chunks (20%)
Test:       107 Chunks (20%)


In [108]:
rule = "=" * 80
print(rule)
print("TRAINING PASSAGE-LEVEL MODEL WITH GRID SEARCH")
print(rule)

# Hyper-parameter grid for the passage-level pipeline. The key prefixes are
# the step names that make_pipeline derives from the estimator class names.
passage_param_grid = {
    'tfidfvectorizer__min_df': [3, 5, 7],
    'tfidfvectorizer__max_df': [0.7, 0.8, 0.9],
    'tfidfvectorizer__max_features': [50, 80, 100],
    'svc__C': [0.01, 0.1, 0.8],
    'svc__kernel': ['linear', 'rbf'],
}

# TF-IDF features feeding an SVM; probability=True is needed for predict_proba.
passage_pipeline = make_pipeline(
    TfidfVectorizer(stop_words="english", ngram_range=(1, 2)),
    SVC(probability=True, random_state=42),
)

# Exhaustive search over the grid, scored by accuracy with the default CV.
print("Performing grid search for passage-level model (this may take a while)...")
passage_grid_search = GridSearchCV(
    passage_pipeline,
    passage_param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
passage_grid_search.fit(X_train_pass, y_train_pass)

print(f"\nBest parameters: {passage_grid_search.best_params_}")
print(f"Best cross-validation score: {passage_grid_search.best_score_:.4f}")

# Evaluation model: the best pipeline, refitted on the training split only.
passage_model = passage_grid_search.best_estimator_
passage_vectorizer = passage_model.named_steps['tfidfvectorizer']

# --- Validation split ---------------------------------------------------------
val_pred_pass = passage_model.predict(X_val_pass)
val_acc_pass = accuracy_score(y_val_pass, val_pred_pass)
print(f"\nPassage-level validation accuracy: {val_acc_pass:.4f}")
print(f"Training set: {len(X_train_pass)} passages, Validation set: {len(X_val_pass)} passages")
print(f"Number of features: {len(passage_vectorizer.get_feature_names_out())}")
print("\nValidation classification report:")
print(classification_report(y_val_pass, val_pred_pass, zero_division=0))

# --- Held-out test split --------------------------------------------------------
test_pred_pass = passage_model.predict(X_test_pass)
test_acc_pass = accuracy_score(y_test_pass, test_pred_pass)
print(f"\nPassage-level TEST accuracy: {test_acc_pass:.4f}")
print(f"Test set: {len(X_test_pass)} passages")
print("Test set classification report:")
print(classification_report(y_test_pass, test_pred_pass, zero_division=0))

# --- Overfitting check: accuracy on the training data vs. validation -----------
train_pred_pass = passage_model.predict(X_train_pass)
train_acc_pass = accuracy_score(y_train_pass, train_pred_pass)
print(f"\nTraining accuracy: {train_acc_pass:.4f}")
print(f"Validation accuracy: {val_acc_pass:.4f}")
print(f"Difference (overfitting indicator): {train_acc_pass - val_acc_pass:.4f}")
if (train_acc_pass - val_acc_pass) > 0.1:
    print("WARNING: Large gap between train and validation accuracy suggests overfitting!")

# --- Final model: same hyper-parameters, fitted on every passage ----------------
print("\n" + rule)
print("TRAINING FINAL MODEL ON ALL DATA")
print(rule)
best_params = passage_grid_search.best_params_
# Build a fresh pipeline and apply the winning settings in one call; the
# best_params keys are exactly the nested `step__param` names of the grid.
final_passage_model = make_pipeline(
    TfidfVectorizer(stop_words="english", ngram_range=(1, 2)),
    SVC(probability=True, random_state=42),
).set_params(**best_params)
final_passage_model.fit(passage_texts, passage_labels)

# From here on `passage_model` / `passage_vectorizer` refer to the final model.
passage_model = final_passage_model
passage_vectorizer = passage_model.named_steps['tfidfvectorizer']
print(f"Passage-level SVM model trained on {len(passage_texts)} passages (all data)")
print(f"Final model features: {len(passage_vectorizer.get_feature_names_out())}")


TRAINING PASSAGE-LEVEL MODEL WITH GRID SEARCH
Performing grid search for passage-level model (this may take a while)...
Fitting 5 folds for each of 162 candidates, totalling 810 fits



Best parameters: {'svc__C': 0.8, 'svc__kernel': 'linear', 'tfidfvectorizer__max_df': 0.7, 'tfidfvectorizer__max_features': 100, 'tfidfvectorizer__min_df': 5}
Best cross-validation score: 0.8718

Passage-level validation accuracy: 0.7087
Training set: 281 passages, Validation set: 103 passages
Number of features: 100

Validation classification report:
              precision    recall  f1-score   support

2509.20913v1       0.80      0.80      0.80        10
2509.23158v1       0.31      0.80      0.44         5
2510.05163v1       1.00      1.00      1.00         3
2510.05736v1       0.00      0.00      0.00         2
2510.07320v1       1.00      0.50      0.67         4
2510.08116v1       1.00      0.83      0.91         6
2510.08411v1       1.00      0.75      0.86         4
2510.08662v1       0.50      1.00      0.67         4
2510.08770v1       0.50      0.25      0.33         4
2510.09187v1       0.60      0.75      0.67         4
2510.10729v1       0.00      0.00      0.00        

In [109]:
def retrieve_svm_chunks(query: str, k: int = 3):
    """
    Retrieve up to k passages (chunks) for `query` using the passage-level SVM.

    The SVM assigns the query a probability per document class; for each of
    the k most probable documents, the passage of that document with the
    highest TF-IDF cosine similarity to the query is returned.

    Parameters
    ----------
    query : free-text question; a blank/whitespace-only query yields [].
    k     : number of documents to consider; k <= 0 yields [].

    Returns
    -------
    A list of dicts ordered by decreasing SVM probability, each with the keys
    "rank", "score" (SVM probability of the document), "text", "doc_id",
    "title", "chunk_id", "start_word" and "end_word".
    """
    # Guard clauses run before any model access; a negative k would otherwise
    # slice "all but the last |k|" classes instead of none.
    if k <= 0 or not query.strip():
        return []

    # Class probabilities for the query, aligned with `classes`.
    proba = passage_model.predict_proba([query])[0]
    classes = passage_model.classes_

    # Positions (into proba/classes) of the k most probable documents.
    topk_idx = np.argsort(proba)[::-1][:k]

    query_vec = passage_vectorizer.transform([query])
    results = []

    for rank, class_idx in enumerate(topk_idx, start=1):
        doc_id = classes[class_idx]

        # All passages that were cut from this document.
        doc_passage_indices = [i for i, label in enumerate(passage_labels) if label == doc_id]
        if not doc_passage_indices:
            continue

        # Within the document, pick the passage closest to the query.
        doc_passage_vectors = passage_vectorizer.transform(
            [passage_texts[i] for i in doc_passage_indices]
        )
        sims = cosine_similarity(query_vec, doc_passage_vectors)[0]
        best_passage_idx = doc_passage_indices[int(np.argmax(sims))]

        meta = passage_meta[best_passage_idx]
        results.append({
            "rank": rank,
            # The class position is already known, so the probability is read
            # directly instead of searching the class list for doc_id again.
            "score": float(proba[class_idx]),
            "text": passage_texts[best_passage_idx],
            "doc_id": meta["doc_id"],
            "title": meta["title"],
            "chunk_id": meta["chunk_id"],
            "start_word": meta["start_word"],
            "end_word": meta["end_word"],
        })

    return results


In [110]:
# Passage-level retrieval: best chunk from each of the five most probable papers.
results = retrieve_svm_chunks(query, k=5)
divider = "-" * 80

print(f"Question: {query}")
print(divider)

for r in results:
    # Probability, source paper, chunk location, then a 500-character preview.
    print(f"[{r['rank']}] SVM probability={r['score']:.4f}")
    print(f"Paper: {r['doc_id']} — {r['title']}")
    print(f"Chunk: {r['chunk_id']} (words {r['start_word']}–{r['end_word']})")
    print(divider)
    preview = r["text"][:500]
    print(preview, "...")
    print()


Question: how can we detect sarcasm using deep learning?
--------------------------------------------------------------------------------
[1] SVM probability=0.2004
Paper: 2510.14855v1 — A Multi-Task Deep Learning Framework for Skin Lesion Classification, ABCDE Feature Quantification, and Evolution
Chunk: 2510.14855v1_chunk_26 (words 4680–4811)
--------------------------------------------------------------------------------
class imbalance through augmentation or resampling may improve fairness across lesion types. A promising next step is generating simulated lesion images using GAN-based methods to complement feature-space evolution and provide more intuitive visual feedback. Incorporating expert-reviewed or longitudinal data would also enable supervised learning of the E component and improve clinical realism. In summary, we developed a deep learning model to classify skin lesions, interpret predictions using AB ...

[2] SVM probability=0.1025
Paper: 2509.20913v1 — Deep Learning for