In [1]:
# !pip install ipywidgets  #1 time installation
# !jupyter nbextension enable --py widgetsnbextension
# import sys, subprocess
#subprocess.check_call([sys.executable, "-m", "pip", "install", "ipywidgets"])
#!pip install scikit-learn
#import sys
#!{sys.executable} -m pip install scikit-learn

In [2]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import Javascript
from ipywidgets import Widget, Text, Button, VBox, Label
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from traitlets import Unicode

In [3]:
# Location of the corpus JSON file. When the TFIDF_CORPUS_PATH environment
# variable is set it takes precedence over the author's local default, so the
# notebook can run on another machine without editing this cell.
DEFAULT_CORPUS_PATH = r"C:\Users\netsi\OneDrive\Desktop\TU Wien\NLP\Topic12\corpus\corpus.json"
corpus_path = Path(os.environ.get("TFIDF_CORPUS_PATH", DEFAULT_CORPUS_PATH))

with corpus_path.open("r", encoding="utf-8") as f:
    corpus = json.load(f)

# Fail early with a readable message instead of an error from the vectorizer.
if not corpus:
    raise ValueError(f"Corpus file {corpus_path} contains no documents")

# Parallel lists: position i in each list refers to the same corpus entry.
# Every entry must provide "id" and "text"; "title" falls back to "".
doc_ids = [doc["id"] for doc in corpus]
doc_titles = [doc.get("title", "") for doc in corpus]
doc_texts = [doc["text"] for doc in corpus]

# Document-level TF-IDF model: one row of `tfidf_matrix` per corpus entry.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),   # unigrams + bigrams
    max_df=0.9,
    min_df=1
)

tfidf_matrix = vectorizer.fit_transform(doc_texts)  # num_docs, num_terms


In [4]:
# Inspect the fitted document-level TF-IDF model: vocabulary size, a window of
# consecutive feature names, and the highest-weighted terms of document 0.
feature_names = vectorizer.get_feature_names_out()

# The window offset is specific to this corpus. Clamp the upper bound to the
# vocabulary size so a smaller vocabulary yields a shorter (possibly empty)
# window instead of an IndexError in the loop below.
start = 12005
stop = min(start + 10, len(feature_names))

print("Number of features:", len(feature_names))
print("Sample features:", feature_names[start:stop])


# Dense copy of the first document's row so it can be indexed per feature.
first_doc_vector = tfidf_matrix[0].toarray()[0]
nonzero_idx = first_doc_vector.nonzero()[0]

print("\nTF-IDF values for sample features):")
for idx in range(start, stop):
    print(f"{feature_names[idx]:25s} {first_doc_vector[idx]:.4f}")


# Only terms with a non-zero weight in document 0, strongest first.
df = pd.DataFrame({
    "term": [feature_names[i] for i in nonzero_idx],
    "tfidf": [first_doc_vector[i] for i in nonzero_idx]
}).sort_values("tfidf", ascending=False)

print("\nTop 10 TF-IDF terms in Document 0:")
print(df.head(10))


Number of features: 50406
Sample features: ['crime' 'crime absence' 'crime acknowledge' 'crime actually'
 'crime aggregations' 'crime analysis' 'crime appeared' 'crime attractors'
 'crime baltimore' 'crime best']

TF-IDF values for sample features):
crime                     0.6713
crime absence             0.0043
crime acknowledge         0.0043
crime actually            0.0043
crime aggregations        0.0043
crime analysis            0.0043
crime appeared            0.0043
crime attractors          0.0043
crime baltimore           0.0043
crime best                0.0043

Top 10 TF-IDF terms in Document 0:
                   term     tfidf
1406              crime  0.671330
3568           mobility  0.209394
986              cities  0.138597
1484             crimes  0.134266
2412        forecasting  0.129443
3175                 lb  0.125604
3756                 nn  0.106497
1437  crime forecasting  0.090954
863                cell  0.088116
4189                poi  0.086623


In [5]:
# Front-end (browser) code, passed to the notebook as an IPython Javascript
# display object. It (re)defines an AMD module named 'text_sender' that exports
# TextSenderView, a DOM widget view rendering a text input plus a button; a
# click sends {text: <input value>} back through the widget's `send` channel.
# NOTE(review): relies on `require`/`define` being available in the front end —
# confirm this holds for the notebook UI in use.
# NOTE(review): no Python-side widget that references this view is visible in
# this part of the notebook — TODO confirm it is still needed.
Javascript("""
require.undef('text_sender');

define('text_sender', ["@jupyter-widgets/base"], function(widgets) {

    var TextSenderView = widgets.DOMWidgetView.extend({
        render: function() {

            const box = document.createElement("div");
            box.style.display = "flex";
            box.style.flexDirection = "row";
            box.style.width = "100%";        // full width inside widget
            box.style.maxWidth = "100%";

            const input = document.createElement("input");
            input.type = "text";
            input.placeholder = "Type something…";

            input.style.padding = "6px";
            input.style.marginRight = "8px";

            // FORCE width expansion
            input.style.flex = "1";          // <--- This is the important part
            input.style.minWidth = "0";      // <--- prevents shrink issues

            const button = document.createElement("button");
            button.innerHTML = "Send to Python";
            button.style.padding = "6px 12px";

            button.onclick = () => {
                this.send({text: input.value});
            };

            box.appendChild(input);
            box.appendChild(button);

            this.el.appendChild(box);
        }
    });

    return {
        TextSenderView : TextSenderView
    };
});
""")


<IPython.core.display.Javascript object>

In [6]:
def retrieve_tfidf(query: str, k: int = 5):
    """Return the top-k whole documents ranked by TF-IDF cosine similarity.

    Reads the notebook-level `vectorizer`, `tfidf_matrix`, `doc_ids`,
    `doc_titles` and `doc_texts` at call time. A later cell rebinds
    `vectorizer` to a passage-level model, so this function must be paired
    with the vectorizer that produced `tfidf_matrix`.

    Args:
        query: Free-text question. ``None``, empty or whitespace-only input
            yields an empty result list.
        k: Maximum number of documents to return. Values <= 0 yield an empty
            result list.

    Returns:
        A list of dicts, best match first, with the keys "rank" (1-based),
        "score" (cosine similarity as float), "id", "title" and "text".

    Raises:
        ValueError: If the vectorized query and `tfidf_matrix` have a
            different number of columns, i.e. `vectorizer` is not the model
            that `tfidf_matrix` was built with.
    """
    # Nothing to search for, or nothing requested. Guarding k here also avoids
    # the slice `[:k]` silently dropping items from the end for negative k.
    if not query or not query.strip() or k <= 0:
        return []

    # Vectorize query
    q_vec = vectorizer.transform([query])  # shape: (1, num_terms)

    # Explicit check so a rebound `vectorizer` produces a readable error.
    if q_vec.shape[1] != tfidf_matrix.shape[1]:
        raise ValueError(
            "Query vector has {} features but tfidf_matrix has {}; "
            "`vectorizer` is not the model fitted on the documents.".format(
                q_vec.shape[1], tfidf_matrix.shape[1]
            )
        )

    # Cosine similarity with all docs
    sims = cosine_similarity(q_vec, tfidf_matrix)[0]  # shape: (num_docs,)

    # Indices of the k highest similarities, highest first
    topk_idx = np.argsort(sims)[::-1][:k]

    return [
        {
            "rank": rank,
            "score": float(sims[idx]),
            "id": doc_ids[idx],
            "title": doc_titles[idx],
            "text": doc_texts[idx],
        }
        for rank, idx in enumerate(topk_idx, start=1)
    ]


In [7]:
# Question entered by the user. Initialised to an empty string so that cells
# reading `query` do not raise NameError when they run before Submit has been
# clicked (for example on Restart & Run All).
query = ""

label = Label("Enter your question:")
txt = Text(placeholder="Type your question here...")
btn = Button(description="Submit")

def on_click(b):
    """Copy the current text-box content into the notebook-level `query`.

    Args:
        b: The button instance passed by the click handler; unused.
    """
    global query
    query = txt.value


btn.on_click(on_click)

# Last expression of the cell: renders label, text box and button stacked.
VBox([label, txt, btn])

VBox(children=(Label(value='Enter your question:'), Text(value='', placeholder='Type your question here...'), …

In [9]:
# Echo the question that the Submit button callback stored in `query`.
print(query)

how can we detect sacasm using deep learning?


In [10]:
#Show results at title level
results = retrieve_tfidf(query, k=3)

print(f"Question: {query}")
print("-" * 80)

for r in results:
    print(f"[{r['rank']}] {r['id']}  (cosine similarity={r['score']:.4f})")
    print(f"Paper title: {r["title"]}")
    print("-" * 80)
    print(r["text"][:500], "...")
    print()


Question: how can we detect sacasm using deep learning?
--------------------------------------------------------------------------------
[1] 2510.10729v1  (cosine similarity=0.0208)
Paper title: Sarcasm Detection Using Deep Convolutional Neural Networks: A Modular Deep Learning
--------------------------------------------------------------------------------
Sarcasm is a nuanced and often misinterpreted form of communication especially in text where tone and body language are absent. This paper presents a proposed modular deep learning framework for sarcasm detection leveraging Deep Convolutional Neural Networks DCNNs and contextual models like BERT to analyze linguistic emotional and contextual cues. The system is conceptually designed to integrate sentiment analysis contextual embeddings linguistic feature extraction and emotion detection through  ...

[2] 2510.08770v1  (cosine similarity=0.0133)
Paper title: Detecting spills using thermal imaging, pretrained deep learning models, and

In [11]:
def chunk_text(text, chunk_size=220, overlap=40):
    """Split `text` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start `chunk_size - overlap` words apart, so each chunk
    repeats the last `overlap` words of the previous one. The final chunk may
    be shorter than `chunk_size`.

    Args:
        text: Input string; split on whitespace.
        chunk_size: Target number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces); an empty
        list when `text` contains no words.

    Raises:
        ValueError: If `chunk_size` or `overlap` is out of range. Without this
            check the window would never advance (overlap >= chunk_size) or
            would skip words (negative overlap).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    words = text.split()
    if not words:
        return []

    n = len(words)
    step = chunk_size - overlap  # distance between consecutive chunk starts

    chunks = []
    for start in range(0, n, step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))

        # This chunk already reaches the end of the text; a further window
        # would only repeat its tail.
        if end >= n:
            break

    return chunks

In [12]:
# Chunking parameters, defined once so the word offsets recorded below cannot
# drift from the values handed to chunk_text.
CHUNK_SIZE = 220      # target words per passage
CHUNK_OVERLAP = 40    # words shared by consecutive passages
CHUNK_STEP = CHUNK_SIZE - CHUNK_OVERLAP  # distance between passage starts

# Parallel lists: passage_meta[i] describes passage_texts[i].
passage_texts = []
passage_meta = []

for doc in corpus:
    doc_id = doc["id"]
    title = doc.get("title", "")
    text = doc["text"]

    chunks = chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)

    for i, chunk in enumerate(chunks):
        # chunk_text starts consecutive chunks CHUNK_STEP words apart.
        start_word = i * CHUNK_STEP
        end_word = start_word + len(chunk.split())
        passage_texts.append(chunk)
        passage_meta.append({
            "doc_id": doc_id,
            "title": title,
            "chunk_id": f"{doc_id}_chunk_{i}",
            "start_word": start_word,
            "end_word": end_word,
        })


In [13]:
# Report how many passages the chunking step produced from the corpus.
n_documents = len(corpus)
n_passages = len(passage_texts)

print(f"Number of documents: {n_documents}")
print(f"Number of passages:  {n_passages}")

Number of documents: 20
Number of passages:  491


In [14]:
# Passage-level TF-IDF model: one row of `tfidf_matrix_passages` per chunk.
# NOTE(review): this rebinds the notebook-level name `vectorizer`, which
# retrieve_tfidf also reads at call time; calling retrieve_tfidf after this
# cell pairs the passage vocabulary with the document-level `tfidf_matrix`.
passage_tfidf_options = {
    "stop_words": "english",
    "ngram_range": (1, 2),
    "max_df": 0.9,
    "min_df": 1,
}

vectorizer = TfidfVectorizer(**passage_tfidf_options)
tfidf_matrix_passages = vectorizer.fit_transform(passage_texts)


In [15]:
def retrieve_tfidf_chunks(query: str, k: int = 3):
    """Return the top-k passages (chunks) ranked by TF-IDF cosine similarity.

    Reads the notebook-level `vectorizer`, `tfidf_matrix_passages`,
    `passage_texts` and `passage_meta` at call time.

    Args:
        query: Free-text question. ``None``, empty or whitespace-only input
            yields an empty result list.
        k: Maximum number of passages to return. Values <= 0 yield an empty
            result list.

    Returns:
        A list of dicts, best match first, with the keys "rank" (1-based),
        "score" (cosine similarity as float), "text", "doc_id", "title",
        "chunk_id", "start_word" and "end_word".

    Raises:
        ValueError: If the vectorized query and `tfidf_matrix_passages` have a
            different number of columns, i.e. `vectorizer` is not the model
            that `tfidf_matrix_passages` was built with.
    """
    # Nothing to search for, or nothing requested. Guarding k here also avoids
    # the slice `[:k]` silently dropping items from the end for negative k.
    if not query or not query.strip() or k <= 0:
        return []

    # Vectorize query
    q_vec = vectorizer.transform([query])

    # Explicit check so a mismatched `vectorizer` produces a readable error.
    if q_vec.shape[1] != tfidf_matrix_passages.shape[1]:
        raise ValueError(
            "Query vector has {} features but tfidf_matrix_passages has {}; "
            "`vectorizer` is not the model fitted on the passages.".format(
                q_vec.shape[1], tfidf_matrix_passages.shape[1]
            )
        )

    # Cosine similarity against all passages
    sims = cosine_similarity(q_vec, tfidf_matrix_passages)[0]

    # Indices of the k highest similarities, highest first
    topk_idx = np.argsort(sims)[::-1][:k]

    results = []
    for rank, idx in enumerate(topk_idx, start=1):
        meta = passage_meta[idx]
        results.append({
            "rank": rank,
            "score": float(sims[idx]),
            "text": passage_texts[idx],
            "doc_id": meta["doc_id"],
            "title": meta["title"],
            "chunk_id": meta["chunk_id"],
            "start_word": meta["start_word"],
            "end_word": meta["end_word"],
        })

    return results


In [17]:
# Passage-level retrieval for the current question, printed best match first.
results = retrieve_tfidf_chunks(query, k=3)

divider = "-" * 80

print(f"Question: {query}")
print(divider)

for hit in results:
    print(f"[{hit['rank']}] Cosine similarity={hit['score']:.4f}")
    print(f"Paper: {hit['doc_id']} — {hit['title']}")
    print(f"Chunk: {hit['chunk_id']} (words {hit['start_word']}–{hit['end_word']})")
    print(divider)
    # Preview only the first 500 characters of the passage.
    preview = hit["text"][:500]
    print(preview, "...")
    print()

Question: how can we detect sacasm using deep learning?
--------------------------------------------------------------------------------
[1] Cosine similarity=0.0932
Paper: 2510.10729v1 — Sarcasm Detection Using Deep Convolutional Neural Networks: A Modular Deep Learning
Chunk: 2510.10729v1_chunk_0 (words 0–220)
--------------------------------------------------------------------------------
Sarcasm is a nuanced and often misinterpreted form of communication especially in text where tone and body language are absent. This paper presents a proposed modular deep learning framework for sarcasm detection leveraging Deep Convolutional Neural Networks DCNNs and contextual models like BERT to analyze linguistic emotional and contextual cues. The system is conceptually designed to integrate sentiment analysis contextual embeddings linguistic feature extraction and emotion detection through  ...

[2] Cosine similarity=0.0838
Paper: 2510.05736v1 — Convolution and Graph-based Deep Learning Approa