In [1]:
# !pip install ipywidgets  #1 time installation
# !jupyter nbextension enable --py widgetsnbextension
# import sys, subprocess
#subprocess.check_call([sys.executable, "-m", "pip", "install", "ipywidgets"])
#!pip install scikit-learn
#import sys
#!{sys.executable} -m pip install scikit-learn

In [2]:
import json
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import Javascript
from ipywidgets import Widget, Text, Button, VBox, Label
from traitlets import Unicode

In [3]:
corpus_path = Path(r"C:\Users\netsi\OneDrive\Desktop\TU Wien\NLP\Topic12\corpus\corpus.json")  

with corpus_path.open("r", encoding="utf-8") as f:
    corpus = json.load(f)

doc_ids   = [doc["id"] for doc in corpus]
doc_titles = [doc.get("title", "") for doc in corpus]
doc_texts = [doc["text"] for doc in corpus]

#print(doc_titles)

# ---- Build TF-IDF vectorizer & matrix ----
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),   # unigrams + bigrams
    max_df=0.9,
    min_df=1
)

tfidf_matrix = vectorizer.fit_transform(doc_texts)  # num_docs, num_terms
print("TF-IDF matrix shape:", tfidf_matrix.shape) 

TF-IDF matrix shape: (20, 50409)


In [4]:
start = 12005
stop = start + 10

feature_names = vectorizer.get_feature_names_out()
print("Number of features:", len(feature_names))
print("Sample features:", feature_names[start:stop])   


first_doc_vector = tfidf_matrix[0].toarray()[0]
nonzero_idx = first_doc_vector.nonzero()[0]

print("\nTF-IDF values for sample features):")
for idx in range(start, stop):
    print(f"{feature_names[idx]:25s} {first_doc_vector[idx]:.4f}")


df = pd.DataFrame({
    "term": [feature_names[i] for i in nonzero_idx],
    "tfidf": [first_doc_vector[i] for i in nonzero_idx]
}).sort_values("tfidf", ascending=False)

print("\nTop 10 TF-IDF terms in Document 0:")
print(df.head(20))


Number of features: 50409
Sample features: ['crime' 'crime absence' 'crime acknowledge' 'crime actually'
 'crime aggregations' 'crime analysis' 'crime appeared' 'crime attractors'
 'crime baltimore' 'crime best']

TF-IDF values for sample features):
crime                     0.6714
crime absence             0.0043
crime acknowledge         0.0043
crime actually            0.0043
crime aggregations        0.0043
crime analysis            0.0043
crime appeared            0.0043
crime attractors          0.0043
crime baltimore           0.0043
crime best                0.0043

Top 10 TF-IDF terms in Document 0:
                   term     tfidf
1406              crime  0.671356
3568           mobility  0.209402
986              cities  0.138602
1484             crimes  0.134271
2412        forecasting  0.129448
3175                 lb  0.125609
3756                 nn  0.106501
1437  crime forecasting  0.090958
863                cell  0.088119
4189                poi  0.086627
4545      

In [5]:
Javascript("""
require.undef('text_sender');

define('text_sender', ["@jupyter-widgets/base"], function(widgets) {

    var TextSenderView = widgets.DOMWidgetView.extend({
        render: function() {

            const box = document.createElement("div");

            const input = document.createElement("input");
            input.type = "text";
            input.placeholder = "Type something…";
            input.style.padding = "6px";
            input.style.marginRight = "8px";

            const button = document.createElement("button");
            button.innerHTML = "Send to Python";
            button.style.padding = "6px 12px";

            button.onclick = () => {
                this.send({text: input.value});
            };

            box.appendChild(input);
            box.appendChild(button);

            this.el.appendChild(box);
        }
    });

    return {
        TextSenderView : TextSenderView
    };
});
""")


<IPython.core.display.Javascript object>

In [6]:
def retrieve_tfidf(query: str, k: int = 5):
    
    # Returns top-k documents.

    if not query.strip():
        return []

    # Vectorize query
    q_vec = vectorizer.transform([query])  # shape: (1, num_terms)

    # Cosine similarity with all docs
    sims = cosine_similarity(q_vec, tfidf_matrix)[0]  # shape: (num_docs,)

    # Get top-k indices
    topk_idx = np.argsort(sims)[::-1][:k]

    results = []
    for rank, idx in enumerate(topk_idx, start=1):
        results.append({
            "rank": rank,
            "score": float(sims[idx]),
            "id": doc_ids[idx],
            "title": doc_titles[idx],
            "text": doc_texts[idx]
        })
    return results


In [8]:
label = Label("Enter your question:")
txt = Text(placeholder="Type your question here...")
btn = Button(description="Send")

def on_click(b):
    global query
    query = txt.value
    

btn.on_click(on_click)

VBox([label, txt, btn])

VBox(children=(Label(value='Enter your question:'), Text(value='', placeholder='Type your question here...'), …

In [11]:
print(query)


How can we detect sarcasm using deep learning?


In [12]:
#Show results at title level
results = retrieve_tfidf(query, k=3)

for r in results:
    print(f"[{r['rank']}] {r['id']}  (cosine similarity={r['score']:.4f})")
    print(f"Paper name: {r["title"]}")
    print("-" * 80)
    # You might want just a snippet, not full text:
    print(r["text"][:500], "...")
    print()


[1] 2510.10729v1  (cosine similarity=0.3359)
Paper name: Sarcasm Detection Using Deep Convolutional Neural Networks: A Modular Deep Learning
--------------------------------------------------------------------------------
Sarcasm is a nuanced and often misinterpreted form of communication especially in text where tone and body language are absent. This paper presents a proposed modular deep learning framework for sarcasm detection leveraging Deep Convolutional Neural Networks DCNNs and contextual models like BERT to analyze linguistic emotional and contextual cues. The system is conceptually designed to integrate sentiment analysis contextual embeddings linguistic feature extraction and emotion detection through  ...

[2] 2510.08770v1  (cosine similarity=0.0078)
Paper name: Detecting spills using thermal imaging, pretrained deep learning models, and a robotic platform
--------------------------------------------------------------------------------
Detecting spills using thermal imaging