In [1]:
! pip install whoosh

Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/468.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.8/468.8 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: whoosh
Successfully installed whoosh-2.7.4


## Preparing the Data

In [6]:
! kaggle datasets download -d stackoverflow/stacksample
! unzip stacksample.zip

Dataset URL: https://www.kaggle.com/datasets/stackoverflow/stacksample
License(s): other
Downloading stacksample.zip to /content
100% 1.11G/1.11G [01:08<00:00, 15.6MB/s]
100% 1.11G/1.11G [01:08<00:00, 17.4MB/s]
Archive:  stacksample.zip
  inflating: Answers.csv             
  inflating: Questions.csv           
  inflating: Tags.csv                


In [7]:
import pandas as pd
# Tunables for the load step: which file to read and how many leading rows to keep.
QUESTIONS_CSV = "Questions.csv"
N_QUESTIONS = 20_000

# Only the first N_QUESTIONS rows are read, which keeps indexing time manageable.
questions = pd.read_csv(QUESTIONS_CSV, nrows=N_QUESTIONS)
questions

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
...,...,...,...,...,...,...,...
19995,1114470,82266.0,2009-07-11T19:37:06Z,,0,"Trim all chars off file name after first ""_""",<p>I'd like to trim these purchase order file ...
19996,1114540,2288585.0,2009-07-11T20:16:06Z,,7,Xcode question: Quickly jump to a particular s...,<p>What is the quickest way to jump to a parti...
19997,1114550,131128.0,2009-07-11T20:20:11Z,,3,Serializing a generic collection with XMLSeria...,<p>Why won't XMLSerializer process my generic ...
19998,1114580,87271.0,2009-07-11T20:35:46Z,,1,Using Yahoo Fire Eagle on Grails / Java,<p>Has anyone implemented the Yahoo Fire Eagle...


## The Index and Schema Objects

In [11]:
from whoosh.fields import Schema, TEXT, ID

# One indexed document per question: an identifier plus two text fields.
# Every field is declared with stored=True.
schema = Schema(
    Id=ID(stored=True),
    Title=TEXT(stored=True),
    Body=TEXT(stored=True),
)
schema

<Schema: ['Body', 'Id', 'Title']>

In [12]:
import os.path
# Directory that will hold the on-disk index for the questions.
index_dir = 'indexdir'

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(index_dir, exist_ok=True)

In [17]:
from whoosh.index import create_in
from whoosh.index import open_dir

# Create a fresh index in index_dir using the questions schema.
ix = create_in(index_dir, schema)

# Using the writer as a context manager commits on success and cancels the
# write if add_document raises, so a failed run does not leave the index locked.
with ix.writer() as writer:
    # itertuples avoids building a Series per row, which iterrows does.
    for row in questions.itertuples(index=False):
        # Id is numeric in the CSV; it is converted to text before indexing.
        writer.add_document(Id=str(row.Id), Title=row.Title, Body=row.Body)

## How to Search

In [23]:
from whoosh.qparser import QueryParser
from whoosh.scoring import TF_IDF
from whoosh import scoring

# Parse free-text queries against the Title field.
qp = QueryParser("Title", schema=schema)

query_sentence = "How to install"
query = qp.parse(query_sentence)

# Rank matches with TF-IDF weighting and keep the three best-scoring hits.
searcher_tfidf = ix.searcher(weighting=scoring.TF_IDF())
results_tfidf = searcher_tfidf.search(query, limit=3, scored=True)

# Show each hit's Id and Title, separated by blank lines.
for hit in results_tfidf:
    print(hit["Id"])
    print()
    print(hit["Title"])
    print()
    print()

102850

How can I install CPAN modules locally without root access (DynaLoader.pm line 229 error)?


145900

How can I determine that Windows Installer is performing an upgrade rather than a first time install?


351640

How to install Hibernate Tools in Eclipse?




### Task 1

In [31]:
# Task 1: run several sample queries against Title and list the top three TF-IDF hits for each.
searcher_tfidf = ix.searcher(weighting=scoring.TF_IDF())
query_sentences = ["How can i", "Help me", "Is it possible"]

for sentence in query_sentences:
    query = qp.parse(sentence)
    results_tfidf = searcher_tfidf.search(query, limit=3, scored=True)

    print(sentence)
    for hit in results_tfidf:
        # Id, Title, then one blank line per hit.
        print(hit["Id"], hit["Title"], "", sep="\n")
    print()

How can i
97270
How do I capture an asterisk on the form's KeyUp event? OR, How do I get a KeyChar on the KeyUp event?

288600
How do I specify in my ccnet config file how to do daily builds?

409050
How to structure a Java EE system? How is the term application and thus the content of an EAR defined?


Help me
127630
Is there a good tool for MySQL that will help me optimise my queries and index settings?

155250
Help me understand how QA works in Scrum

509590
Help me understand this UI jargon


Is it possible
14760
Is it possible to disable command input in the toolbar search box?

25200
C#.NET Winforms: Is it possible to override Label.Autosize?

28560
Is it possible to use nHibernate with Paradox database?




### Task 2

In [32]:
from whoosh.scoring import BM25F

# Task 2: repeat the sample queries with BM25F weighting instead of TF-IDF.
qp2 = QueryParser("Title", schema=schema)

# Named after the weighting it actually uses (the original bound it to `searcher_tfidf`).
searcher_bm25f = ix.searcher(weighting=scoring.BM25F())

query_sentences = ["How to install", "How can i", "Help me", "Is it possible"]

for sentence in query_sentences:
    query = qp2.parse(sentence)
    # NOTE: these are BM25F results; the name `results_tfidf` is kept because the
    # Query Expansion cells read the last query's results through it.
    results_tfidf = searcher_bm25f.search(query, limit=3, scored=True)
    print(sentence)
    for hit in results_tfidf:
        print(hit["Id"])
        print(hit["Title"])
        print()
    print()

How to install
921780
How to install ImageMagick on MAMP?

998260
How do you install JDK?

351640
How to install Hibernate Tools in Eclipse?


How can i
919450
how to know visitors is actually looking at the webpage and for how long?

4170
How to learn ADO.NET

31480
How stable is WPF?


Help me
724570
Error, can anybody help me

962930
Help me name this property (C#)

509590
Help me understand this UI jargon


Is it possible
476290
beep in WinCE , it possible ?

568560
Possible memory leak?

1039520
Scribd API - is this possible?




## Query Expansion

In [33]:
# Starting from the first hit of the most recent search, fetch questions with a similar Title.
more_results = results_tfidf[0].more_like_this("Title")

for hit in more_results:
    # Id, Title, then one blank line per hit.
    print(hit["Id"], hit["Title"], "", sep="\n")

614540
Where can I find the binaries for arm-wince-pe-gcc?

918280
How to compile Qt 4.5.1 on Windows XP for WinCE?

1091260
Access Remote SQL Server Database on WinCE programming

427850
How do I configure WinCE to use wildcard SSL certificates?

790650
compile AMR-nb codec with RVCT for WinCE/Window Mobile

619130
How can I make networking work in my WinCE app without launching IE first?

568560
Possible memory leak?

1039520
Scribd API - is this possible?

87970
C#: Is Implicit Arraylist assignment possible?

112320
Is static metaprogramming possible in Java?



In [35]:
# key_terms yields (term, score) pairs; only the terms are kept.
keywords = [
    term
    for term, _score in results_tfidf.key_terms("Title", docs=10, numterms=5)
]

keywords

['possible', 'beep', 'scribd', 'wince', 'leak']

## Evaluating IR Systems

In [80]:
# Toy evaluation set: query id -> query text.
queries = dict(
    q1="Machine Learning",
    q2="AI algorithms",
)

# Ground truth: query id -> ids of the documents treated as relevant to that query.
relevance = dict(
    q1=["doc1", "doc2", "doc3"],
    q2=["doc1", "doc2", "doc3", "doc4", "doc5"],
)

# Toy corpus for the evaluation section: document id -> document text.
# The ids match the ones listed in `relevance`.
# NOTE(review): the runs of spaces inside the texts look like an extraction artifact;
# they are part of the string values, so they are kept as-is here.
documents = {
    'doc1': "Artificial    Intelligence    (AI)    is    transforming    various industries    through    automation    and    advanced    algorithms.    Machine learning,    a    subset    of    AI,    enables    computers    to    learn    from    data    and make    predictions.    Algorithms    are    at    the    core    of    AI    systems,    guiding decision-making    and    problem-solving    processes.    AI-powered    systems are    increasingly    used    in    healthcare    for    diagnosis    and    treatment planning.    The    ethical    implications    of    AI    algorithms,    such    as    bias and    fairness,    are    important    considerations    in    their    development.",
    'doc2': "Deep    learning,    a    branch    of    machine    learning,    uses    neural networks    to    process    complex    data.    AI    algorithms    are    capable    of analyzing    large    datasets    to    extract    meaningful    insights.    Natural Language    Processing    (NLP)    algorithms    enable    computers    to    understand and    generate    human    language.    AI-driven    recommendation    algorithms personalize    user    experiences    in    e-commerce    and    content    platforms. Ensuring    the    transparency    and    accountability    of    AI    algorithms    is essential    for    building    trust    in    AI    technologies.",
    'doc3': "Reinforcement    learning    algorithms    enable    AI    agents    to    learn through    trial    and    error    interactions    with    their    environment.    AI algorithms    are    used    in    financial    markets    for    high-frequency    trading and    risk    management.    Computer    vision    algorithms    enable    machines    to interpret    and    analyze    visual    information.    AI    algorithms    can    enhance cybersecurity    by    detecting    and    mitigating    cyber    threats    in real-time.    Continuous    research    and    development    are    essential    for advancing    AI    algorithms    and    overcoming    their    limitations.",
    'doc4': "Evolutionary    algorithms,    inspired    by    natural    selection,    are used    to    optimize    complex    systems    and    processes.    AI    algorithms    play a    crucial    role    in    autonomous    vehicles    for    navigation    and decision-making.    Quantum    computing    algorithms    have    the    potential    to revolutionize    AI    by    solving    complex    problems    exponentially    faster. AI    algorithms    are    employed    in    predictive    maintenance    to    anticipate equipment    failures    and    reduce    downtime.    Ethical    guidelines    and regulations    are    needed    to    govern    the    development    and    deployment    of AI    algorithms.",
    'doc5': "Genetic    algorithms    are    used    to    evolve    solutions    to optimization    and    search    problems    inspired    by    natural    selection.    AI algorithms    enable    personalized    content    recommendations    in    streaming services    and    social    media    platforms.    Swarm    intelligence    algorithms mimic    the    collective    behavior    of    social    insects    to    solve optimization    problems.    AI    algorithms    are    used    in    drug    discovery    to accelerate    the    identification    of    potential    treatments. Collaborative    efforts    are    essential    for    advancing    AI    algorithms    and harnessing    their    full    potential    for    societal    benefit."
}

In [37]:
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in, open_dir

# Schema for the toy corpus: a document id and its body text, both declared with stored=True.
# This rebinds the name `schema` used earlier for the questions index.
schema = Schema(
    Id=ID(stored=True),
    Body=TEXT(stored=True),
)

In [38]:
import os.path
# Separate on-disk location for the toy-corpus index.
index_dir = 'indexdir_toy'

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(index_dir, exist_ok=True)

In [76]:
# Create a fresh index for the toy corpus; this rebinds `ix`.
ix = create_in(index_dir, schema)

# Using the writer as a context manager commits on success and cancels the
# write if add_document raises, so a failed run does not leave the index locked.
with ix.writer() as writer:
    for doc_id, body in documents.items():
        writer.add_document(Id=doc_id, Body=body)

In [79]:
from whoosh.qparser import QueryParser
from whoosh.scoring import TF_IDF
from whoosh import scoring

# Queries against the toy corpus are parsed against the Body field.
qp = QueryParser("Body", schema=schema)

# Run query q1 with TF-IDF weighting and keep at most three hits.
query_sentence = queries["q1"]
query = qp.parse(query_sentence)

searcher_tfidf = ix.searcher(weighting=scoring.TF_IDF())
results_tfidf = searcher_tfidf.search(query, limit=3, scored=True)

for hit in results_tfidf:
    # Id, Body, then one blank line per hit.
    print(hit["Id"], hit["Body"], "", sep="\n")

doc2
Deep    learning,    a    branch    of    machine    learning,    uses    neural networks    to    process    complex    data.    AI    algorithms    are    capable    of analyzing    large    datasets    to    extract    meaningful    insights.    Natural Language    Processing    (NLP)    algorithms    enable    computers    to    understand and    generate    human    language.    AI-driven    recommendation    algorithms personalize    user    experiences    in    e-commerce    and    content    platforms. Ensuring    the    transparency    and    accountability    of    AI    algorithms    is essential    for    building    trust    in    AI    technologies.

doc1
Artificial    Intelligence    (AI)    is    transforming    various industries    through    automation    and    advanced    algorithms.    Machine learning,    a    subset    of    AI,    enables    computers    to    learn    from    data    and make    predictions.    Algorithms    are    at    the    core    of

### Task 3

In [83]:
# Task 3: precision and recall of the most recent search against the q1 ground truth.
retrieved_docs = [hit["Id"] for hit in results_tfidf]
relevant_docs = relevance["q1"]

# Documents that were both retrieved and marked relevant.
true_positives = set(retrieved_docs) & set(relevant_docs)

# Each ratio falls back to 0 when its denominator list is empty.
if retrieved_docs:
    precision = len(true_positives) / len(retrieved_docs)
else:
    precision = 0

if relevant_docs:
    recall = len(true_positives) / len(relevant_docs)
else:
    recall = 0

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 0.67
Recall: 0.67


In [44]:
# Quick look at the (query id, query text) pairs held in `queries`.
# NOTE(review): the saved output for this cell shows a different spelling of q2 than the
# current `queries` definition — re-run the cell to refresh it.
queries.items()

dict_items([('q1', 'Machine Learning'), ('q2', 'AI Alogrithms')])

### Task 4

In [84]:
# Task 4: for every toy query, list the top three TF-IDF hits and report precision/recall.
searcher_tfidf = ix.searcher(weighting=scoring.TF_IDF())

for key, query_sentence in queries.items():
    print(f"Query: {query_sentence}")

    query = qp.parse(query_sentence)
    results_tfidf = searcher_tfidf.search(query, limit=3, scored=True)

    # Collect the ids while printing each hit (tab-indented Id and Body, then a blank line).
    retrieved_docs = []
    for hit in results_tfidf:
        retrieved_docs.append(hit["Id"])
        print('\t' + hit["Id"])
        print('\t' + hit["Body"])
        print()

    relevant_docs = relevance[key]
    true_positives = set(retrieved_docs) & set(relevant_docs)

    # Each ratio falls back to 0 when its denominator list is empty.
    if retrieved_docs:
        precision = len(true_positives) / len(retrieved_docs)
    else:
        precision = 0
    if relevant_docs:
        recall = len(true_positives) / len(relevant_docs)
    else:
        recall = 0

    print('\t' + f"Precision: {precision:.2f}")
    print('\t' + f"Recall: {recall:.2f}")
    print()
    print()

Query: Machine Learning
	doc2
	Deep    learning,    a    branch    of    machine    learning,    uses    neural networks    to    process    complex    data.    AI    algorithms    are    capable    of analyzing    large    datasets    to    extract    meaningful    insights.    Natural Language    Processing    (NLP)    algorithms    enable    computers    to    understand and    generate    human    language.    AI-driven    recommendation    algorithms personalize    user    experiences    in    e-commerce    and    content    platforms. Ensuring    the    transparency    and    accountability    of    AI    algorithms    is essential    for    building    trust    in    AI    technologies.

	doc1
	Artificial    Intelligence    (AI)    is    transforming    various industries    through    automation    and    advanced    algorithms.    Machine learning,    a    subset    of    AI,    enables    computers    to    learn    from    data    and make    predictions.    Algorithms    ar

### Trying with BM25F

In [86]:
# Same evaluation as the TF-IDF run, but ranking with BM25F weighting.
searcher_bm25f = ix.searcher(weighting=scoring.BM25F())

for key, query_sentence in queries.items():
    print(f"Query: {query_sentence}")

    query = qp.parse(query_sentence)
    # NOTE(review): despite its name, `results_tfidf` holds BM25F results in this cell.
    results_tfidf = searcher_bm25f.search(query, limit=3, scored=True)
    retrieved_docs = [hit["Id"] for hit in results_tfidf]

    for hit in results_tfidf:
        # Tab-indented Id and Body, then one blank line per hit.
        print('\t' + hit["Id"], '\t' + hit["Body"], "", sep="\n")

    relevant_docs = relevance[key]

    # Documents that were both retrieved and marked relevant.
    true_positives = set(relevant_docs).intersection(retrieved_docs)
    precision = len(true_positives) / len(retrieved_docs) if retrieved_docs else 0
    recall = len(true_positives) / len(relevant_docs) if relevant_docs else 0

    print('\t' + f"Precision: {precision:.2f}")
    print('\t' + f"Recall: {recall:.2f}")
    print()
    print()

Query: Machine Learning
	doc2
	Deep    learning,    a    branch    of    machine    learning,    uses    neural networks    to    process    complex    data.    AI    algorithms    are    capable    of analyzing    large    datasets    to    extract    meaningful    insights.    Natural Language    Processing    (NLP)    algorithms    enable    computers    to    understand and    generate    human    language.    AI-driven    recommendation    algorithms personalize    user    experiences    in    e-commerce    and    content    platforms. Ensuring    the    transparency    and    accountability    of    AI    algorithms    is essential    for    building    trust    in    AI    technologies.

	doc1
	Artificial    Intelligence    (AI)    is    transforming    various industries    through    automation    and    advanced    algorithms.    Machine learning,    a    subset    of    AI,    enables    computers    to    learn    from    data    and make    predictions.    Algorithms    ar