In [1]:
import requests
import json
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# function to clean the data
def clean_text(text):
    """Normalise raw text for matching.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), runs of whitespace collapse to a
    single space, the ends are trimmed and the result is lower-cased.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip().lower()

In [3]:
# function to tokenize the data into words
def tokenize_text(text):
    """Split text into a list of whitespace-separated tokens.

    Args:
        text: The string to split.

    Returns:
        A list of tokens; empty when the text holds only whitespace.
    """
    return text.split()

In [4]:
# function to calculate the Jaccard coefficient
def jaccard_coefficient(set1, set2):
    """Jaccard similarity between the non-zero positions of two vectors.

    Only *which* positions are non-zero matters; the magnitudes stored
    there are ignored.

    Args:
        set1: Array-like exposing ``nonzero()``; its first returned index
            array is used.
        set2: Same as ``set1``.

    Returns:
        ``|A & B| / |A | B|`` for the two index sets, or ``0`` when both
        vectors have no non-zero entries.
    """
    support_a = set(set1.nonzero()[0])
    support_b = set(set2.nonzero()[0])
    union_size = len(support_a | support_b)
    if union_size > 0:
        return len(support_a & support_b) / union_size
    return 0

In [5]:
# function to search Wikipedia using the API
def search_wikipedia(query, language='en', k=10):
    """Search Wikipedia and download the plain-text extract of every hit.

    One search request is sent, followed by one extract request per
    returned page id.

    Args:
        query: Search string sent as the ``srsearch`` parameter.
        language: Wikipedia subdomain to query (``'en'`` -> en.wikipedia.org).
        k: Maximum number of search results to ask for (``srlimit``).

    Returns:
        A ``(titles, texts)`` tuple of two parallel lists: the title of
        each hit and its extract. A page whose response carries no
        ``extract`` field contributes an empty string so the two lists
        stay aligned.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If a request does not complete within the timeout.
    """
    url = f'https://{language}.wikipedia.org/w/api.php'

    # Without a timeout a stalled connection would block the notebook forever.
    request_timeout = 10  # seconds, applied to every request below

    search_params = {
        'action': 'query',
        'format': 'json',
        'list': 'search',
        'srsearch': query,
        'srprop': '',
        'srlimit': k
    }

    titles = []
    texts = []

    # A single session reuses the HTTP connection across all requests.
    with requests.Session() as session:
        response = session.get(url, params=search_params, timeout=request_timeout)
        # Fail loudly on HTTP errors instead of hitting a confusing KeyError later.
        response.raise_for_status()
        search_results = response.json()['query']['search']

        for result in search_results:
            page_id = result['pageid']

            # Let requests build and encode the query string.
            page_params = {
                'action': 'query',
                'prop': 'extracts',
                'format': 'json',
                'pageids': page_id,
                'explaintext': 1
            }
            page_response = session.get(url, params=page_params, timeout=request_timeout)
            page_response.raise_for_status()

            # The 'pages' mapping is keyed by the page id as a string.
            page = page_response.json()['query']['pages'][str(page_id)]

            titles.append(result['title'])
            texts.append(page.get('extract', ''))

    return titles, texts

In [6]:
# function to generate the TF-IDF matrix for a list of documents
def generate_tf_idf_matrix(documents):
    """Fit a TF-IDF vectorizer on the documents and vectorize them.

    The vectorizer is created with ``stop_words='english'``.

    Args:
        documents: Iterable of document strings to fit and transform.

    Returns:
        A ``(tf_idf_matrix, feature_names)`` tuple: the result of
        ``fit_transform`` and the fitted vectorizer's ``vocabulary_``
        attribute (despite the name, this is the vocabulary mapping, not
        a list of feature names).
    """
    tfidf = TfidfVectorizer(stop_words='english')
    matrix = tfidf.fit_transform(documents)
    return matrix, tfidf.vocabulary_

In [7]:
def search_tfidf(query, tf_idf_matrix, feature_names, documents, k=5):
    """Rank documents against a query by Jaccard overlap of their terms.

    The query is cleaned and tokenized, mapped onto the vocabulary, and
    compared with each document row through ``jaccard_coefficient``, which
    looks only at which vector positions are non-zero.

    Args:
        query: Raw query string.
        tf_idf_matrix: Matrix indexable by row, where each row supports
            ``.toarray()``; row ``i`` corresponds to ``documents[i]``.
        feature_names: Mapping from term to its column index in the matrix.
        documents: List of document strings, in matrix row order.
        k: Number of top-ranked documents to return.

    Returns:
        A ``(titles, texts)`` tuple for the top ``k`` documents. Each title
        is ``"<rank>. <first space-separated word of the document>"`` and
        each text is the full document.
    """
    query_tokens = set(tokenize_text(clean_text(query)))

    # The query vector is the same for every document, so build it once.
    query_vector = np.zeros(len(feature_names))
    for token in query_tokens:
        index = feature_names.get(token)
        if index is not None:
            query_vector[index] += 1

    # Normalise only when there are tokens. For an empty query 0/0 would
    # fill the vector with NaN, and NaN counts as non-zero, so every
    # vocabulary entry would be treated as a query term.
    if query_tokens:
        query_vector = query_vector / len(query_tokens)

    similarities = [
        jaccard_coefficient(query_vector, tf_idf_matrix[i].toarray()[0])
        for i in range(len(documents))
    ]

    # Highest similarity first, truncated to the k best matches.
    indices = np.argsort(similarities)[::-1][:k]

    titles = [
        f'{rank}. {documents[index].split(" ")[0]}'
        for rank, index in enumerate(indices, start=1)
    ]
    texts = [documents[index] for index in indices]
    return titles, texts

In [14]:
# Search settings
query = 'wilson'
language = 'en'

# Fetch candidate articles, vectorize them, then re-rank them with Jaccard
titles, texts = search_wikipedia(query, language)
tf_idf_matrix, feature_names = generate_tf_idf_matrix(texts)
titles_tfidf, texts_tfidf = search_tfidf(query, tf_idf_matrix, feature_names, texts)

# Report the hits in the order the Wikipedia API returned them
print(f'====================Search results for "{query}"============================')
print('Using Wikipedia API:')
for rank, (title, text) in enumerate(zip(titles, texts), start=1):
    print(f'{rank}. {title}')
    print(' '.join(tokenize_text(clean_text(text))))
    print()

# Report the re-ranked hits; only the whitespace-normalised text is shown
print('====================Using TF-IDF and Jaccard coefficient=======================')
for rank, (_title, text) in enumerate(zip(titles_tfidf, texts_tfidf), start=1):
    print()
    print(f'{rank}.', ' '.join(tokenize_text(text)))

Using Wikipedia API:
1. Wilson
wilson may refer to people wilson name list of people with given name wilson list of people with surname wilson wilson footballer 19271998 brazilian manager and defender wilson footballer born 1984 full name wilson rodrigues de moura júnior brazilian goalkeeper wilson footballer born 1985 full name wilson rodrigues fonseca brazilian forward wilson footballer born 1975 full name wilson roberto dos santos brazilian centreback places australia wilson south australia wilson western australia wilson inlet western australia wilson reef queensland wilsons promontory victoria australia and hencewilsons promontory islands important bird area wilsons promontory lighthouse wilsons promontory marine national park wilsons promontory national park canada wilson avenue toronto ontario wilson ttc subway station wilson subway yard poland wilson square plac wilsona in warsaw united kingdom wilson leicestershire the wilson cheltenham gloucestershire united states wilson ark