In [None]:
pip install gradio




In [None]:


def analyze_documents(folder_path, target_pdf, n):
    """Report the documents whose abstracts are most similar to a target PDF.

    Each ``*.json`` file in ``folder_path`` contributes one document: the text
    of its ``SHORT_ABSTRACT`` field (English stop words removed) and the first
    entry of its ``pdf_files`` field as the document's name. Records without a
    PDF name are skipped so that names and abstracts stay index-aligned.
    The abstracts are TF-IDF vectorised and compared by cosine similarity.
    As a side effect, a Ward dendrogram of the collection is drawn on a new
    matplotlib figure (only when at least two documents were loaded).

    Args:
        folder_path: Directory holding the JSON metadata files.
        target_pdf: PDF filename to look up among the loaded ``pdf_files`` names.
        n: How many similar documents to list. Converted with ``int``;
            negative values are treated as 0.

    Returns:
        str: The target's processed abstract followed by the ``n`` most
        similar documents, or a one-line message when the folder is invalid,
        no usable documents are found, or ``target_pdf`` is unknown.
    """
    import os
    import json

    # A negative count would otherwise slice from the wrong end of the ranking.
    n = max(int(n), 0)

    # Validate the folder before the heavy imports / NLTK downloads below so a
    # mistyped path fails fast with a readable message instead of a traceback.
    if not folder_path or not os.path.isdir(folder_path):
        return f"'{folder_path}' is not a valid folder."

    import numpy as np
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords as sw
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    from scipy.cluster.hierarchy import ward, dendrogram
    from scipy.spatial.distance import squareform
    import matplotlib.pyplot as plt

    # Downloading required NLTK packages. Newer NLTK releases look up
    # 'punkt_tab' for word_tokenize; requesting it too is harmless on older ones.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    nltk.download('stopwords', quiet=True)

    # Built once instead of once per document.
    english_stopwords = set(sw.words('english'))

    ### Nested functions:

    def process_text(text):
        """Tokenise ``text`` and drop English stop words (case-insensitive)."""
        tokens = word_tokenize(text)
        return ' '.join(token for token in tokens if token.lower() not in english_stopwords)

    def get_top_n_similar_documents(index, similarity_matrix, n=5):
        """Return indices of the ``n`` documents most similar to ``index``."""
        # Stable sort on the negated scores: descending order, ties kept in
        # document order.
        ranked = np.argsort(-similarity_matrix[index], kind="stable")
        # Drop the query document explicitly; with tied scores (or an all-zero
        # row) it is not guaranteed to be ranked first.
        ranked = ranked[ranked != index]
        return ranked[:n]

    # Processed abstracts and their PDF filenames, kept index-aligned.
    documents = []
    pdf_file_list = []

    # Loading the JSON files and extracting content (sorted: os.listdir order
    # is arbitrary, and a stable order makes results reproducible).
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.json'):
            continue
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Tolerate a missing, null or empty "pdf_files" entry.
        pdf_files = data.get("pdf_files") or []
        pdf_filename = pdf_files[0] if pdf_files else None
        if not pdf_filename:
            # Without a name the record can neither be looked up nor labelled.
            continue

        # Only the short abstract is used to represent the document's content.
        abstract = data.get("SHORT_ABSTRACT") or ""
        documents.append(process_text(abstract))
        pdf_file_list.append(pdf_filename)

    if not documents:
        return f"No JSON documents with a PDF filename were found in '{folder_path}'."

    # Using TF-IDF to vectorize the processed content
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    except ValueError:
        # Raised by scikit-learn when no document contributes any term.
        return "The abstracts contain no usable words after stop-word removal."

    # similarity_matrix[i][j] gives the cosine similarity between the i-th and j-th document
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Produce hierarchical clustering and dendrogram (needs at least two documents).
    if len(documents) > 1:
        # Cosine distance; clip the tiny negatives caused by floating-point noise.
        dist = np.clip(1 - similarity_matrix, 0, None)
        # ward() treats a 2-D array as observation vectors, so the square
        # distance matrix has to be passed in condensed form.
        linkage_matrix = ward(squareform(dist, checks=False))

        # NOTE(review): the figure is neither returned nor closed, so it stays
        # registered with pyplot after the call, as in the original code.
        fig, ax = plt.subplots(figsize=(50, 80))
        dendrogram(linkage_matrix, orientation="right", leaf_font_size=6,
                   labels=pdf_file_list, ax=ax)
        # tick_params expects booleans; the string 'off' is truthy.
        ax.tick_params(axis='x', bottom=False, top=False, labelbottom=False)
        fig.tight_layout()

    if target_pdf not in pdf_file_list:
        return f"'{target_pdf}' not found in pdf_file_list."

    # Choosing a document index for the target PDF
    document_index = pdf_file_list.index(target_pdf)

    # Getting the top n similar document indices
    top_n_indices = get_top_n_similar_documents(document_index, similarity_matrix, n)

    # Target abstract first, then one entry per similar document.
    result = f"\nTarget PDF's SHORT_ABSTRACT:\n{documents[document_index]}\n"
    result += f"\nTop {n} similar documents to {target_pdf} are:\n"
    for i in top_n_indices:
        result += f"\nPDF Name: {pdf_file_list[i]}\nSHORT_ABSTRACT: {documents[i]}\n"

    return result





In [None]:
import gradio as gr


# Build the Gradio UI around analyze_documents: two text inputs (folder path,
# target PDF name) and a numeric input for how many similar documents to list.
demo = gr.Interface(
    fn=analyze_documents,
    inputs=[
        gr.Textbox(label="Please enter the folder path in local drive:", placeholder="Enter Folder path"),
        gr.Textbox(label="Please enter the name of the target PDF:", placeholder="Enter Filename"),
        # Gradio 3+ names these `value` / `minimum` / `maximum`; the old
        # `default` / `min` / `max` keywords are not recognised by gr.Number.
        gr.Number(
            label="Please enter the number of top similar documents to retrieve:",
            value=3,
            minimum=1,
            maximum=100,
            step=1,
        ),
    ],
    outputs=gr.Textbox(label="Similar Files"),
    # Each example row is [folder path, target PDF, n].
    examples=[
        [
            "/content/drive/MyDrive/CITS5553_Group 5/wamex_metadata/subset_1",
            "a071228_051017_baldivis_minexpreport_10321296.pdf",
            3,
        ],
        [
            "/content/drive/MyDrive/CITS5553_Group 5/wamex_metadata/subset_1",
            "a071874_700-100-go-rep-0005_11545981.pdf",
            3,
        ],
    ],
    title="Similarity Generator",
)

demo.launch()

  gr.Number(label="Please enter the number of top similar documents to retrieve:", default=3, min=1, max=100, step=1)


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://cc58c26a50211ac64a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


