In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prais\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load your dataset
df = pd.read_csv('../data-collection/dataset/disease_details.csv')

In [4]:
df.head(1)

Unnamed: 0,Disease,Description,Diagnosis,Differential Diagnosis,Treatment,Demographics,Gender,Age Group,Additional note,Cause,Symptoms,Complications,Outcomes,Preventive Measure
0,acanthosis-nigricans,Acanthosis nigricans is a skin condition chara...,"The diagnosis is made clinically,including a t...",Confluent and reticulated papillomatosis: occu...,The mainstay treatment is to manage the underl...,Acanthosis nigricans affects < 1% of Caucasian...,It affects both males and females,"Affects all ages,with people < 40 years old ty...",Those diagnosed with acanthosis nigricans were...,"The exact cause is still unclear,however it is...","Symmetric,thickened,brown,velvety patches and ...",Cosmetic disfigurement. Psychological distress...,Outcomes depend on the cause. If the underlyin...,


In [5]:
# Replace 'NaN' with "None"
df.fillna("None", inplace=True)

In [6]:
# Preprocess text (example, replace with your preprocessing logic)
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenization (split the text into words)
    tokens = text.split()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Join tokens back into a single string
    text = ' '.join(tokens)
    
    return text

In [7]:
# Define the function to process the 'Sex' column
def preprocess_gender(text):
    text = text.lower()
    
    if 'male' in text and 'female' in text:
        return 'male and female'
    elif 'male' in text:
        return 'male'
    elif 'female' in text:
        return 'female'
    else:
        return text

In [8]:
# Apply preprocessing to relevant columns
columns_to_preprocess = ['Description', 'Diagnosis', 'Differential Diagnosis', 'Treatment', 
                         'Additional note', 'Cause', 'Gender', 'Symptoms', 'Complications']

In [9]:
updated_df = df.copy()

for col in columns_to_preprocess:
    if col == 'Gender':
        updated_df[col] = df[col].apply(preprocess_gender)
    else:
        updated_df[col] = df[col].apply(preprocess_text)

In [10]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(updated_df['Description'])

In [11]:
def retrieve_information(query, tfidf_matrix, tfidf_vectorizer, df):
    query_vec = tfidf_vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    related_docs_indices = cosine_similarities.argsort()[::-1]
    
    top_results = []
    for i, idx in enumerate(related_docs_indices):
        disease = df.iloc[idx]['Disease']
        description = df.iloc[idx]['Description']
        diagnosis = df.iloc[idx]['Diagnosis']
        differential_diagnosis = df.iloc[idx]['Differential Diagnosis']
        treatment = df.iloc[idx]['Treatment']
        additional_note = df.iloc[idx]['Additional note']
        cause = df.iloc[idx]['Cause']
        symptoms = df.iloc[idx]['Symptoms']
        complications = df.iloc[idx]['Complications']
        similarity = cosine_similarities[idx]

        if(similarity <= 0.0):
            continue

        result = f"Disease: {disease}\n\n" \
                 f"Similarity: {similarity}\n\n"\
                 f"Description: {description}\n\n" \
                 f"Treatment: {treatment}\n\n" \
                 f"Additional Note: {additional_note}\n\n" \
                 f"Cause: {cause}\n\n" \
                 f"Symptoms: {symptoms}\n\n" \
                 f"Complications: {complications}\n\n" \
        
        top_results.append(result)
        
        if i >= 4:  # Return top 5 results
            break
    
    return top_results

In [12]:
query = "How can I cure my acne?"
results = retrieve_information(query, tfidf_matrix, tfidf_vectorizer, df)

if len(results) == 0:
    print("No information found.")
else:
    for result in results:
        print(result)
        print("-----")

Disease: acne

Similarity: 0.25

Description: Acne is a common chronic disorder affecting the hair follicle and sebaceous gland,in which there is expansion and blockage of the follicle and inflammation. There are several variants.

Treatment: Mild acne: Topical anti-acne agents,such as benzoyl peroxide,azelaic acid,and tretinoin or adapalene gel. New bioactive proteins may also prove successful. Low-dose combined oral contraceptive. Antiseptic or keratolytic washes containing salicylic acid. Light/laser therapy. Moderate acne: As for mild acne plus a tetracycline such as doxycycline 50–200 mg daily for 6 months or so. Erythromycin or trimethoprim if doxycycline intolerant. Antiandrogen therapy with long-term cyproterone acetate + ethinylestradiol or spironolactone may be considered in women not responding to low-dose combined oral contraceptive,particularly for women with polycystic ovaries. Isotretinoin is often used if acne is persistent or treatment-resistant. Severe acne: Referral 