Reading CSV file

In [None]:
import csv
import ast

def read_csv(file_path):
    with open(file_path, mode='r') as file:
        reader = csv.DictReader(file)
        return [row for row in reader]


Read the datasets from CSV files

In [None]:
symptoms_data = read_csv('data/symtoms_df.csv')
description_data = read_csv('data/description.csv')
diet_data = read_csv('data/diets.csv')
medications_data = read_csv('data/medications.csv')
precautions_data = read_csv('data/precautions_df.csv')
workout_data = read_csv('data/workout_df.csv')

Convert string representation of lists into actual lists

In [None]:
def parse_list_column(column_value):
    """Convert a string representation of a list into an actual list."""
    try:
        return ast.literal_eval(column_value)
    except (SyntaxError, ValueError):
        return [] 

Parsing all columns and Create the combined dataset

In [None]:

def merge_data(symptoms, descriptions, diets, medications, precautions, workouts):
    combined_data = []
    for symptom in symptoms:
        disease = symptom['Disease']
        description = next((desc['description'] for desc in descriptions if desc['Disease'] == disease), "")
    
        diet_list = parse_list_column(next((diet['Diet'] for diet in diets if diet['Disease'] == disease), "[]"))
        medication_list = parse_list_column(next((med['Medication'] for med in medications if med['Disease'] == disease), "[]"))

        precautions_list = {f'Precaution_{i+1}': next((precaution[f'Precaution_{i+1}'] for precaution in precautions if precaution['Disease'] == disease), "") for i in range(4)}
        
        workout = next((workout['workout'] for workout in workouts if workout['Disease'] == disease), "")

        combined_row = {
            'Disease': disease,
            'description': description,
            'workout': workout,
            'Symptom_1': symptom['Symptom_1'],
            'Symptom_2': symptom['Symptom_2'],
            'Symptom_3': symptom['Symptom_3'],
            'Symptom_4': symptom['Symptom_4'],
        }

        for i in range(4):
            combined_row[f'Diet_{i+1}'] = diet_list[i] if i < len(diet_list) else ""

        for i in range(4):
            combined_row[f'Medication_{i+1}'] = medication_list[i] if i < len(medication_list) else ""

        combined_row.update(precautions_list)

        combined_data.append(combined_row)

    return combined_data

Create and save the combined dataset

In [None]:
combined_data = merge_data(symptoms_data, description_data, diet_data, medications_data, precautions_data, workout_data)

Save combined dataset to CSV

In [None]:
def save_combined_dataset(combined_data, file_path):
    keys = combined_data[0].keys()
    with open(file_path, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(combined_data)

In [None]:

save_combined_dataset(combined_data, 'data/combined_dataset.csv')

print("Combined dataset created and saved successfully!")

Combined dataset created and saved successfully!


Tokenizing

In [None]:
import pandas as pd
import re
import numpy as np
import pickle


combined_dataset = pd.read_csv('data/combined_dataset.csv')  # Adjust the filename if needed

def tokenize(text):
    return re.findall(r'\b\w+\b', text)

def tokenize_columns(dataset):
    for column in dataset.columns:
        dataset[column] = dataset[column].apply(lambda x: tokenize(x) if isinstance(x, str) else x)
    return dataset

combined_dataset = tokenize_columns(combined_dataset)

Lowercasing

In [None]:
def lowercase_data(dataset):
    for column in dataset.columns:
        dataset[column] = dataset[column].apply(lambda x: [word.lower() for word in x] if isinstance(x, list) else x)
    return dataset

combined_dataset = lowercase_data(combined_dataset)

Stopwords Removal

In [None]:
stopwords = set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", 
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", 
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", 
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that", 
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", 
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", 
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", 
    "at", "by", "for", "with", "about", "against", "between", "into", "through", 
    "during", "before", "after", "above", "below", "to", "from", "up", "down", 
    "in", "out", "on", "off", "over", "under", "again", "further", "then", 
    "once", "here", "there", "when", "where", "why", "how", "all", "any", 
    "both", "each", "few", "more", "most", "other", "some", "such", "no", 
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", 
    "t", "can", "will", "just", "don", "should", "now"
])

def remove_stopwords(dataset):
    for column in dataset.columns:
        dataset[column] = dataset[column].apply(lambda x: [word for word in x if word not in stopwords] if isinstance(x, list) else x)
    return dataset

combined_dataset = remove_stopwords(combined_dataset)


Advanced Stemming - Porter Stemming Algorithm

In [None]:
def stem(word):

    suffixes = {
        'sses': 'ss', 'ies': 'i', 'eed': 'ee', 'ed': '', 'ing': '',
        'es': '', 's': '', 'ment': '', 'tion': '', 'ness': '', 'ity': '', 'ly': ''
    }
    for suffix, replacement in suffixes.items():
        if word.endswith(suffix):
            return word[:-len(suffix)] + replacement
    return word

def advanced_stemming(dataset):
    for column in dataset.columns:
        dataset[column] = dataset[column].apply(lambda x: [stem(word) for word in x] if isinstance(x, list) else x)
    return dataset

combined_dataset = advanced_stemming(combined_dataset)


Normalization

In [None]:
import re

def normalize_data(dataset):
    for column in dataset.columns:
        dataset[column] = dataset[column].apply(
            lambda x: x.strip().replace(r'\s+', ' ') if isinstance(x, str) else x  # Use raw string for regex
        )
    return dataset

combined_dataset = normalize_data(combined_dataset)


Saving the preprocessed dataset

In [None]:
def save_dataset(dataset, filename):
    dataset.to_csv(filename, index=False)

save_dataset(combined_dataset, 'data/preprocessed_combined_dataset.csv')


Read the Preprocessed Dataset

In [None]:
preprocessed_dataset = pd.read_csv('data/combined_dataset.csv')


Creation of vocabulary and checking

In [None]:
import csv
import re

# Load your data
with open('data/combined_dataset.csv', 'r') as file:
    reader = csv.DictReader(file)
    data = [row for row in reader]


def create_vocabulary(data, columns_to_use):
    vocabulary = {}
    index = 0
    for row in data:
        for column in columns_to_use:
            words = row[column].split()  
            for word in words:
                if word not in vocabulary:
                    vocabulary[word] = index
                    index += 1
    return vocabulary


columns_to_use = ['description', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Diet_1', 'Diet_2', 'Diet_3', 'Diet_4', 'Medication_1', 'Medication_2', 'Medication_3', 'Medication_4', 'Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']
vocabulary = create_vocabulary(data, columns_to_use)


print("Vocabulary Size:", len(vocabulary))
print("Sample Vocabulary:", dict(list(vocabulary.items())[:10]))  # Display first 10 words and their indices


Vocabulary Size: 683
Sample Vocabulary: {'Fungal': 0, 'infection': 1, 'is': 2, 'a': 3, 'common': 4, 'skin': 5, 'condition': 6, 'caused': 7, 'by': 8, 'fungi.': 9}


Saving Vocabulary

In [None]:
import csv

def save_vocabulary(vocabulary, filename):
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Word', 'Index'])
        for word, index in vocabulary.items():
            writer.writerow([word, index])

save_vocabulary(vocabulary, 'vocabulary.csv')


Implementation of Bag of Words (BoW)

In [None]:
from collections import Counter


def bag_of_words(text, vocabulary):
    word_count = Counter(text)
    return [word_count.get(word, 0) for word in vocabulary]

def create_bow_dataset(data, columns, vocabulary):
    bow_dataset = []
    for row in data:
        bow_representation = []
        for column in columns:
            text = row[column]
            if isinstance(text, str):
                words = re.findall(r'\b\w+\b', text.lower())  # Tokenize
                bow_representation.extend(bag_of_words(words, vocabulary))
        bow_dataset.append(bow_representation)
    return bow_dataset

# Example usage
bow_data = create_bow_dataset(data, columns_to_use, list(vocabulary.keys()))
print("Bag of Words representation for first entry:", bow_data[0])


Bag of Words representation for first entry: [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Implementation of TF-IDF

In [None]:
import math

def compute_tf(text, vocabulary):
    word_count = Counter(text)
    total_words = len(text)
    return {word: count / total_words for word, count in word_count.items()}

def compute_idf(data, vocabulary):
    document_count = len(data)
    idf = {}
    for word in vocabulary:
        docs_containing_word = sum(1 for row in data if word in row)
        idf[word] = math.log(document_count / (1 + docs_containing_word))
    return idf


def compute_tfidf(data, columns, vocabulary):
    tfidf_data = []
    idf = compute_idf(data, vocabulary)
    for row in data:
        tfidf_representation = []
        for column in columns:
            text = row[column]
            if isinstance(text, str):
                words = re.findall(r'\b\w+\b', text.lower())  # Tokenize
                tf = compute_tf(words, vocabulary)
                tfidf_representation.extend([tf.get(word, 0) * idf.get(word, 0) for word in vocabulary])
        tfidf_data.append(tfidf_representation)
    return tfidf_data

tfidf_data = compute_tfidf(data, columns_to_use, list(vocabulary.keys()))
print("TF-IDF representation for first entry:", tfidf_data[0])


TF-IDF representation for first entry: [0.0, 0.8501063809486354, 0.8501063809486354, 0.8501063809486354, 0.8501063809486354, 0.8501063809486354, 0.8501063809486354, 0.8501063809486354, 0.8501063809486354, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

Saving BoW and TF-IDF

In [None]:
def save_bow_data(bow_data, filename):
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['BoW_' + str(i) for i in range(len(bow_data[0]))])  # Header for BoW columns
        for row in bow_data:
            writer.writerow(row)

save_bow_data(bow_data, 'bow_data.csv')

In [None]:

def save_tfidf_data(tfidf_data, filename):
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['TF-IDF_' + str(i) for i in range(len(tfidf_data[0]))])  # Header for TF-IDF columns
        for row in tfidf_data:
            writer.writerow(row)

save_tfidf_data(tfidf_data, 'tfidf_data.csv')


In [None]:
import csv
import re


embedding_dim = 100 
window_size = 2     
min_freq = 2         

word_freq = {}  
vocabulary = {}  
word_count = 0   


data = []
with open('data/combined_dataset.csv', 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        data.append(row)

        
        for column in ['description', 'Diet_1','Diet_2','Diet_3','Diet_4', 
                       'Medication_1','Medication_2','Medication_3','Medication_4', 
                       'Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4', 
                       'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'workout']:
            text = row[column]
            if isinstance(text, str):
                words = re.findall(r'\b\w+\b', text.lower()) 
                for word in words:
                    word_freq[word] = word_freq.get(word, 0) + 1  

# Filter words by frequency threshold and create vocabulary
for word, count in word_freq.items():
    if count >= min_freq:  # Only keep words that appear more than or equal to the minimum frequency
        vocabulary[word] = word_count
        word_count += 1


vocab_size = len(vocabulary)

print(f"Vocabulary Size (after frequency threshold): {vocab_size}")
print("Sample Vocabulary:", dict(list(vocabulary.items())[:10]))  # Display first 10 words and their indices


Vocabulary Size (after frequency threshold): 632
Sample Vocabulary: {'fungal': 0, 'infection': 1, 'is': 2, 'a': 3, 'common': 4, 'skin': 5, 'condition': 6, 'caused': 7, 'by': 8, 'fungi': 9}


In [None]:
import csv


def save_vocabulary(vocabulary, filename):
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Word', 'Index'])
        for word, index in vocabulary.items():
            writer.writerow([word, index])


save_vocabulary(vocabulary, 'vocabulary.csv')


Generating all the Bi-Grams

In [None]:
import csv
import re
from collections import defaultdict


with open('data/combined_dataset.csv', 'r') as file:
    reader = csv.DictReader(file)
    data = [row for row in reader]


def generate_bigrams(text):
    words = re.findall(r'\b\w+\b', text.lower())
    bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
    return bigrams


def generate_bigrams_for_dataset(data, columns_to_use):
    dataset_bigrams = defaultdict(list)
    for row in data:
        for column in columns_to_use:
            text = row[column]
            if isinstance(text, str):
                bigrams = generate_bigrams(text)
                dataset_bigrams[column].extend(bigrams)
    return dataset_bigrams


columns_to_use = [
    'description', 'Diet_1', 'Diet_2', 'Diet_3', 'Diet_4',
    'Medication_1', 'Medication_2', 'Medication_3', 'Medication_4',
    'Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4',
    'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'workout'
]


bigrams_dataset = generate_bigrams_for_dataset(data, columns_to_use)


sample_column = 'description'
print(f"Bigrams for column '{sample_column}':", bigrams_dataset[sample_column][:10])  # Display first 10 bigrams


Bigrams for column 'description': [('fungal', 'infection'), ('infection', 'is'), ('is', 'a'), ('a', 'common'), ('common', 'skin'), ('skin', 'condition'), ('condition', 'caused'), ('caused', 'by'), ('by', 'fungi'), ('fungal', 'infection')]


In [None]:
import csv

def save_bigrams_to_csv(bigrams_dataset, output_file):
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Column', 'Word1', 'Word2'])  # Header
        for column, bigrams in bigrams_dataset.items():
            for bigram in bigrams:
                writer.writerow([column, bigram[0], bigram[1]])


save_bigrams_to_csv(bigrams_dataset, 'bigrams.csv')

print("Bigrams have been saved to 'bigrams.csv'.")


Bigrams have been saved to 'bigrams.csv'.


In [87]:
import csv

# Load bigrams from a CSV file
def load_bigrams_from_csv(input_file):
    bigrams_dataset = {}
    with open(input_file, mode='r') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the header
        for row in reader:
            column, word1, word2 = row
            if column not in bigrams_dataset:
                bigrams_dataset[column] = []
            bigrams_dataset[column].append((word1, word2))
    return bigrams_dataset

# Load bigrams from 'bigrams.csv'
bigrams_dataset = load_bigrams_from_csv('bigrams.csv')

# Display a sample of the loaded bigrams
print("Loaded bigrams for each column:")
for column, bigrams in bigrams_dataset.items():
    print(f"Column: {column}, Bigrams: {bigrams[:5]}")


Loaded bigrams for each column:
Column: description, Bigrams: [('fungal', 'infection'), ('infection', 'is'), ('is', 'a'), ('a', 'common'), ('common', 'skin')]
Column: Diet_1, Bigrams: [('antifungal', 'diet'), ('antifungal', 'diet'), ('antifungal', 'diet'), ('antifungal', 'diet'), ('antifungal', 'diet')]
Column: Diet_2, Bigrams: [('omega', '3'), ('3', 'rich'), ('rich', 'foods'), ('omega', '3'), ('3', 'rich')]
Column: Diet_3, Bigrams: [('vitamin', 'c'), ('c', 'rich'), ('rich', 'foods'), ('vitamin', 'c'), ('c', 'rich')]
Column: Diet_4, Bigrams: [('coconut', 'oil'), ('coconut', 'oil'), ('coconut', 'oil'), ('coconut', 'oil'), ('coconut', 'oil')]
Column: Medication_1, Bigrams: [('antifungal', 'cream'), ('antifungal', 'cream'), ('antifungal', 'cream'), ('antifungal', 'cream'), ('antifungal', 'cream')]
Column: Medication_2, Bigrams: [('h2', 'blockers'), ('h2', 'blockers'), ('h2', 'blockers'), ('h2', 'blockers'), ('h2', 'blockers')]
Column: Medication_3, Bigrams: [('integrase', 'inhibitors'), (

Generation of ebeddings and training with skip-gram approach

In [None]:
import numpy as np
import re

vocab_size = len(vocabulary)
embedding_dim = 100
window_size = 2


embeddings = np.random.uniform(-0.5, 0.5, (vocab_size, embedding_dim))


def generate_training_data(data, columns, window_size):
    pairs = []
    for row in data:
        for column in columns:
            text = row[column]
            if isinstance(text, str):
                words = re.findall(r'\b\w+\b', text.lower())
                for i, word in enumerate(words):
                    if word in vocabulary:
                        start = max(0, i - window_size)
                        end = min(len(words), i + window_size + 1)
                        context = [words[j] for j in range(start, end) if j != i and words[j] in vocabulary]
                        for context_word in context:
                            pairs.append((word, context_word))
    return pairs


columns_to_use = [
    'description', 'Diet_1', 'Diet_2', 'Diet_3', 'Diet_4',
    'Medication_1', 'Medication_2', 'Medication_3', 'Medication_4',
    'Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4',
    'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'workout'
]
pairs = generate_training_data(data, columns_to_use, window_size)


def train_embeddings(pairs, vocabulary, embeddings, embedding_dim, learning_rate=0.01, epochs=10):
    for epoch in range(epochs):
        total_loss = 0
        for word, context in pairs:
            if word in vocabulary and context in vocabulary:
                word_idx = vocabulary[word]
                context_idx = vocabulary[context]
                
    
                word_vector = embeddings[word_idx]
                context_vector = embeddings[context_idx]
                dot_product = np.dot(word_vector, context_vector)
                prediction = 1 / (1 + np.exp(-dot_product))  
                
                
                error = 1 - prediction  
                total_loss += -np.log(prediction + 1e-7)  
                gradients = learning_rate * error
                
                
                embeddings[word_idx] += gradients * context_vector
                embeddings[context_idx] += gradients * word_vector
        
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")


train_embeddings(pairs, vocabulary, embeddings, embedding_dim)


Epoch 1/10, Loss: 16798.6316
Epoch 2/10, Loss: 1319.1855
Epoch 3/10, Loss: 685.1501
Epoch 4/10, Loss: 454.7084
Epoch 5/10, Loss: 336.8711
Epoch 6/10, Loss: 265.8598
Epoch 7/10, Loss: 218.6373
Epoch 8/10, Loss: 185.0895
Epoch 9/10, Loss: 160.0965
Epoch 10/10, Loss: 140.7967


In [105]:
import pickle

# Save the embeddings to a file
def save_embeddings(embeddings, vocabulary, filename="embeddings.pkl"):
    with open(filename, "wb") as f:
        pickle.dump({"embeddings": embeddings, "vocabulary": vocabulary}, f)
    print(f"Embeddings saved to {filename}")

# Save embeddings and vocabulary
save_embeddings(embeddings, vocabulary)


Embeddings saved to embeddings.pkl


In [None]:

def load_embeddings(filename="embeddings.pkl"):
    with open(filename, "rb") as f:
        data = pickle.load(f)
    print(f"Embeddings loaded from {filename}")
    return data["embeddings"], data["vocabulary"]

loaded_embeddings, loaded_vocabulary = load_embeddings()


Embeddings loaded from embeddings.pkl


Testing

In [None]:

sample_word = "disease"
if sample_word in loaded_vocabulary:
    print(f"Embedding for '{sample_word}': {loaded_embeddings[loaded_vocabulary[sample_word]]}")
else:
    print(f"'{sample_word}' not found in loaded vocabulary.")


Embedding for 'disease': [-0.10054018  0.10284898 -0.60790361 -0.01700412  0.02333419 -0.51426736
 -0.535675    0.13848301  0.4233241  -0.44649756 -0.26827167  0.20818069
  0.19079463 -0.39416587 -0.51196245  1.30865582 -0.13976438  0.54903058
  0.3639182   0.37242392 -0.7502198  -0.01976446 -0.0246828   0.3327786
  0.08731805 -0.49553125  0.30094634  0.07316593  0.73843593  0.35641281
 -0.6303207  -0.69798242  0.57554557  0.14621616  0.60849268 -0.02991949
  0.24946712 -0.22230875 -0.80820388  0.38109037 -0.30266972 -0.5637002
  0.68037392  0.26794855 -0.1386903   0.24813732 -0.43528649 -0.61644687
  0.74692797 -0.14782086 -1.28959942  0.84152336 -0.51372691  0.10948487
 -0.07024075 -0.25500919  0.17359464 -0.98217481  0.70681954  0.68528325
 -0.2010956  -0.24347601 -0.44796341 -0.45983275 -0.4075873  -0.40895375
 -0.14841514  0.21981456  0.44933993  0.40458238 -0.28440485  0.53138137
 -0.56113667  0.49335556  0.53588773  0.04016816  0.05513432  0.57180697
  0.07047294 -0.28377546 -1.

Cosine Similarity

In [None]:
from numpy.linalg import norm
import numpy as np


def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))


def check_word_embedding(word, vocabulary, embeddings):
    word_idx = vocabulary.get(word)
    if word_idx is not None:
        return embeddings[word_idx]
    else:
        print(f"Word '{word}' not found in vocabulary.")
        return None


word1_vec = check_word_embedding("disease", vocabulary, embeddings)
word2_vec = check_word_embedding("medication", vocabulary, embeddings)

if word1_vec is not None and word2_vec is not None:
    similarity = cosine_similarity(word1_vec, word2_vec)
    print(f"Cosine Similarity between 'disease' and 'medication': {similarity:.4f}")


Cosine Similarity between 'disease' and 'medication': 0.1492


Model selection and training

In [None]:
import csv
import re
import random
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

def prepare_data_for_naive_bayes(data, columns):
    X, y = [], []
    for row in data:
        symptoms = []
        for column in columns:
            text = row[column]
            if isinstance(text, str):
                symptoms.extend(re.findall(r'\b\w+\b', text.lower()))
        X.append(" ".join(symptoms))
        y.append(row['Disease'])  
    return X, y


columns_to_use = ['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4']
X, y = prepare_data_for_naive_bayes(data, columns_to_use)


vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(X)

model = MultinomialNB()
model.fit(X_vec, y)


def predict_disease(user_input):
    input_vec = vectorizer.transform([user_input])
    predicted_disease = model.predict(input_vec)[0]
    return predicted_disease


user_input = "hey there i am sufering with fatigue and yellowish_skin"
predicted_disease = predict_disease(user_input)
print(f"Predicted Disease for input '{user_input}': {predicted_disease}")


Predicted Disease for input 'hey there i am sufering with fatigue and yellowish_skin': Hepatitis B


Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(X, y, model):
    X_vec = vectorizer.transform(X)
    y_pred = model.predict(X_vec)
    
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y, y_pred, average='weighted', zero_division=1)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

evaluate_model(X, y, model)


Accuracy: 0.9951219512195122
Precision: 0.9952380952380953
Recall: 0.9951219512195122
F1 Score: 0.9951204258759286


In [None]:
# import csv
# import re
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.feature_extraction.text import CountVectorizer


# def load_data(file_path):
#     data = []
#     with open(file_path, 'r') as file:
#         reader = csv.DictReader(file)
#         for row in reader:
#             data.append(row)
#     return data


# def prepare_data_for_naive_bayes(data, symptom_columns):
#     X, y = [], []
#     for row in data:
#         symptoms = []
#         for column in symptom_columns:
#             text = row[column]
#             if isinstance(text, str):
#                 symptoms.extend(re.findall(r'\b\w+\b', text.lower()))
#         X.append(" ".join(symptoms))
#         y.append(row['Disease'])  
#     return X, y

# def map_disease_info(data):
#     disease_info = {}
#     for row in data:
#         disease_name = row['Disease']
#         if disease_name not in disease_info:
#             disease_info[disease_name] = {
#                 "description": row['description'],
#                 "workout": row['workout'],
#                 "diets": [row[f'Diet_{i}'] for i in range(1, 5) if row[f'Diet_{i}']],
#                 "medications": [row[f'Medication_{i}'] for i in range(1, 5) if row[f'Medication_{i}']],
#                 "precautions": [row[f'Precaution_{i}'] for i in range(1, 5) if row[f'Precaution_{i}']]
#             }
#     return disease_info


# def predict_disease_info(user_input, model, vectorizer, disease_info):
#     input_vec = vectorizer.transform([user_input])
#     predicted_disease = model.predict(input_vec)[0]
#     return {
#         "disease": predicted_disease,
#         "details": disease_info.get(predicted_disease, {})
#     }

# data_file = 'data/combined_dataset.csv'  
# symptom_columns = ['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4']


# data = load_data(data_file)

# X, y = prepare_data_for_naive_bayes(data, symptom_columns)


# vectorizer = CountVectorizer()
# X_vec = vectorizer.fit_transform(X)


# model = MultinomialNB()
# model.fit(X_vec, y)


# disease_info = map_disease_info(data)


# user_input = "I have suffering from acidity"
# result = predict_disease_info(user_input, model, vectorizer, disease_info)

# # Display results
# print(f"Predicted Disease: {result['disease']}")
# print("Details:")
# print(f"  Description: {result['details'].get('description', 'N/A')}")
# print(f"  Workout: {result['details'].get('workout', 'N/A')}")
# print(f"  Diets: {', '.join(result['details'].get('diets', []))}")
# print(f"  Medications: {', '.join(result['details'].get('medications', []))}")
# print(f"  Precautions: {', '.join(result['details'].get('precautions', []))}")


Predicted Disease: Migraine
Details:
  Description: Migraine is a type of headache that often involves severe pain and sensitivity to light and sound.
  Workout: Identify and avoid trigger foods
  Diets: Migraine Diet, Low-Tyramine Diet, Caffeine withdrawal, Hydration
  Medications: Analgesics, Triptans, Ergotamine derivatives, Preventive medications
  Precautions: meditation, reduce stress, use poloroid glasses in sun, consult doctor
