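### This notebook computes the cosine similarity between a symptom description and each disease description, and then trains a logistic regression classifier to predict the disease from free text.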

In [ ]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pickle
from numpy import dot
from numpy.linalg import norm

In [ ]:
# Restore the pickled stop words collection from disk.
# NOTE(review): unpickling can execute arbitrary code — only load a trusted 'stop_words.ob'.
with open('stop_words.ob', 'rb') as stop_words_file:
    domain_stop_word = pickle.load(stop_words_file)

In [ ]:
# Read the CSV dataset into a DataFrame and print its first rows as a sanity check.
# The 'Description' and 'D_Name' columns are used by later cells.
file_path = 'diseases_with_description.csv'
df = pd.read_csv(file_path)
print(df.head())

In [ ]:
def clean_text_func(text, stop_words=None):
    """Lowercase *text*, strip punctuation and digits, and drop stop words.

    Args:
        text: Value to clean. It is converted with ``str``. ``None`` and
            float NaN (what pandas yields for a missing cell) are treated
            as empty text instead of turning into the tokens
            "none" / "nan".
        stop_words: Optional collection of words to discard. When omitted,
            the module-level ``domain_stop_word`` is used.

    Returns:
        The surviving tokens, each followed by a single space (so a
        non-empty result ends with a trailing space), or ``""`` when
        nothing survives.
    """
    # Missing values would otherwise be stringified into spurious tokens.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stop_words is None:
        stop_words = domain_stop_word

    # After lowercasing, only a-z, "/" and "^" are kept; every other
    # character (punctuation, digits, any whitespace) becomes a separator.
    # This single pass yields the same tokens as the former chain of
    # substitutions, whose ":" rule could never match because colons had
    # already been replaced by the first substitution.
    text = re.sub(r"[^a-z/^]", " ", str(text).lower())

    # join() avoids the quadratic cost of repeated string concatenation.
    return "".join(f"{word} " for word in text.split() if word not in stop_words)

# Clean every description in place, then display the first rows of the result
df['Description'] = df['Description'].apply(clean_text_func)
df.head()

# Text Vectorization (Bag-of-Words and TF-IDF):

In [ ]:
# Raw term counts of the cleaned descriptions (English stop words removed)
cv = CountVectorizer(stop_words="english")
X = cv.fit_transform(df.Description)

# TF-IDF weights of the same descriptions (English stop words removed)
cv_tfidf = TfidfVectorizer(stop_words="english")
X_tfidf = cv_tfidf.fit_transform(df.Description)

In [ ]:
# Dense, column-labelled views of both sparse matrices: one row per
# description, one column per vocabulary term.
df_cv = pd.DataFrame(
    data=X.toarray(),
    columns=cv.get_feature_names_out(),
)
df_tfidf = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=cv_tfidf.get_feature_names_out(),
)

In [ ]:
# Show the (rows, columns) shape of the dense count-feature DataFrame
print(df_cv.shape)
def cosine(v1, v2):
    """Return the cosine similarity of two equal-length 1-D vectors.

    Args:
        v1: First vector (any array-like accepted by ``numpy.dot``).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (norm(v1) * norm(v2))``, or ``0.0`` when either
        vector has zero norm. A zero vector occurs when a text shares no
        term with the fitted vocabulary; dividing by its norm would yield
        NaN and a runtime warning.
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator

### Cosine Similarity

In [ ]:
# Vectorize one free-text symptom description with both fitted vectorizers
new_text = ["dizziness loss of balance vomiting tinnitus of hearing in the high frequency range in one ear difficulty focusing your eyes"]
new_text_cv = cv.transform(new_text).toarray()[0]
new_text_tfidf = cv_tfidf.transform(new_text).toarray()[0]

# Report how similar the query is to every row of the dataset
for chapter_number in range(len(df)):
    print(f"Chapter number: {chapter_number}")
    print(
        f"Cosine similarity with CountVectorizer: {cosine(df_cv.iloc[chapter_number], new_text_cv)}"
    )
    print(
        f"Cosine similarity with TFIDF: {cosine(df_tfidf.iloc[chapter_number], new_text_tfidf)}"
    )

### Implementation of the classification model: Logistic Regression

In [ ]:
# Inputs are the cleaned descriptions; targets come from the D_Name column
# (presumably the disease name — confirm against the CSV).
X_train = df['Description']
y_train = df['D_Name']

In [ ]:
# Learn a count vocabulary on the training text (no built-in stop word list
# here) and keep a dense, column-labelled copy of the matrix.
cv1 = CountVectorizer()
X_train_cv1 = cv1.fit_transform(X_train)
pd_cv1 = pd.DataFrame(
    data=X_train_cv1.toarray(),
    columns=cv1.get_feature_names_out(),
)

In [ ]:
# Train a logistic regression classifier with default hyperparameters on the
# count features. All rows are used for training; no hold-out set is kept.
lr = LogisticRegression()
lr.fit(X_train_cv1, y_train)

In [ ]:
# A single free-text example, cleaned with the same routine that was applied
# to the training descriptions.
X_test = "Difficulty sleeping or staying asleep Fever Fluid draining from ear Loss of balance. Hearing difficulties. Ear pain"
cleaned_text = clean_text_func(X_test)

In [ ]:
# Vectorize the cleaned example with the vocabulary fitted on the training
# text (transform only, no refit) and print the predicted label.
X_test_cv3 = cv1.transform([cleaned_text])
y_pred_cv3 = lr.predict(X_test_cv3)
print(y_pred_cv3)

### Conclusion: 
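The model returns a disease prediction for the provided user input. Note that it was trained on the entire dataset and tried on a single example only, so its accuracy has not been measured in this notebook.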