<a href="https://colab.research.google.com/github/qaisalzaghal/HealingCall/blob/main/Healing_Call.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **Collecting data**

In [None]:
# Read the Symptom2Disease CSV from the hard-coded /content path into `df`
# and display its first rows.
df=pd.read_csv('/content/Symptom2Disease.csv')
df.head()

In [None]:
# Display the last 20 rows of the dataset.
df.tail(20)

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Count repeated samples using only the feature and target columns.
# At this point `df` still contains the "Unnamed: 0" column (dropped further down as a
# non-feature); including it in the comparison would hide rows whose text and label are
# identical but whose "Unnamed: 0" value differs.
df.duplicated(subset=["text", "label"]).sum()

In [None]:
# Show the column labels of the dataset.
df.columns

In [None]:
# Number of samples per value of the "label" column.
df["label"].value_counts()

## Explore data balance

In [None]:
# Class frequencies are computed once and shared by both panels.
counts = df["label"].value_counts()

fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(18, 6))

# Left panel: number of samples per label as a bar chart.
sns.barplot(x=counts.index, y=counts.values, palette="rocket", ax=ax_bar)
ax_bar.set_xlabel("Disease", fontsize=12)
ax_bar.set_ylabel("Count", fontsize=12)
ax_bar.set_title("Count of Diseases", size=16)
# Label names are long, so the tick labels are rotated to stay readable.
ax_bar.tick_params(axis="x", labelrotation=90)

# Right panel: the same counts as shares of the whole dataset.
ax_pie.pie(counts,
           labels=counts.index,
           startangle=90,
           wedgeprops={'edgecolor': 'white', 'linewidth': 1})
ax_pie.set_title("Distribution of Diseases", size=16)

fig.tight_layout()
plt.show()

## Take "text" as feature, "label" as label and drop "Unnamed: 0"

In [None]:
# Remove the "Unnamed: 0" column. errors="ignore" keeps this cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
#df.to_csv('new_dataframe_disease.csv', index=False)

In [None]:
#df=pd.read_csv("new_dataframe_disease.csv")

# Preprocessing data

In [None]:
import nltk

# Download the NLTK resources one by one, in the original order.
for resource in ("punkt", "stopwords", "wordnet", "averaged_perceptron_tagger"):
    nltk.download(resource)
# Left as the final expression so the cell still displays this call's return value.
nltk.download('punkt_tab')

## Remove stopwords and punctuation

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Translation table that deletes every ASCII punctuation character in a single pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# English stopwords as a set, built once instead of once per word.
# Punctuation is stripped from the text before filtering, so a contraction such as
# "don't" reaches the filter as "dont"; the punctuation-free form of every stopword is
# therefore included alongside the original form.
_STOPWORDS = {
    form
    for word in stopwords.words('english')
    for form in (word, word.translate(_PUNCT_TABLE))
}

def preprocess_data(text):
  """Normalize one symptom description for vectorization.

  Steps: delete ASCII punctuation, lowercase, drop English stopwords, then
  tokenize with NLTK and re-join the tokens with single spaces.

  Lowercasing happens before the stopword filter because the stopword list is
  lowercase; filtering first would leave capitalized stopwords ("I", "The")
  in the output.

  Args:
    text: the raw description as a str.

  Returns:
    The cleaned, lowercase text as a single space-separated str.
  """
  text = text.translate(_PUNCT_TABLE).lower()
  kept = [word for word in text.split() if word not in _STOPWORDS]
  words = word_tokenize(" ".join(kept))
  return " ".join(words)

# Clean every description element-wise and keep the result next to the raw text.
cleaned_series = df["text"].map(preprocess_data)
df["cleaned_text"] = cleaned_series

df.head()


In [None]:
#df.to_csv('new_dataframe_myproject_preprocess.csv', index=False)

In [None]:
#df=pd.read_csv("new_dataframe_myproject_preprocess.csv")

## text length before and after preprocessing

In [None]:
# Word counts (whitespace-separated tokens) before and after cleaning.
df["text_length"] = df["text"].str.split().str.len()
df["cleaned_text_length"] = df["cleaned_text"].str.split().str.len()

# Keep only these columns, in this order.
df = df.loc[:, ["text", "text_length", "cleaned_text", "cleaned_text_length", "label"]]
df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# The seaborn style is set before any axes are created: it works through matplotlib
# rcParams, which an axes picks up at creation time, so setting it afterwards would
# leave the first panel with whatever style was active before.
sns.set_style("whitegrid")
fig, (ax_total, ax_hist) = plt.subplots(1, 2, figsize=(10, 6))

# Left panel: total number of words before and after cleaning.
x_labels = ["text_length", "cleaned_text_length"]
y_values = [df["text_length"].sum(), df["cleaned_text_length"].sum()]
sns.barplot(x=x_labels, y=y_values, ax=ax_total)
ax_total.set_title("Total Word Count")
# Write each total just above its bar.
for i, value in enumerate(y_values):
    ax_total.text(i, value, str(value), ha='center', va='bottom')

# Right panel: per-sample word-count distributions, overlaid.
ax_hist.hist(df["text_length"], bins=50, label="Original Text")
ax_hist.hist(df["cleaned_text_length"], bins=50, alpha=0.5, label="Cleaned Text")
ax_hist.set_xlabel("Word Count")
ax_hist.set_ylabel("Frequency")
ax_hist.set_title("Word Count Distribution")
ax_hist.legend()

fig.tight_layout()
# Explicit show() so the cell does not end on the Legend object's repr.
plt.show()



## ---WordCloud--- before and after preprocessing

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# One word cloud per text column; both panels share the same settings, so they are
# produced by a single loop instead of two copy-pasted sections.
panels = [("text", "Original Text"), ("cleaned_text", "Cleaned Text")]

fig, axes = plt.subplots(1, len(panels), figsize=(15, 7))

for ax, (column, title) in zip(axes, panels):
    # Join all rows of the column into one corpus string for the cloud.
    corpus = " ".join(df[column].astype(str).tolist())
    cloud = WordCloud(width=800, height=400, background_color="black", stopwords=STOPWORDS).generate(corpus)
    ax.imshow(cloud)
    ax.axis("off")
    ax.set_title(title)

fig.tight_layout()
plt.show()

In [None]:
from nltk.stem import WordNetLemmatizer, SnowballStemmer

def get_lemmatized(text):
  """Lemmatize each whitespace-separated token of `text` and re-join with single spaces."""
  wordnet = WordNetLemmatizer()
  return " ".join(wordnet.lemmatize(token) for token in text.split())

# Lemmatized version of the cleaned text, computed row by row.
df["lemmatized_text"] = df["cleaned_text"].map(get_lemmatized)

def get_stemmed(text):
  """Stem each whitespace-separated token of `text` with the English Snowball stemmer and re-join with single spaces."""
  snowball = SnowballStemmer('english')
  return " ".join(map(snowball.stem, text.split()))

# Stemming is applied on top of the lemmatized text.
df["stemmed_text"] = df["lemmatized_text"].map(get_stemmed)

# Keep only these columns, in this order.
df = df.loc[:, ["text", "text_length", "cleaned_text", "cleaned_text_length",
                "lemmatized_text", "stemmed_text", "label"]]

df.head()

In [None]:
#!pip install gensim


# preparing data for ML models

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the label column, then overwrite that column with the integer codes.
le = LabelEncoder().fit(df["label"])
df["label"] = le.transform(df["label"])
df.head()

In [None]:
#df.to_csv('new_dataframe_myproject_stemmed_and_labeled.csv', index=False)


In [None]:
#df=pd.read_csv("new_dataframe_myproject_stemmed_and_labeled.csv")

In [None]:
#!pip install sentence-transformers


## Using Sentence Transformer Vectorization

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import cross_val_score


# Features are the cleaned descriptions; targets are the values of the "label" column.
texts = df['cleaned_text']
labels = df['label']

model = SentenceTransformer('all-MiniLM-L6-v2')
# encode() is documented for a str or a list of str, so the Series is converted to a
# plain list: this keeps the call independent of the DataFrame index.
embeddings = model.encode(texts.tolist())


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the seed is fixed so the split is repeatable.
split_parts = train_test_split(embeddings, labels, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

for subset_name, subset in (("Training", X_train), ("Testing", X_test)):
    print(f"{subset_name} data shape: {subset.shape}")

In [None]:
"""
np.save("myproject_embeddings_using(all-MiniLM-L6-v2).npy", embeddings)
df['label'].to_csv("myproject_labels.csv", index=False)
"""
"""
X = np.load("myproject_embeddings_using(all-MiniLM-L6-v2).npy")
y = pd.read_csv("myproject_labels.csv").values.ravel()
"""

## training Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# 5-fold cross-validation on the training split.
clf = LogisticRegression(max_iter=1000)
scores_LogisticRegression = cross_val_score(clf, X_train, y_train, cv=5)

# Fit on the full training split and predict the held-out samples.
y_pred_LogisticRegression = clf.fit(X_train, y_train).predict(X_test)

for caption, stat in (
    ("Cross-validation scores:", scores_LogisticRegression),
    ("Average Accuracy:", scores_LogisticRegression.mean()),
    ("Best Accuracy:", scores_LogisticRegression.max()),
):
    print(caption, stat)

## training Random Forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 5-fold cross-validation of a 100-tree random forest on the training split.
clf_1 = RandomForestClassifier(n_estimators=100, random_state=42)
scores_RandomForestClassifier = cross_val_score(clf_1, X_train, y_train, cv=5)

for caption, stat in (
    ("Cross-validation scores:", scores_RandomForestClassifier),
    ("Average Accuracy:", np.mean(scores_RandomForestClassifier)),
    ("Best Accuracy:", scores_RandomForestClassifier.max()),
):
    print(caption, stat)

## Using Grid Search CV to enhance the Logistic Regression model

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: regularization strength C crossed with two solvers.
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}
# NOTE(review): recent scikit-learn releases reportedly warn that 'liblinear' is deprecated
# for multiclass targets — confirm against the installed version before relying on it.

# Both solvers in the grid shuffle the data, so the seed is fixed to make the search
# give the same result on every run.
grid = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42), param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best accuracy:", grid.best_score_)


## compare the accuracy of all models

In [None]:
# Mean cross-validation accuracy of each approach, in bar order.
score_values = [
    np.mean(scores_LogisticRegression),
    np.mean(scores_RandomForestClassifier),
    grid.best_score_,
]

sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    ["LogisticRegression", "Random Forest", "GridSearchCV"],
    score_values,
    color=['darkred', 'midnightblue', 'darkgreen'],
)
# Annotate each bar with its accuracy as a whole-number percentage.
for i, value in enumerate(score_values):
    ax.text(i, value + 0.01, f"{value * 100:.0f}%", ha='center', va='bottom', size=14)

ax.set_xlabel("Models", size=13)
ax.set_ylabel("Accuracy", size=13)
ax.set_title("Model Accuracy Comparison", size=15)
ax.set_ylim(0, 1)
ax.set_yticks(np.arange(0.1, 1.0, 0.05))
fig.tight_layout()

In [None]:
#import joblib
# Disabled snippets kept as bare string literals, so none of this code executes:
# the first would dump the fitted `grid` object with joblib, the second would load it back.
"""
joblib.dump(grid, 'myproject_grid_model_trained.joblib')
"""

"""
loaded_model = joblib.load('myproject_grid_model_trained.joblib')
"""

In [None]:
"""
joblib.dump(X_train, 'X_train_myproject.pkl')
joblib.dump(X_test, 'X_test_myproject.pkl')
joblib.dump(y_train, 'y_train_myproject.pkl')
joblib.dump(y_test, 'y_test_myproject.pkl')
"""
"""
X_train = joblib.load('X_train_myproject.pkl')
X_test = joblib.load('X_test_myproject.pkl')
y_train = joblib.load('y_train_myproject.pkl')
y_test = joblib.load('y_test_myproject.pkl')
"""

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Score the fitted grid-search object on the held-out split.
y_pred = grid.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)


In [None]:
from sklearn.metrics import confusion_matrix



# Confusion matrix of true vs. predicted labels on the held-out split.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix", size=20)
ax.set_xlabel("Predicted", size=14)
ax.set_ylabel("Actual", size=14)
plt.show()


In [None]:
# Map every encoded class id back to its original label.
# LabelEncoder encodes the label stored at le.classes_[i] as the integer i, so the
# mapping is read straight from the fitted encoder. This avoids re-reading the CSV and
# pairing two separate value_counts() results by position, which only lines up while
# both happen to come back in the same order.
# Assumes `le` is still the encoder fitted on the original string labels.
label_count_dict = {class_id: str(name) for class_id, name in enumerate(le.classes_)}


In [None]:
# Hand-written symptom descriptions, each wrapped in a one-element list, used as sample inputs.
test_texts = ["I feel very dizzy and tired, my head hurts me very much"]
test_texts_1=["I feel pain, swelling and heaviness in the legs, and it is often accompanied by a burning sensation and discomfort."]
test_texts_2=["I sneeze a lot, my eyes itch and water, and my skin breaks out in rashes after exposure to allergens."]
test_texts_3=["I feel itchy red spots all over my body, with fever, fatigue, and blisters that burst and form scabs."]

# Sentence encoder shared by all pred() calls; created on first use so the model is
# not reloaded for every prediction.
_pred_encoder = None

def pred(text):
  """Print the predicted label for one or more symptom descriptions.

  The classifier was fitted on embeddings of the *cleaned* text, so each input
  goes through the same preprocess_data() step before it is embedded.

  Args:
    text: a single description (str) or a sequence of descriptions.

  Prints one "Predicted Label: <name>" line per description; nothing is
  printed for an empty sequence.
  """
  global _pred_encoder
  if _pred_encoder is None:
    _pred_encoder = SentenceTransformer('all-MiniLM-L6-v2')

  # A bare string would be embedded as a single 1-D vector, which the classifier
  # cannot take as a batch; wrap it so the embeddings are always 2-D.
  sentences = [text] if isinstance(text, str) else list(text)
  if not sentences:
    return

  cleaned = [preprocess_data(sentence) for sentence in sentences]
  test_embeddings = _pred_encoder.encode(cleaned)

  for predicted_label in grid.predict(test_embeddings):
    print(f"Predicted Label: {label_count_dict[predicted_label]}")
# Run the classifier on each sample input, in the original order.
for sample in (test_texts, test_texts_1, test_texts_2, test_texts_3):
  pred(sample)

"""
Predicted Label: Hypertension
Predicted Label: Varicose Veins
Predicted Label: allergy
Predicted Label: Chicken pox
"""

In [None]:
# Display the mapping used by pred() to turn an encoded label into a disease name.
label_count_dict