# LM TECH Hub Hackathon Group A

* Group A - 2025

# PROBLEM STATEMENT:

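* Develop a model that predicts sicknesses (disease labels) from free-text descriptions of symptoms.
* Three classifiers are compared on TF-IDF features: Multinomial Naive Bayes, Logistic Regression and Random Forest.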

# Import Necessary Libraries

In [55]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Load the dataset

In [2]:
# Read the symptom-to-disease dataset from a CSV expected in the current working directory.
# Later cells use its 'text' (symptom description) and 'label' (disease) columns.
stm = pd.read_csv('Symptom2Disease.csv')

### Show all rows when displaying a DataFrame (disable row truncation)

In [3]:
# Passing None removes the cap on how many rows pandas prints (this affects rows, not columns).
pd.set_option('display.max_rows', None)

In [13]:
# Preview the first 10 rows of the dataset.
stm.head(10)

Unnamed: 0,label,text
0,Psoriasis,i have been experiencing a skin rash on my arm...
1,Psoriasis,my skin has been peeling especially on my knee...
2,Psoriasis,i have been experiencing joint pain in my fing...
3,Psoriasis,there is a silver like dusting on my skin espe...
4,Psoriasis,my nails have small dents or pits in them and ...
5,Psoriasis,the skin on my palms and soles is thickened an...
6,Psoriasis,the skin around my mouth nose and eyes is red ...
7,Psoriasis,my skin is very sensitive and reacts easily to...
8,Psoriasis,i have noticed a sudden peeling of skin at dif...
9,Psoriasis,the skin on my genitals is red and inflamed it...


### Transform all text to lower case

In [6]:
# Lower-case every symptom description.
# The vectorized `.str` accessor replaces the per-row Python lambda and passes missing
# values through untouched instead of raising AttributeError; the null check happens
# in a later cell, so this step must not crash on NaN first.
stm['text'] = stm['text'].str.lower()

### Remove all punctuation marks

In [7]:
# Remove every character that is neither a word character nor whitespace.
# `.str.replace` skips missing values (the lambda + `re.sub` version raises TypeError on NaN).
# A compiled pattern is passed so matching follows Python's `re` semantics exactly as before.
stm['text'] = stm['text'].str.replace(re.compile(r'[^\w\s]'), '', regex=True)

In [12]:
# Preview the first 10 rows to inspect the cleaned text.
stm.head(10)

Unnamed: 0,label,text
0,Psoriasis,i have been experiencing a skin rash on my arm...
1,Psoriasis,my skin has been peeling especially on my knee...
2,Psoriasis,i have been experiencing joint pain in my fing...
3,Psoriasis,there is a silver like dusting on my skin espe...
4,Psoriasis,my nails have small dents or pits in them and ...
5,Psoriasis,the skin on my palms and soles is thickened an...
6,Psoriasis,the skin around my mouth nose and eyes is red ...
7,Psoriasis,my skin is very sensitive and reacts easily to...
8,Psoriasis,i have noticed a sudden peeling of skin at dif...
9,Psoriasis,the skin on my genitals is red and inflamed it...


### Drop the "Unnamed: 0" Column

In [9]:
# Drop the 'Unnamed: 0' column (it appears to be a row index saved into the CSV -- TODO confirm).
# Rebinding with errors='ignore' makes the cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError, which the in-place drop would raise.
stm = stm.drop(columns=['Unnamed: 0'], errors='ignore')

In [11]:
# Preview the first 10 rows after dropping the 'Unnamed: 0' column.
stm.head(10)

Unnamed: 0,label,text
0,Psoriasis,i have been experiencing a skin rash on my arm...
1,Psoriasis,my skin has been peeling especially on my knee...
2,Psoriasis,i have been experiencing joint pain in my fing...
3,Psoriasis,there is a silver like dusting on my skin espe...
4,Psoriasis,my nails have small dents or pits in them and ...
5,Psoriasis,the skin on my palms and soles is thickened an...
6,Psoriasis,the skin around my mouth nose and eyes is red ...
7,Psoriasis,my skin is very sensitive and reacts easily to...
8,Psoriasis,i have noticed a sudden peeling of skin at dif...
9,Psoriasis,the skin on my genitals is red and inflamed it...


### Check for null values
* We found no null values

In [14]:
# Count missing values per column (`isna` is the same check as `isnull`).
stm.isna().sum()

label    0
text     0
dtype: int64

### Check for duplicated values

In [15]:
# Number of rows that exactly repeat an earlier row (first occurrences are not counted).
stm.duplicated().sum()

47

### Drop the duplicated rows, keeping the first occurrence

In [16]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The result is rebound rather than mutated with `inplace=True`, so the cell produces a new
# frame from its input; the surviving rows keep their original index labels.
stm = stm.drop_duplicates(keep='first')

In [17]:
# Re-count duplicates to confirm none remain.
stm.duplicated().sum()

0

### Check the data types and confirm that the row count went down after removing the duplicated rows

In [18]:
# Print the column dtypes, non-null counts and the number of remaining rows.
stm.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1153 entries, 0 to 1199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1153 non-null   object
 1   text    1153 non-null   object
dtypes: object(2)
memory usage: 27.0+ KB


### Prepare the data

In [19]:
# Features are the symptom descriptions; the target is the sickness label.
X, y = stm['text'], stm['label']

### Split data into training and testing sets

In [20]:
# Hold out part of the data for evaluation.
# NOTE(review): the split is not stratified, so the number of test rows per sickness varies;
# consider `stratify=y` (this would change the split and every score below).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20% of the rows go to the test set
    random_state=42,  # fixed seed so the split is reproducible
)

### Create a TF-IDF vectorizer

* Vectorization converts the text data into numerical vectors

In [21]:
# Build the TF-IDF vectorizer; `max_features=1000` caps the vocabulary at 1000 terms,
# which fixes the width of the feature matrices produced below.
vectorizer = TfidfVectorizer(max_features=1000) # Limit features to 1000 for simplicity

### Fit the vectorizer to the training data and transform both the training and testing data

In [22]:
# Learn the vocabulary and IDF weights from the training text only, then reuse that fitted
# vectorizer on the test text so no information from the test set leaks into the features.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

### Check that the Train set is consistent

In [26]:
# (rows, features) of the training matrix; the row count should match y_train.
X_train_tfidf.shape

(922, 1000)

In [27]:
# Number of training labels; should equal the row count of X_train_tfidf.
y_train.shape

(922,)

### Check that the Test set is consistent

In [28]:
# (rows, features) of the test matrix; the feature count should match the training matrix.
X_test_tfidf.shape

(231, 1000)

In [29]:
# Number of test labels; should equal the row count of X_test_tfidf.
y_test.shape

(231,)

# (1) Train Model - using Multinomial Naive Bayes

In [30]:
# Multinomial Naive Bayes classifier with the library's default settings.
nb = MultinomialNB()

In [31]:
# Train Naive Bayes on the TF-IDF training features and their labels.
nb.fit(X_train_tfidf, y_train)

### Make Predictions on the testing data

In [32]:
# Predicted sickness label for every row of the held-out test set.
nb_pred = nb.predict(X_test_tfidf)

### Evaluate the Model's Performance

In [33]:
# Fraction of test rows where the Naive Bayes prediction matches the true label.
# Stored as `accuracy_nb` for consistency with `accuracy_lr` / `accuracy_rf`; the original
# name `accuracy` is kept as an alias because the following cell still displays it.
accuracy_nb = accuracy_score(y_test, nb_pred)
accuracy = accuracy_nb

In [34]:
# Display the Naive Bayes test accuracy.
accuracy

0.9653679653679653

# (2) Train model - using Logistic Regression

In [36]:
# Logistic Regression classifier; `max_iter=900` raises the solver's iteration limit,
# presumably to give it room to converge on these features -- TODO confirm it was needed.
lr = LogisticRegression(max_iter=900)

In [37]:
# Train Logistic Regression on the TF-IDF training features and their labels.
lr.fit(X_train_tfidf, y_train)

In [38]:
# Predicted sickness label for every row of the held-out test set.
lr_pred = lr.predict(X_test_tfidf)

In [39]:
# Display the predicted labels.
# NOTE(review): this prints the whole prediction array; a slice such as lr_pred[:10] would
# keep the output short.
lr_pred

array(['Cervical spondylosis', 'Dimorphic Hemorrhoids', 'Jaundice',
       'Bronchial Asthma', 'Common Cold', 'drug reaction', 'Impetigo',
       'Malaria', 'Fungal infection', 'Pneumonia',
       'gastroesophageal reflux disease', 'diabetes', 'drug reaction',
       'allergy', 'gastroesophageal reflux disease',
       'urinary tract infection', 'Fungal infection', 'Dengue',
       'Hypertension', 'Dengue', 'allergy', 'Bronchial Asthma', 'Dengue',
       'Typhoid', 'Arthritis', 'Varicose Veins',
       'gastroesophageal reflux disease', 'Common Cold', 'Typhoid',
       'Acne', 'gastroesophageal reflux disease', 'Psoriasis',
       'Common Cold', 'allergy', 'urinary tract infection', 'Chicken pox',
       'Typhoid', 'peptic ulcer disease', 'Typhoid',
       'Dimorphic Hemorrhoids', 'Fungal infection', 'Acne',
       'Varicose Veins', 'urinary tract infection', 'allergy',
       'Cervical spondylosis', 'Dimorphic Hemorrhoids', 'Jaundice',
       'Chicken pox', 'drug reaction', 'drug reac

In [40]:
# Fraction of test rows where the Logistic Regression prediction matches the true label.
accuracy_lr = accuracy_score(y_test, lr_pred)

In [41]:
# Display the Logistic Regression test accuracy.
accuracy_lr

0.9783549783549783

# (3) Train model - using Random Forest Classifier

In [43]:
# Random Forest with 100 trees; the fixed seed makes the trained forest reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [44]:
# Train the Random Forest on the TF-IDF training features and their labels.
rf.fit(X_train_tfidf, y_train)

In [45]:
# Predicted sickness label for every row of the held-out test set.
rf_pred = rf.predict(X_test_tfidf)

In [46]:
# Display the predicted labels.
# NOTE(review): this prints the whole prediction array; a slice such as rf_pred[:10] would
# keep the output short.
rf_pred

array(['Cervical spondylosis', 'Dimorphic Hemorrhoids', 'Jaundice',
       'Bronchial Asthma', 'Common Cold', 'drug reaction', 'Impetigo',
       'Malaria', 'Acne', 'Pneumonia', 'gastroesophageal reflux disease',
       'drug reaction', 'drug reaction', 'allergy',
       'gastroesophageal reflux disease', 'urinary tract infection',
       'Fungal infection', 'Dengue', 'Hypertension', 'Dengue', 'allergy',
       'Bronchial Asthma', 'Psoriasis', 'Typhoid', 'Arthritis',
       'Varicose Veins', 'gastroesophageal reflux disease', 'Common Cold',
       'Typhoid', 'Acne', 'gastroesophageal reflux disease', 'Psoriasis',
       'Common Cold', 'allergy', 'urinary tract infection', 'Chicken pox',
       'Typhoid', 'peptic ulcer disease', 'Typhoid',
       'Dimorphic Hemorrhoids', 'Fungal infection', 'Acne',
       'Varicose Veins', 'urinary tract infection', 'Common Cold',
       'Cervical spondylosis', 'Dimorphic Hemorrhoids', 'Jaundice',
       'Chicken pox', 'drug reaction', 'drug reaction', 

In [47]:
# Fraction of test rows where the Random Forest prediction matches the true label.
accuracy_rf = accuracy_score(y_test, rf_pred)

In [48]:
# Display the Random Forest test accuracy.
accuracy_rf

0.9567099567099567

# SUMMARY OF ACCURACY SCORES
* Naive Bayes = 0.97
* Logistic Regression = 0.98
* Random Forest Classifier = 0.96

### The best-performing algorithm is LOGISTIC REGRESSION

### Classification report
* Print the classification report of our Logistic Regression model's performance on the test set

In [50]:
# Per-class precision, recall, F1 and support for the Logistic Regression predictions.
# The report is a multi-line string, so it is printed to render the line breaks as a table.
print(classification_report(y_test, lr_pred))

                                 precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00        11
                      Arthritis       1.00      1.00      1.00         6
               Bronchial Asthma       0.91      1.00      0.95        10
           Cervical spondylosis       1.00      1.00      1.00        11
                    Chicken pox       0.89      1.00      0.94         8
                    Common Cold       1.00      1.00      1.00        11
                         Dengue       1.00      0.91      0.95        11
          Dimorphic Hemorrhoids       1.00      1.00      1.00         9
               Fungal infection       1.00      1.00      1.00        12
                   Hypertension       1.00      1.00      1.00         9
                       Impetigo       1.00      1.00      1.00        11
                       Jaundice       1.00      1.00      1.00         8
                        Malaria       1.00      1.

### Interpretation
* The 'precision', 'recall' and 'f1-score' values confirm that the Logistic Regression model performs well.

## Next...

### We use Joblib to dump / save the trained LOGISTIC REGRESSION model to a file

In [56]:
# Persist the trained Logistic Regression model to 'log_symptom.pkl'.
# NOTE(review): the model was fitted on TF-IDF features, so whoever loads it also needs the
# fitted vectorizer and should strip punctuation from new text the way the training text was.
joblib.dump(lr, 'log_symptom.pkl')

['log_symptom.pkl']

### Dump the fitted vectorizer as well, so new text can be transformed the same way before prediction

In [54]:
# Persist the fitted TF-IDF vectorizer so new symptom text can be mapped onto the same
# features the models were trained on.
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']