## Import Dataset

In [39]:
import json
import pandas as pd

In [40]:
df_label= pd.read_csv('/home/coeai/Downloads/piyo/symptom_dataset.csv')

In [41]:
df_label

Unnamed: 0,text,label
0,I have been having migraines and headaches. I ...,Drug Reaction
1,I have asthma and I get wheezing and breathing...,Allergy
2,Signs and symptoms of primary ovarian insuffic...,Premature Ovarian Failure
3,"cough,high_fever,breathlessness,family_history...",Bronchial Asthma
4,"chills,vomiting,high_fever,sweating,headache,n...",Malaria
...,...,...
8238,"I have rashes and skin irritations, especially...",diabetes
8239,"Along with impaired eyesight, increased appeti...",Migraine
8240,"Recently, I've been scratching myself a lot, a...",Fungal infection
8241,"I'm running a high temperature. Near my nose, ...",Impetigo


In [42]:
len(df_label['label'].unique())

1092

In [43]:
df_label['label'].value_counts()

label
Malaria                        221
Psoriasis                      221
Arthritis                      221
Varicose Veins                 221
Impetigo                       221
                              ... 
Primary Progressive Aphasia      1
Milk Allergy                     1
Achalasia                        1
Hemolytic Uremic Syndrome        1
Acromegaly                       1
Name: count, Length: 1092, dtype: int64



## Handle Imbalance Dataset

In [44]:
x = df_label.iloc[:,:1]
y = df_label.iloc[:,1:2]

In [45]:
x.reset_index(drop=True)

Unnamed: 0,text
0,I have been having migraines and headaches. I ...
1,I have asthma and I get wheezing and breathing...
2,Signs and symptoms of primary ovarian insuffic...
3,"cough,high_fever,breathlessness,family_history..."
4,"chills,vomiting,high_fever,sweating,headache,n..."
...,...
8238,"I have rashes and skin irritations, especially..."
8239,"Along with impaired eyesight, increased appeti..."
8240,"Recently, I've been scratching myself a lot, a..."
8241,"I'm running a high temperature. Near my nose, ..."


In [46]:
y.reset_index(drop=True)

Unnamed: 0,label
0,Drug Reaction
1,Allergy
2,Premature Ovarian Failure
3,Bronchial Asthma
4,Malaria
...,...
8238,diabetes
8239,Migraine
8240,Fungal infection
8241,Impetigo


In [47]:
print(x.shape)
print(y.shape)

(8243, 1)
(8243, 1)


In [48]:
df_label.isnull().values.any()

False

#### Over Sampling

In [49]:
from imblearn.over_sampling import RandomOverSampler

In [50]:
from collections import Counter
class_counts = Counter(y['label'])
sampling_strategy = {cls: (30*count if count <2 else count) for cls, count in class_counts.items()}

In [51]:
os = RandomOverSampler(sampling_strategy=sampling_strategy)

In [52]:
x_res2,y_res2=os.fit_resample(x,y)

In [53]:
x_res2.shape,y_res2.shape

((38316, 1), (38316, 1))

In [54]:
df_balanced=pd.concat([x_res2,y_res2],axis=1)

In [55]:
df_balanced

Unnamed: 0,text,label
0,I have been having migraines and headaches. I ...,Drug Reaction
1,I have asthma and I get wheezing and breathing...,Allergy
2,Signs and symptoms of primary ovarian insuffic...,Premature Ovarian Failure
3,"cough,high_fever,breathlessness,family_history...",Bronchial Asthma
4,"chills,vomiting,high_fever,sweating,headache,n...",Malaria
...,...,...
38311,Symptoms of Zollinger-Ellison syndrome may inc...,Zollinger Ellison Syndrome
38312,Symptoms of Zollinger-Ellison syndrome may inc...,Zollinger Ellison Syndrome
38313,Symptoms of Zollinger-Ellison syndrome may inc...,Zollinger Ellison Syndrome
38314,Symptoms of Zollinger-Ellison syndrome may inc...,Zollinger Ellison Syndrome


In [56]:
from sklearn.preprocessing import LabelEncoder

In [57]:
label_encoder = LabelEncoder()

# Fit label encoder to disease names
df_balanced['encoded_labels'] = label_encoder.fit_transform(df_balanced['label'])

# Check the mapping of diseases to numbers
label_map = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print(label_map)
len(label_map)

{'(Vertigo) Paroymsal  Positional Vertigo': 0, 'Abdominal Aortic Aneurysm': 1, 'Acanthosis Nigricans': 2, 'Achalasia': 3, 'Achilles Tendinitis': 4, 'Achilles Tendon Rupture': 5, 'Acl Injury': 6, 'Acne': 7, 'Acoustic Neuroma': 8, 'Acromegaly': 9, 'Actinic Keratosis': 10, 'Acute Coronary Syndrome': 11, 'Acute Flaccid Myelitis': 12, 'Acute Liver Failure': 13, 'Acute Lymphocytic Leukemia': 14, 'Acute Myelogenous Leukemia': 15, 'Acute Sinusitis': 16, 'Addisons Disease': 17, 'Adenomyosis': 18, 'Adhd': 19, 'Adjustment Disorders': 20, 'Adrenal Cancer': 21, 'Adult Adhd': 22, 'Adult Congenital Heart Disease': 23, 'Adult Stills Disease': 24, 'Age Spots': 25, 'Agoraphobia': 26, 'Aids': 27, 'Airplane Ear': 28, 'Albinism': 29, 'Alcohol Intolerance': 30, 'Alcohol Poisoning': 31, 'Alcohol Use Disorder': 32, 'Alcoholic Hepatitis': 33, 'Allergies': 34, 'Allergy': 35, 'Alpha Gal Syndrome': 36, 'Alzheimers Disease': 37, 'Ambiguous Genitalia': 38, 'Ameloblastoma': 39, 'Amenorrhea': 40, 'Amnesia': 41, 'Ampu

1092

In [58]:
df_balanced

Unnamed: 0,text,label,encoded_labels
0,I have been having migraines and headaches. I ...,Drug Reaction,311
1,I have asthma and I get wheezing and breathing...,Allergy,35
2,Signs and symptoms of primary ovarian insuffic...,Premature Ovarian Failure,802
3,"cough,high_fever,breathlessness,family_history...",Bronchial Asthma,149
4,"chills,vomiting,high_fever,sweating,headache,n...",Malaria,600
...,...,...,...
38311,Symptoms of Zollinger-Ellison syndrome may inc...,Zollinger Ellison Syndrome,1085
38312,Symptoms of Zollinger-Ellison syndrome may inc...,Zollinger Ellison Syndrome,1085
38313,Symptoms of Zollinger-Ellison syndrome may inc...,Zollinger Ellison Syndrome,1085
38314,Symptoms of Zollinger-Ellison syndrome may inc...,Zollinger Ellison Syndrome,1085


In [59]:
y=list(df_balanced['encoded_labels'])

In [60]:
len(y)

38316

In [61]:
x=list(df_balanced['text'])

In [62]:
len(x)

38316

### Train Test Split

In [63]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42,shuffle=True)


## Model Train

In [64]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')



In [65]:
train_encodings = tokenizer(x_train, truncation=True, padding=True)
test_encodings = tokenizer(x_test, truncation=True, padding=True)

In [66]:
import tensorflow as tf
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

In [67]:
from transformers import TFDistilBertForSequenceClassification
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_map))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [68]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [70]:
history = model.fit(
    train_dataset.shuffle(100).batch(16),
    epochs=5,
    validation_data=test_dataset.batch(16)
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [71]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

Physical devices cannot be modified after being initialized


In [76]:
model.predict(test_dataset)



TFSequenceClassifierOutput(loss=None, logits=array([[-40.571964 , -32.209625 , -34.385403 , ..., -34.66711  ,
        -33.431705 , -28.729927 ],
       [-40.41717  , -31.402586 , -41.161594 , ..., -33.683872 ,
        -28.154911 , -26.344643 ],
       [-17.055494 , -13.5229845, -11.987849 , ..., -12.517168 ,
        -14.525091 , -12.302404 ],
       ...,
       [-33.39124  , -31.607574 , -30.303644 , ..., -29.730427 ,
        -29.686369 , -28.84664  ],
       [-39.30135  , -38.899715 , -39.558247 , ..., -40.81652  ,
        -35.330658 , -32.942158 ],
       [-31.24191  , -27.738544 , -29.06586  , ..., -26.248901 ,
        -25.750252 , -24.034447 ]], dtype=float32), hidden_states=None, attentions=None)

In [80]:
predictions = model.predict(test_dataset)
print(predictions)
print(len(predictions)) # Check the length of the returned tuple/object

TFSequenceClassifierOutput(loss=None, logits=array([[-40.571964 , -32.209625 , -34.385403 , ..., -34.66711  ,
        -33.431705 , -28.729927 ],
       [-40.41717  , -31.402586 , -41.161594 , ..., -33.683872 ,
        -28.154911 , -26.344643 ],
       [-17.055494 , -13.5229845, -11.987849 , ..., -12.517168 ,
        -14.525091 , -12.302404 ],
       ...,
       [-33.39124  , -31.607574 , -30.303644 , ..., -29.730427 ,
        -29.686369 , -28.84664  ],
       [-39.30135  , -38.899715 , -39.558247 , ..., -40.81652  ,
        -35.330658 , -32.942158 ],
       [-31.24191  , -27.738544 , -29.06586  , ..., -26.248901 ,
        -25.750252 , -24.034447 ]], dtype=float32), hidden_states=None, attentions=None)
1


In [84]:
predictions=predictions.logits

In [85]:
print(predictions.shape)

(7664, 1092)


In [89]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

predicted_classes = tf.math.argmax(predictions, axis=1)
# Convert tensor to numpy array
predicted_classes = predicted_classes.numpy()

# Calculate the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
print(cm)

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_classes)
print(f"Accuracy: {accuracy:.4f}")

# Calculate F1 score
f1 = f1_score(y_test, predicted_classes, average='weighted')  # Use 'weighted' for multi-class
print(f"F1 Score: {f1:.4f}")

# Calculate precision
precision = precision_score(y_test, predicted_classes, average='weighted')
print(f"Precision: {precision:.4f}")

# Calculate recall
recall = recall_score(y_test, predicted_classes, average='weighted')
print(f"Recall: {recall:.4f}")

[[26  0  0 ...  0  0  0]
 [ 0  5  0 ...  0  0  0]
 [ 0  0  9 ...  0  0  0]
 ...
 [ 0  0  0 ... 12  0  0]
 [ 0  0  0 ...  0 13  0]
 [ 0  0  0 ...  0  0  9]]
Accuracy: 0.9901
F1 Score: 0.9903
Precision: 0.9916
Recall: 0.9901


In [92]:
model.save('/home/coeai/Downloads/piyo/sympomt2Dieses',save_format='tf')

INFO:tensorflow:Assets written to: /home/coeai/Downloads/piyo/sympomt2Dieses/assets


INFO:tensorflow:Assets written to: /home/coeai/Downloads/piyo/sympomt2Dieses/assets
