In [1]:
import numpy as np
import pandas as pd 

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [4]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, save_model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint

In [5]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,condition,drugName
0,Left Ventricular Dysfunction,Valsartan
1,ADHD,Guanfacine
2,Birth Control,Lybrel
3,Birth Control,Ortho Evra
4,Opiate Dependence,Buprenorphine / naloxone


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161297 entries, 0 to 161296
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   condition  160398 non-null  object
 1   drugName   161297 non-null  object
dtypes: object(2)
memory usage: 2.5+ MB


In [7]:
df.shape

(161297, 2)

In [8]:
df.describe()

Unnamed: 0,condition,drugName
count,160398,161297
unique,884,3436
top,Birth Control,Levonorgestrel
freq,28788,3657


In [9]:
X = df['condition']
y = df['drugName']

In [10]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
num_unique_conditions = len(df['condition'].unique())

In [13]:
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)

In [14]:
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, axis=1, fill_value=0)

In [15]:
X_train_encoded = X_train_encoded.astype(np.float32)
X_test_encoded = X_test_encoded.astype(np.float32)

In [16]:
model = Sequential([
    Dense(256, input_dim=X_train_encoded.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [17]:
model.summary()

In [18]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
checkpoint_filepath = 'best_model.keras'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [21]:
history = model.fit(X_train_encoded, y_train, epochs=100, batch_size=64, 
                    validation_data=(X_test_encoded, y_test),
                    callbacks=[model_checkpoint_callback])

Epoch 1/100
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 10ms/step - accuracy: 0.1243 - loss: 4.2101 - val_accuracy: 0.1720 - val_loss: 3.5221
Epoch 2/100
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - accuracy: 0.1378 - loss: 3.8844 - val_accuracy: 0.1754 - val_loss: 3.4270
Epoch 3/100
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - accuracy: 0.1444 - loss: 3.7670 - val_accuracy: 0.1728 - val_loss: 3.3878
Epoch 4/100
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - accuracy: 0.1439 - loss: 3.7039 - val_accuracy: 0.1783 - val_loss: 3.3578
Epoch 5/100
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - accuracy: 0.1468 - loss: 3.6596 - val_accuracy: 0.1769 - val_loss: 3.3489
Epoch 6/100
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - accuracy: 0.1469 - loss: 3.6291 - val_accuracy: 0.1740 - val_loss: 3.338

In [22]:
# model.save('neural_network_model.h5')

In [26]:
best_model = tf.keras.models.load_model(checkpoint_filepath)

In [28]:
y_pred = model.predict(X_test_encoded)
y_pred_classes = np.argmax(y_pred, axis=1)

[1m1009/1009[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step


In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred_classes)*100
precision = precision_score(y_test, y_pred_classes, average='weighted')*100
recall = recall_score(y_test, y_pred_classes, average='weighted')*100
f1 = f1_score(y_test, y_pred_classes, average='weighted')*100

print("Accuracy:", accuracy, "%")
print("Precision:", precision, "%")
print("Recall:", recall, "%")
print("F1 Score:", f1, "%")

Accuracy: 18.490390576565403 %
Precision: 11.743138693893426 %
Recall: 18.490390576565403 %
F1 Score: 10.621198224046752 %


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
