In [1]:
import os
import json
import pickle
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam





In [2]:
with open('data/chatdata.json') as file:
    data = json.load(file)
questions = [chat["Question"] for chat in data["Chats"]]
answers = [chat["Reply"] for chat in data["Chats"]]



In [3]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(answers)
categorical_labels = to_categorical(encoded_labels)

# Save the LabelEncoder
with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(label_encoder, le_file)

In [4]:
os.environ['TFHUB_CACHE_DIR'] = 'A:/CLZ-FILES/YEAR_3/Desertation_project/Breast Cancer/breast cancer/data/transhub'
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
questions_embeddings = embed(questions)













In [5]:
embedding_dim = questions_embeddings.shape[1]
model = Sequential([
    Dense(512, activation='relu', input_shape=(embedding_dim,)),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dense(len(np.unique(encoded_labels)), activation='softmax')  # Output layer for classification
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [6]:
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(questions_embeddings.numpy(), categorical_labels, test_size=0.2, random_state=42)
model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test))  # Reduced epochs for faster demonstration


Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.0021 - loss: 6.5831 - val_accuracy: 0.0000e+00 - val_loss: 6.6094
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0357 - loss: 6.5469 - val_accuracy: 0.0000e+00 - val_loss: 6.7796
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0104 - loss: 6.4425 - val_accuracy: 0.0000e+00 - val_loss: 7.7788
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0160 - loss: 6.1969 - val_accuracy: 0.0000e+00 - val_loss: 8.6661
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0232 - loss: 5.9017 - val_accuracy: 0.0000e+00 - val_loss: 10.9065
Epoch 6/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0617 - loss: 5.3513 - val_accuracy: 0.0000e+00 - val_loss: 12.5678
Epoch 7

<keras.src.callbacks.history.History at 0x29eff675c50>

In [8]:
model.save('chatmodel.keras')


In [9]:
def predict_response(question):
    question_embedding = embed([question]).numpy()
    predictions = model.predict(question_embedding)
    predicted_label = np.argmax(predictions, axis=1)
    return label_encoder.inverse_transform(predicted_label)[0]

# Testing the model
question = "Hey! Can breast cancer be detected through a blood test?"
response = predict_response(question)
print(f"Response: {response}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Response: Currently, there isn't a specific blood test for detecting breast cancer. Imaging tests, such as mammograms and biopsies, are commonly used for diagnosis.
