In [3]:
!pip install tensorflow_hub

Collecting tensorflow_hub
  Using cached tensorflow_hub-0.16.1-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting tf-keras>=2.14.1 (from tensorflow_hub)
  Using cached tf_keras-2.17.0-py3-none-any.whl.metadata (1.6 kB)
Collecting tensorflow<2.18,>=2.17 (from tf-keras>=2.14.1->tensorflow_hub)
  Using cached tensorflow-2.17.0-cp311-cp311-win_amd64.whl.metadata (3.2 kB)
Collecting tensorflow-intel==2.17.0 (from tensorflow<2.18,>=2.17->tf-keras>=2.14.1->tensorflow_hub)
  Using cached tensorflow_intel-2.17.0-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting tensorboard<2.18,>=2.17 (from tensorflow-intel==2.17.0->tensorflow<2.18,>=2.17->tf-keras>=2.14.1->tensorflow_hub)
  Using cached tensorboard-2.17.1-py3-none-any.whl.metadata (1.6 kB)
Using cached tensorflow_hub-0.16.1-py2.py3-none-any.whl (30 kB)
Using cached tf_keras-2.17.0-py3-none-any.whl (1.7 MB)
Using cached tensorflow-2.17.0-cp311-cp311-win_amd64.whl (2.0 kB)
Using cached tensorflow_intel-2.17.0-cp311-cp311-win_amd64.whl (385.0

  You can safely remove it manually.

[notice] A new release of pip is available: 24.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import dash
from dash import dcc, html
import pandas as pd
import re
import nltk
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import base64
from io import BytesIO

# Download NLTK data (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Define your local paths for train dataset
train_path = "C:\\Users\\roelr\\OneDrive\\Documents\\ADAN\\7431\\nlp_disaster_dashboard\\train.csv"

# Load the train dataset
train_data = pd.read_csv(train_path)

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Apply preprocessing to the text column
train_data['cleaned_text'] = train_data['text'].apply(preprocess_text)

# Tokenizer and BERT model from Hugging Face
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_texts(texts):
    return tokenizer(
        texts.tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf'
    )

X_train_tokenized = tokenize_texts(train_data['cleaned_text'])

# Extract BERT features
bert_features = bert_model(X_train_tokenized.input_ids)[1]  # Pooled output from BERT

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(bert_features, train_data['target'], test_size=0.2, random_state=42)

# Neural Network on top of BERT features
nn_model = Sequential()
nn_model.add(Dense(128, activation='relu', input_shape=(bert_features.shape[1],)))  # Input layer
nn_model.add(Dropout(0.3))  # Dropout to prevent overfitting
nn_model.add(Dense(64, activation='relu'))  # Hidden layer
nn_model.add(Dropout(0.3))
nn_model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
nn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Make predictions
y_pred_bert_nn = (nn_model.predict(X_test) > 0.5).astype("int32")

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_bert_nn)
classification_rep = classification_report(y_test, y_pred_bert_nn)

# Output results
print(f"BERT + NN Accuracy: {accuracy:.2f}")
print(f"BERT + NN Classification Report:\n{classification_rep}")





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\roelr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\roelr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\roelr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6009 - loss: 0.6456 - val_accuracy: 0.7991 - val_loss: 0.4507
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8667 - loss: 0.3278 - val_accuracy: 0.7846 - val_loss: 0.4667
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9103 - loss: 0.2357 - val_accuracy: 0.7728 - val_loss: 0.5571
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9405 - loss: 0.1641 - val_accuracy: 0.7531 - val_loss: 0.6593
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9600 - loss: 0.1102 - val_accuracy: 0.7584 - val_loss: 0.7719
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9706 - loss: 0.0849 - val_accuracy: 0.7492 - val_loss: 0.8745
Epoch 7/10
[1m191/191[0m 










RuntimeError: Op type not registered 'CaseFoldUTF8' in binary running on ROELPC. Make sure the Op and Kernel are registered in the binary running in this process. Note that if you are loading a saved graph which used ops from tf.contrib (e.g. `tf.contrib.resampler`), accessing should be done before importing the graph, as contrib ops are lazily registered when the module is first accessed.