In [7]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer, TFBertModel


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Sanity check: confirm which TensorFlow build this kernel is using.
import tensorflow as tf
print("TensorFlow version:", tf.version.VERSION)


In [None]:
# Load dataset
# The CSV location can be overridden with the MITRE_DATA_PATH environment
# variable, so the notebook is not tied to one machine's directory layout.
# The fallback is the path the notebook was originally written against.
DATA_PATH = os.environ.get('MITRE_DATA_PATH', '/Applications/MITREArticle/annotatedMITRE.csv')

# Rows with a missing text or annotation cannot be tokenized or label-encoded,
# so they are dropped up front instead of failing in a later cell.
data = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=['text', 'annotation'])
    .reset_index(drop=True)
)

# Preprocess the data
# Cast to str so that cells pandas parsed as numbers are still valid tokenizer input.
X = data['text'].astype(str).values
y = data['annotation'].values

# Encode labels
# LabelEncoder maps each distinct annotation to an integer id in 0..n_classes-1;
# label_encoder.classes_ holds the original annotation for each id.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Split the dataset
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Tokenization and Padding for LSTM/RNN
# Reserve an explicit out-of-vocabulary token. Without it, words that were not
# seen while fitting (or that fall outside the num_words most frequent words)
# are silently dropped, which shortens sequences and hides how much of the
# test text the models never saw. Indices still stay below num_words, so the
# Embedding layers' input_dim of 10000 remains valid.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')

# Fit on the training split only so no vocabulary leaks in from the test split.
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)


In [None]:
# Pad both splits to the length of the longest *training* sequence so that
# train and test inputs share one fixed width.
max_length = max(map(len, X_train_seq))
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (X_train_seq, X_test_seq)
)


In [None]:
# LSTM Model
def create_lstm_model(vocab_size=10000, embedding_dim=128, num_classes=None,
                      sequence_length=None, dropout_rate=0.5):
    """Build and compile a two-layer stacked LSTM text classifier.

    Called with no arguments this builds the same network as before; the
    parameters only expose values that used to be hard-coded or read from
    notebook globals.

    Args:
        vocab_size: Number of rows in the embedding table. Must be greater
            than the largest token index fed to the model.
        embedding_dim: Width of each embedding vector.
        num_classes: Size of the softmax output. When None it is derived from
            the notebook-level ``y_encoded`` array.
        sequence_length: Length of the padded input sequences. When None the
            notebook-level ``max_length`` is used.
        dropout_rate: Dropout applied between the two LSTM layers.

    Returns:
        A compiled ``Sequential`` model. It is compiled with
        ``sparse_categorical_crossentropy``, so targets are integer class ids.
    """
    # Fall back to the notebook globals only when the caller did not override them.
    if num_classes is None:
        num_classes = len(np.unique(y_encoded))
    if sequence_length is None:
        sequence_length = max_length

    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length),
        # return_sequences=True hands the full sequence to the second LSTM.
        LSTM(128, return_sequences=True),
        Dropout(dropout_rate),
        LSTM(64),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Train LSTM Model
# The last 20% of the training data is held back by Keras for validation.
lstm_model = create_lstm_model()
lstm_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)


In [None]:
# Evaluate LSTM Model
# predict() yields one probability row per sample; the arg-max column is the
# predicted (encoded) class id.
lstm_pred = lstm_model.predict(X_test_pad)
lstm_pred_classes = lstm_pred.argmax(axis=1)
print(
    "LSTM Classification Report:\n",
    classification_report(y_true=y_test, y_pred=lstm_pred_classes),
)


In [None]:
# RNN Model
def create_rnn_model(vocab_size=10000, embedding_dim=128, num_classes=None,
                     sequence_length=None, dropout_rate=0.5):
    """Build and compile a single-layer SimpleRNN text classifier.

    Called with no arguments this builds the same network as before; the
    parameters only expose values that used to be hard-coded or read from
    notebook globals.

    Args:
        vocab_size: Number of rows in the embedding table. Must be greater
            than the largest token index fed to the model.
        embedding_dim: Width of each embedding vector.
        num_classes: Size of the softmax output. When None it is derived from
            the notebook-level ``y_encoded`` array.
        sequence_length: Length of the padded input sequences. When None the
            notebook-level ``max_length`` is used.
        dropout_rate: Dropout applied to the recurrent layer's output.

    Returns:
        A compiled ``Sequential`` model. It is compiled with
        ``sparse_categorical_crossentropy``, so targets are integer class ids.
    """
    # Fall back to the notebook globals only when the caller did not override them.
    if num_classes is None:
        num_classes = len(np.unique(y_encoded))
    if sequence_length is None:
        sequence_length = max_length

    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length),
        SimpleRNN(128),
        Dropout(dropout_rate),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [None]:
# Train RNN Model
# The last 20% of the training data is held back by Keras for validation.
rnn_model = create_rnn_model()
rnn_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)


In [None]:
# Evaluate RNN Model
# predict() yields one probability row per sample; the arg-max column is the
# predicted (encoded) class id.
rnn_pred = rnn_model.predict(X_test_pad)
rnn_pred_classes = rnn_pred.argmax(axis=1)
print(
    "RNN Classification Report:\n",
    classification_report(y_true=y_test, y_pred=rnn_pred_classes),
)


In [None]:
# BERT Model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_bert(texts, max_length=None):
    """Tokenize a batch of texts with the BERT tokenizer into TensorFlow tensors.

    Args:
        texts: The texts to encode. May be a NumPy array or pandas Series
            (anything exposing ``tolist()``), a list/tuple of strings, or a
            single string, which is treated as a batch of one.
        max_length: Optional cap on the number of tokens per text. When None
            the tokenizer's own default limit applies, which matches the
            previous behaviour of this function.

    Returns:
        The tokenizer's batch encoding with ``return_tensors="tf"``; every
        sequence is padded to the longest one in ``texts``.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be iterated character by character.
        batch = [texts]
    elif hasattr(texts, 'tolist'):
        # tolist() turns NumPy scalars into plain Python strings.
        batch = texts.tolist()
    else:
        batch = list(texts)
    return bert_tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )

# Prepare data for BERT
# Each split is padded to its own longest sequence; the model accepts
# variable-length input, so the two widths do not need to match.
train_encodings = encode_bert(X_train)
test_encodings = encode_bert(X_test)


In [None]:
# Create BERT model
def create_bert_model(num_classes=None, learning_rate=2e-5):
    """Build and compile a classifier head on top of pretrained ``bert-base-uncased``.

    The model takes two integer inputs named ``input_ids`` and
    ``attention_mask``, so it can be fed a dict with those keys.

    Args:
        num_classes: Size of the softmax output. When None it is derived from
            the notebook-level ``y_encoded`` array.
        learning_rate: Adam step size. The default is in the small range
            normally used for fine-tuning BERT; Adam's stock default is far
            larger and tends to wreck the pretrained weights.

    Returns:
        A compiled ``tf.keras.Model``. It is compiled with
        ``sparse_categorical_crossentropy``, so targets are integer class ids.
    """
    if num_classes is None:
        num_classes = len(np.unique(y_encoded))

    # The input names must match the dict keys used in fit()/predict().
    # With unnamed inputs Keras cannot match the keys and, depending on the
    # version, either rejects the dict or falls back to key order, which would
    # feed the attention mask in as token ids.
    input_ids = tf.keras.Input(shape=(None,), dtype='int32', name='input_ids')
    attention_mask = tf.keras.Input(shape=(None,), dtype='int32', name='attention_mask')

    encoder = TFBertModel.from_pretrained('bert-base-uncased')
    outputs = encoder(input_ids, attention_mask=attention_mask)
    x = outputs.last_hidden_state[:, 0, :]  # Take the CLS token representation
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(num_classes, activation='softmax')(x)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=x)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return model

In [None]:
# Train BERT Model
# Only the token ids and the attention mask are passed on; any other fields
# in the tokenizer output are left out.
bert_model = create_bert_model()
bert_model.fit(
    x={key: train_encodings[key] for key in ('input_ids', 'attention_mask')},
    y=y_train,
    batch_size=16,
    epochs=5,
)


In [None]:
# Evaluate BERT Model
# Feed the same two fields used for training, then take the arg-max column of
# each probability row as the predicted (encoded) class id.
bert_pred = bert_model.predict(
    x={key: test_encodings[key] for key in ('input_ids', 'attention_mask')}
)
bert_pred_classes = bert_pred.argmax(axis=1)
print(
    "BERT Classification Report:\n",
    classification_report(y_true=y_test, y_pred=bert_pred_classes),
)