In [10]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Load the text data from the file
with open('text3.txt', 'r') as f:
    text_data = f.read()

# Tokenize the text data
tokens = text_data.split()

# Remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word.lower() not in stop_words]

# Perform stemming or lemmatization (optional)
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(word) for word in tokens]

# Join the tokens back into a string
text_data = ' '.join(tokens)

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer to the text data and transform it into a matrix
X = vectorizer.fit_transform([text_data])

# Get the feature names (i.e., the unique words in the document)
feature_names = vectorizer.get_feature_names_out()  # Use get_feature_names_out() instead

# Print the feature names and their corresponding frequencies
for feature, freq in zip(feature_names, X.toarray()[0]):
    print(f"{feature}: {freq}")

ability: 2
accounting: 1
accuracy: 2
accurate: 1
accurately: 1
across: 1
action: 1
actionable: 1
actions: 1
activities: 1
activity: 1
additionally: 1
addressed: 1
addressing: 2
adjusting: 1
advanced: 5
advancement: 1
advancing: 1
age: 1
ai: 34
alert: 1
algorithm: 6
algorithms: 3
allow: 1
allowing: 2
also: 2
amount: 2
analysis: 8
analytical: 1
analytics: 5
analyze: 5
analyzing: 3
anomaly: 1
application: 1
applications: 2
area: 1
artificial: 3
aspect: 1
ass: 1
automated: 1
automatically: 2
automating: 2
automation: 2
autonomous: 2
available: 1
aware: 1
bandwidth: 1
based: 2
becomes: 1
benefit: 1
bias: 4
brings: 1
brought: 1
business: 1
capabilities: 3
capability: 3
care: 1
centralized: 1
chain: 1
characteristic: 1
chart: 1
cleaning: 4
clinical: 1
closer: 1
cloud: 1
combination: 1
competitive: 1
complex: 2
component: 1
computer: 3
computing: 2
concerns: 3
conclusion: 1
constitutes: 1
continues: 1
continuously: 1
convergence: 1
core: 1
correct: 1
course: 1
create: 1
credit: 1
critical: 1
c

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
with open('text3.txt', 'r') as f:
    text_data = f.readlines()

# Split the data into input text and labels
# Split the data into input text and labels
text = []
labels = []
label_map = {}  # Create a label map to store unique labels
label_index = 0  # Initialize a label index
for line in text_data:
    parts = line.split('\t')
    if len(parts) > 1:
        text.append(parts[0])
        label = parts[1].strip()
    else:
        text.append(parts[0])
        label = 'default_label'  # Replace with your default label

    if label not in label_map:
        label_map[label] = label_index
        label_index += 1

    labels.append(label_map[label])

# Split the data into training and testing sets
train_text, test_text, train_labels, test_labels = train_test_split(text, labels, test_size=0.2, random_state=42)

# One-hot encode the labels
num_classes = len(label_map)
train_labels_onehot = tf.keras.utils.to_categorical(train_labels, num_classes)
test_labels_onehot = tf.keras.utils.to_categorical(test_labels, num_classes)

# Create a tokenizer to split the text into words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)

# Convert the text data into sequences of words
train_sequences = tokenizer.texts_to_sequences(train_text)
test_sequences = tokenizer.texts_to_sequences(test_text)

# Pad the sequences to have the same length
max_length = 200
padded_train = pad_sequences(train_sequences, maxlen=max_length)
padded_test = pad_sequences(test_sequences, maxlen=max_length)

# One-hot encode the labels
num_classes = len(set(labels))
train_labels_onehot = tf.keras.utils.to_categorical(train_labels, num_classes)
test_labels_onehot = tf.keras.utils.to_categorical(test_labels, num_classes)

# Define the Bag of Words model
bow_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(len(tokenizer.word_index) + 1, 64, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
bow_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
bow_model.fit(padded_train, train_labels_onehot, epochs=10, batch_size=32, validation_data=(padded_test, test_labels_onehot))

# Evaluate the model
loss, accuracy = bow_model.evaluate(padded_test, test_labels_onehot)
print(f'Bag of Words model accuracy: {accuracy:.3f}')

# Define the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_train = tfidf_vectorizer.fit_transform(train_text)
tfidf_test = tfidf_vectorizer.transform(test_text)

# Define the TF-IDF model
tfidf_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(5000,)),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
tfidf_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
tfidf_model.fit(tfidf_train, train_labels_onehot, epochs=10, batch_size=32, validation_data=(tfidf_test, test_labels_onehot))

# Evaluate the model
loss, accuracy = tfidf_model.evaluate(tfidf_test, test_labels_onehot)
print(f'TF-IDF model accuracy: {accuracy:.3f}')

Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 160ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+

ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_6" is incompatible with the layer: expected axis -1 of input shape to have value 5000, but received input with shape (None, 374)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 374), dtype=float32)
  • training=True
  • mask=None

In [11]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
with open('text3.txt', 'r') as f:
    text_data = f.readlines()

# Split the data into input text and labels
text = []
labels = []
label_map = {}  # Create a label map to store unique labels
label_index = 0  # Initialize a label index
for line in text_data:
    parts = line.split('\t')
    if len(parts) > 1:
        text.append(parts[0])
        label = parts[1].strip()
    else:
        text.append(parts[0])
        label = 'default_label'  # Replace with your default label

    if label not in label_map:
        label_map[label] = label_index
        label_index += 1

    labels.append(label_map[label])

# Split the data into training and testing sets
train_text, test_text, train_labels, test_labels = train_test_split(text, labels, test_size=0.2, random_state=42)

# Create a tokenizer to split the text into words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)

# Convert the text data into sequences of words
train_sequences = tokenizer.texts_to_sequences(train_text)
test_sequences = tokenizer.texts_to_sequences(test_text)

# Pad the sequences to have the same length
max_length = 200
padded_train = pad_sequences(train_sequences, maxlen=max_length)
padded_test = pad_sequences(test_sequences, maxlen=max_length)

# One-hot encode the labels
num_classes = len(label_map)
train_labels_onehot = tf.keras.utils.to_categorical(train_labels, num_classes)
test_labels_onehot = tf.keras.utils.to_categorical(test_labels, num_classes)

# Define the Bag of Words model
bow_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(len(tokenizer.word_index) + 1, 64, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
bow_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
bow_model.fit(padded_train, train_labels_onehot, epochs=10, batch_size=32, validation_data=(padded_test, test_labels_onehot))

# Evaluate the model
loss, accuracy = bow_model.evaluate(padded_test, test_labels_onehot)
print(f'Bag of Words model accuracy: {accuracy:.3f}')

# Define the TF-IDF model
tfidf_vectorizer = TfidfVectorizer(max_features=374)  # Adjusted max_features to match the shape of tfidf_train
tfidf_train = tfidf_vectorizer.fit_transform(train_text)
tfidf_test = tfidf_vectorizer.transform(test_text)

# Define the TF-IDF model
tfidf_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(374,)),  # Adjusted input shape to match the shape of tfidf_train
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
tfidf_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
tfidf_model.fit(tfidf_train, train_labels_onehot, epochs=10, batch_size=32, validation_data=(tfidf_test, test_labels_onehot))

# Evaluate the model
loss, accuracy = tfidf_model.evaluate(tfidf_test, test_labels_onehot)
print(f'TF-IDF model accuracy: {accuracy:.3f}')

# Word2Vec Model
from gensim.models import Word2Vec

# Load the dataset
with open('text3.txt', 'r') as f:
    text_data = f.readlines()

# Split the data into input text
text = [line.split('\t')[0] for line in text_data]

# Split the text into words
words = [line.split() for line in text]

# Create a Word2Vec model
model = Word2Vec(words, size=100, window=5, min_count=1)

# Get the word vectors
word_vectors = model.wv

# Explore semantic similarity between words
print(word_vectors.similarity('word1', 'word2'))  # Replace 'word1' and 'word2' with the words you want to compare

Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 154ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+

InvalidArgumentError: Graph execution error:

Detected at node RaggedGather_1/RaggedGather defined at (most recent call last):
<stack traces unavailable>
indices[11] = 36 is not in [0, 36)
	 [[{{node RaggedGather_1/RaggedGather}}]]
	 [[IteratorGetNext]] [Op:__inference_one_step_on_iterator_8973]

In [14]:
# Import required libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import string

# Download required NLTK resources
'''nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
'''
# Read the content of the file
file_path = 'text3.txt'
with open(file_path, 'r') as file:
    text_data = file.read()

# Initialize the stopwords, lemmatizer, and stemmer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Function to preprocess text data
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove punctuation and stopwords, and perform lemmatization and stemming
    processed_tokens = []
    for word in tokens:
        if word not in stop_words and word not in string.punctuation:
            lemmatized_word = lemmatizer.lemmatize(word)  # Lemmatization
            stemmed_word = stemmer.stem(lemmatized_word)  # Stemming
            processed_tokens.append(stemmed_word)
    
    # Join tokens back into a single string
    return ' '.join(processed_tokens)

# Preprocess the text data
preprocessed_text = preprocess_text(text_data)

# Initialize the CountVectorizer (Bag of Words model)
vectorizer = CountVectorizer()

# Fit and transform the preprocessed text data to create the BoW model
X = vectorizer.fit_transform([preprocessed_text])

# Extract the vocabulary (words and their corresponding indices)
vocabulary = vectorizer.get_feature_names_out()

# Convert the BoW model to an array for easy viewing
bow_array = X.toarray()

# Print the vocabulary and its corresponding vector
print("Vocabulary:\n", vocabulary)
print("\nBag of Words (Vectorization):\n", bow_array)


Vocabulary:
 ['abil' 'account' 'accur' 'accuraci' 'across' 'action' 'activ' 'addit'
 'address' 'adjust' 'advanc' 'age' 'ai' 'alert' 'algorithm' 'allow' 'also'
 'amount' 'analysi' 'analyt' 'analyz' 'anomali' 'applic' 'area' 'artifici'
 'aspect' 'ass' 'autom' 'automat' 'autonom' 'avail' 'awar' 'bandwidth'
 'base' 'becom' 'benefit' 'bia' 'bring' 'brought' 'busi' 'capabl' 'care'
 'central' 'chain' 'characterist' 'chart' 'clean' 'clinic' 'closer'
 'cloud' 'combin' 'competit' 'complex' 'compon' 'comput' 'concern'
 'conclus' 'constitut' 'continu' 'converg' 'core' 'correct' 'cours'
 'creat' 'credit' 'critic' 'crucial' 'custom' 'cybersecur' 'dashboard'
 'data' 'dataset' 'decis' 'decision' 'deep' 'deeper' 'demand' 'deploy'
 'design' 'detect' 'develop' 'devic' 'diagnosi' 'digit' 'discriminatori'
 'disrupt' 'divers' 'drive' 'driven' 'dynam' 'earli' 'edg' 'effici'
 'effort' 'embrac' 'enabl' 'enhanc' 'ensur' 'entiti' 'error' 'essay'
 'essenti' 'establish' 'ethic' 'evalu' 'evolv' 'exampl' 'execut' 'e

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
# Import required libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
import string

# Download required NLTK resources
'''nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
'''
# Read the content of the file
file_path = 'text3.txt'
with open(file_path, 'r') as file:
    text_data = file.read()

# Initialize the stopwords, lemmatizer, and stemmer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Function to preprocess text data (Tokenization, Lemmatization, Stemming, Stopwords Removal)
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove punctuation and stopwords, and perform lemmatization and stemming
    processed_tokens = []
    for word in tokens:
        if word not in stop_words and word not in string.punctuation:
            lemmatized_word = lemmatizer.lemmatize(word)  # Lemmatization
            stemmed_word = stemmer.stem(lemmatized_word)  # Stemming
            processed_tokens.append(stemmed_word)
    
    # Join tokens back into a single string
    return ' '.join(processed_tokens)

# Preprocess the text data
preprocessed_text = preprocess_text(text_data)

# =======================
# BAG OF WORDS (BoW) PART
# =======================

# Initialize the CountVectorizer (Bag of Words model)
vectorizer = CountVectorizer()

# Fit and transform the preprocessed text data to create the BoW model
X = vectorizer.fit_transform([preprocessed_text])

# Extract the vocabulary (words and their corresponding indices)
vocabulary = vectorizer.get_feature_names_out()

# Convert the BoW model to an array for easy viewing
bow_array = X.toarray()

# Print the BoW results
print("Bag of Words (BoW) Vocabulary:\n", vocabulary)
print("\nBag of Words (Vectorization):\n", bow_array)

# ============================
# TENSORFLOW TF-IDF PART STARTS
# ============================

# Initialize the TensorFlow Tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer()

# Fit tokenizer on the preprocessed text
tokenizer.fit_on_texts([preprocessed_text])

# Convert text to sequences of integers (word indices)
sequences = tokenizer.texts_to_sequences([preprocessed_text])

# Get the word index (vocabulary mapping)
word_index = tokenizer.word_index

# Convert sequences to one-hot encoded form (Bag of Words equivalent)
one_hot_results = tokenizer.texts_to_matrix([preprocessed_text], mode='count')

# Convert sequences to TF-IDF form
tfidf_results = tokenizer.texts_to_matrix([preprocessed_text], mode='tfidf')

# ====================
# OUTPUT THE RESULTS
# ====================

# Print TensorFlow's vocabulary and TF-IDF results
print("\nTensorFlow Word Index (Vocabulary):", word_index)
print("\nBag of Words (One-Hot Encoding) from TensorFlow:\n", one_hot_results)
print("\nTF-IDF Representation from TensorFlow:\n", tfidf_results)


Bag of Words (BoW) Vocabulary:
 ['abil' 'account' 'accur' 'accuraci' 'across' 'action' 'activ' 'addit'
 'address' 'adjust' 'advanc' 'age' 'ai' 'alert' 'algorithm' 'allow' 'also'
 'amount' 'analysi' 'analyt' 'analyz' 'anomali' 'applic' 'area' 'artifici'
 'aspect' 'ass' 'autom' 'automat' 'autonom' 'avail' 'awar' 'bandwidth'
 'base' 'becom' 'benefit' 'bia' 'bring' 'brought' 'busi' 'capabl' 'care'
 'central' 'chain' 'characterist' 'chart' 'clean' 'clinic' 'closer'
 'cloud' 'combin' 'competit' 'complex' 'compon' 'comput' 'concern'
 'conclus' 'constitut' 'continu' 'converg' 'core' 'correct' 'cours'
 'creat' 'credit' 'critic' 'crucial' 'custom' 'cybersecur' 'dashboard'
 'data' 'dataset' 'decis' 'decision' 'deep' 'deeper' 'demand' 'deploy'
 'design' 'detect' 'develop' 'devic' 'diagnosi' 'digit' 'discriminatori'
 'disrupt' 'divers' 'drive' 'driven' 'dynam' 'earli' 'edg' 'effici'
 'effort' 'embrac' 'enabl' 'enhanc' 'ensur' 'entiti' 'error' 'essay'
 'essenti' 'establish' 'ethic' 'evalu' 'evolv' '

In [16]:
# Import required libraries
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
import string

# Download required NLTK resources
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

# Read the content of the file
file_path = 'text3.txt'
with open(file_path, 'r') as file:
    text_data = file.read()

# Initialize the stopwords, lemmatizer, and stemmer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Function to preprocess text data (Tokenization, Lemmatization, Stemming, Stopwords Removal)
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove punctuation and stopwords, and perform lemmatization and stemming
    processed_tokens = []
    for word in tokens:
        if word not in stop_words and word not in string.punctuation:
            lemmatized_word = lemmatizer.lemmatize(word)  # Lemmatization
            stemmed_word = stemmer.stem(lemmatized_word)  # Stemming
            processed_tokens.append(stemmed_word)
    
    # Join tokens back into a single string
    return ' '.join(processed_tokens)

# Preprocess the text data
preprocessed_text = preprocess_text(text_data)

# ============================
# TENSORFLOW TF-IDF PART STARTS
# ============================

# Initialize the TensorFlow Tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer()

# Fit tokenizer on the preprocessed text
tokenizer.fit_on_texts([preprocessed_text])

# Convert text to TF-IDF form
tfidf_results = tokenizer.texts_to_matrix([preprocessed_text], mode='tfidf')

# ============================
# DEFINE ANN MODEL
# ============================

# For demo purposes, we create a mock label (you can replace it with your real labels)
labels = np.array([1])  # Assuming binary classification (0 or 1), change based on your data

# Define the ANN model
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(tfidf_results.shape[1],)),  # Input layer (TF-IDF input size)
    tf.keras.layers.Dense(64, activation='relu'),  # First hidden layer with 64 neurons
    tf.keras.layers.Dense(32, activation='relu'),  # Second hidden layer with 32 neurons
    tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer (binary classification)
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

# Train the model on the TF-IDF data
history = model.fit(tfidf_results, labels, epochs=10, verbose=1)

# ====================
# OUTPUT OF ANN TRAINING
# ====================
# Evaluate model performance
loss, accuracy = model.evaluate(tfidf_results, labels, verbose=1)
print(f"\nFinal loss: {loss}")
print(f"Final accuracy: {accuracy}")




Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 826ms/step - accuracy: 1.0000 - loss: 0.4962
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 1.0000 - loss: 0.3755
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 1.0000 - loss: 0.2739
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 1.0000 - loss: 0.1912
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 1.0000 - loss: 0.1314
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 1.0000 - loss: 0.0946
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 1.0000 - loss: 0.0697
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 1.0000 - loss: 0.0509
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 