In [4]:
# Import required libraries
import nltk
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import tensorflow as tf
import string

# Download required NLTK resources (uncomment if not already downloaded)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Initialize the stopwords, lemmatizer, and stemmer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Function to preprocess text data (Tokenization, Lemmatization, Stemming, Stopwords Removal)
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove punctuation and stopwords, and perform lemmatization and stemming
    processed_tokens = []
    for word in tokens:
        if word not in stop_words and word not in string.punctuation:
            lemmatized_word = lemmatizer.lemmatize(word)  # Lemmatization
            stemmed_word = stemmer.stem(lemmatized_word)  # Stemming
            processed_tokens.append(stemmed_word)
    
    return processed_tokens

# Read the text data file
file_path = 'text2.txt'  # You can change this to your actual file path
with open(file_path, 'r', encoding='utf-8') as file:
    text_data = file.read()

# Preprocess the text data
tokens = preprocess_text(text_data)

# ============================
# WORD2VEC PART
# ============================

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=[tokens], vector_size=100, window=5, min_count=1, sg=0)  # sg=0 for CBOW

# Get the list of all unique words in the vocabulary
vocab = list(word2vec_model.wv.index_to_key)

# Initialize a matrix to store similarity scores
similarity_matrix = np.zeros((len(vocab), len(vocab)))

# Compute pairwise similarity for all words in the vocabulary
for i, word1 in enumerate(vocab):
    for j, word2 in enumerate(vocab):
        similarity_matrix[i, j] = word2vec_model.wv.similarity(word1, word2)

# Convert similarity matrix into a pandas DataFrame for easy visualization
similarity_df = pd.DataFrame(similarity_matrix, index=vocab, columns=vocab)

# Display the first few rows of the similarity matrix
print(similarity_df.head())

# ============================
# NEURAL NETWORK PART
# ============================

# Convert tokens to Word2Vec embeddings
word_embeddings = np.array([word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv])

# Aggregate Word2Vec embeddings by averaging
average_embedding = np.mean(word_embeddings, axis=0)

# Reshape the averaged embedding to match the input expected by the model
average_embedding = average_embedding.reshape(1, -1)  # Reshape to (1, 100)

# For demo purposes, we create a mock label (you can replace it with your real labels)
labels = np.array([1])  # Assuming binary classification (0 or 1), change based on your data

# Define the ANN model
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(average_embedding.shape[1],)),  # Input layer based on averaged Word2Vec vector size
    tf.keras.layers.Dense(64, activation='relu'),  # First hidden layer with 64 neurons
    tf.keras.layers.Dense(32, activation='relu'),  # Second hidden layer with 32 neurons
    tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer (binary classification)
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

# Train the model on the averaged Word2Vec embedding
history = model.fit(average_embedding, labels, epochs=10, verbose=1)

# Evaluate model performance
loss, accuracy = model.evaluate(average_embedding, labels, verbose=1)
print(f"\nFinal loss: {loss}")
print(f"Final accuracy: {accuracy}")


            one     child        ''      time      like   without     littl  \
one    1.000000 -0.005589 -0.036006 -0.100524 -0.021266 -0.039723  0.029068   
child -0.005589  1.000000 -0.019693  0.067477  0.002063  0.015178 -0.108766   
''    -0.036006 -0.019693  1.000000 -0.003567  0.172956  0.083302  0.159254   
time  -0.100524  0.067477 -0.003567  1.000000 -0.048268  0.142675  0.052523   
like  -0.021266  0.002063  0.172956 -0.048268  1.000000  0.145309  0.040206   

           life      word     could  ...    vitiat     brain      real  \
one    0.107384  0.035972  0.222857  ... -0.022165  0.125102  0.026518   
child -0.108151  0.034731 -0.094406  ... -0.046126 -0.048296 -0.017403   
''     0.014497  0.207438 -0.025448  ... -0.067862 -0.069103 -0.085746   
time  -0.003863  0.078271 -0.163900  ... -0.103645  0.090741 -0.041755   
like  -0.023750 -0.067008 -0.174552  ... -0.180237 -0.130274 -0.028213   

         physic     moral    situat      draw      back  insurmount  \
one    0.



Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 1.0000 - loss: 0.6929
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 1.0000 - loss: 0.6909
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 1.0000 - loss: 0.6883
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 1.0000 - loss: 0.6856
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 1.0000 - loss: 0.6830
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 1.0000 - loss: 0.6804
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 1.0000 - loss: 0.6776
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 1.0000 - loss: 0.6748
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m