<a href="https://colab.research.google.com/github/shahabday/NLP_learning/blob/main/DSR_41_Introduction_NLP_deeplearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

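# Introduction to NLP with deep learning: IMDB review classification

Notes: this notebook loads the IMDB reviews dataset, explores it, trains a bag-of-words (multi-hot) baseline classifier, and then trains a second model with a learned word-embedding layer whose vectors are visualised with t-SNE.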


In [None]:
import tensorflow as tf
from tensorflow.keras import models , layers
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt



In [None]:
# Uncomment the next line to inspect the GPU attached to this runtime (if any).
#!nvidia-smi


In [None]:
# Load IMDB reviews as (text, label) pairs. Passing a list of split
# expressions returns one tf.data.Dataset per entry, in this order:
#   [0] full "train" split, [1] first half of "test", [2] second half of "test".
# (The percentage-slicing syntax is a TensorFlow Datasets feature.)
datasets, info = tfds.load(
    name="imdb_reviews/plain_text",
    split=["train", "test[:50%]", "test[50%:]"],
    as_supervised=True,
    with_info=True,
)

In [None]:
# Name the three loaded splits: train, validation (first half of the original
# test split) and test (second half of the original test split).
(
    datasets_train_original,
    dataset_validate_original,
    dataset_test_original,
) = datasets

In [None]:
# Peek at the first four training examples: decoded review text, then its label.
for review, label in datasets_train_original.take(4):
  text = review.numpy().decode('utf-8')
  print(text)
  print(label.numpy())

In [None]:
# Number of whitespace-separated words in every training review.
lengths = [
    len(review.numpy().decode('utf-8').split())
    for review, _ in datasets_train_original
]


In [None]:
# Distribution of review lengths (in words) over the training split.
fig, ax = plt.subplots()
ax.hist(lengths, bins=100)
plt.show()

In [None]:
# Find the shortest and the longest training review, measured in characters.
# min()/max() with key=len replace the hand-rolled scan that relied on a
# sentinel string of 100,000 spaces; like the original strict comparisons they
# keep the first review encountered when several share the same length.
decoded_reviews = [
    review.numpy().decode('utf-8') for review, _ in datasets_train_original
]

# default='' keeps the cell from raising if the dataset is empty.
shortest_sample = min(decoded_reviews, key=len, default='')
longest_sample = max(decoded_reviews, key=len, default='')

print (shortest_sample)
print (longest_sample)


In [None]:
# Collect every training label and plot how often each value occurs.
labels = [label.numpy() for _, label in datasets_train_original]

fig, ax = plt.subplots()
ax.hist(labels)
plt.show()

In [None]:
vocabulary_size = 1000

# Text -> multi-hot vector: lower-case, strip punctuation, split on
# whitespace, and keep at most `vocabulary_size` tokens.
encoder = layers.TextVectorization(
    output_mode='multi_hot',
    max_tokens=vocabulary_size,
    split="whitespace",
    standardize='lower_and_strip_punctuation',
)

# Learn the vocabulary from the training texts only (labels are dropped).
train_texts = datasets_train_original.map(lambda review, _label: review)
encoder.adapt(train_texts.batch(2048))

In [None]:
# Position of the token 'world' in the learned vocabulary
# (list.index raises ValueError if the token is not in the vocabulary).
encoder.get_vocabulary().index('world')

In [None]:
# Dump the full learned vocabulary (up to `vocabulary_size` tokens).
print(encoder.get_vocabulary())

In [None]:
# Encode one sentence with the adapted encoder and print the resulting vector
# as a plain list (with the multi-hot encoder above: one slot per vocabulary entry).
print(list(encoder('I am very happy to be learning at DSR. MOdSHT').numpy()))

In [None]:
# Disabled experiment: wrap the encoder alone in a Sequential model and run it
# on a sample sentence. Kept for reference only; nothing in this cell executes.
# model = models.Sequential()
# model.add(encoder)
# model.build(input_shape=(None,))
# model.summary()

# model.predict(tf.constant(['hello world because I want to']))

In [None]:
# Training pipeline: cache the examples in memory, shuffle, then batch.
dataset_train = datasets_train_original.cache() # lift it to memory
dataset_train = dataset_train.shuffle(25000)  # presumably the size of the training split
dataset_train = dataset_train.batch(32)

# Validation pipeline. It must be built from the validation split: building it
# from the test split would make validation and test metrics come from the
# same data and leave `dataset_validate_original` unused.
dataset_validate = dataset_validate_original.cache()
dataset_validate = dataset_validate.batch(32)

# Test pipeline: held-out data, only used for the final evaluation.
dataset_test = dataset_test_original.cache()
dataset_test = dataset_test.batch(32)



In [None]:
# Baseline classifier: raw text -> encoder -> small dense network -> one
# sigmoid output for the binary label.
model = models.Sequential([
    encoder,
    layers.Dense(16, activation='relu'),
    # Alternative output head: layers.Dense(2, activation='softmax')
    layers.Dense(1, activation='sigmoid'),
])
model.build(input_shape=(None,))
model.summary()

# Smoke test: the model accepts a batch of raw strings (result is discarded).
sample_batch = tf.constant(['hello world because I want to'])
model.predict(sample_batch)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Test-set metrics before any training, as a reference point.
print(model.evaluate(dataset_test))

history = model.fit(dataset_train, epochs=10, validation_data=dataset_validate)

# Test-set metrics after training.
print(model.evaluate(dataset_test))



In [None]:
def render_history(history):
  """Plot the training and validation curves for loss and accuracy.

  One figure is drawn per metric. Each figure is shown and then closed, so
  the two metrics never end up on the same axes and no figure stays open.

  Args:
    history: Mapping from metric name to a per-epoch sequence of values
      (for example the ``history`` attribute of the object returned by
      ``model.fit``). Must contain ``"loss"`` and ``"accuracy"``; the
      ``"val_loss"`` / ``"val_accuracy"`` series are drawn when present.

  Raises:
    KeyError: If ``"loss"`` or ``"accuracy"`` is missing from ``history``.
  """
  for metric in ("loss", "accuracy"):
    validation_key = f"val_{metric}"

    plt.plot(history[metric], label=metric)
    # Validation curves only exist when fit() was given validation data.
    if validation_key in history:
      plt.plot(history[validation_key], label=validation_key)

    plt.xlabel("epoch")
    plt.ylabel(metric)
    plt.legend()
    plt.show()
    plt.close()

# Plot the curves recorded by the most recent model.fit() call.
render_history(history.history)

# Word Embedding

In [None]:
vocabulary_size = 10_000
sequence_length = 128  # AKA context size

# Text -> fixed-length sequence of integer token ids. Compared with the
# multi-hot encoder, `output_mode='int'` and `output_sequence_length` are new.
encoder = layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    max_tokens=vocabulary_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Learn the vocabulary from the training texts only (labels are dropped).
train_texts = datasets_train_original.map(lambda review, _label: review)
encoder.adapt(train_texts.batch(2048))




In [None]:
# First 20 entries of the learned vocabulary.
print(encoder.get_vocabulary()[:20])

In [None]:
# Encode a sample text into a sequence of integer token ids (with the encoder
# configured above, the output length is `sequence_length`). Made-up words such
# as 'isthis' presumably map to the out-of-vocabulary id -- check the output.
encoder('Is this the real life isthis just fantasy caught in a landslide . no escape from reality. seeee , MAMA rapsody integrity complexity rare reality matrix mask offside corner steven gerrard table')

In [None]:
# Embedding classifier: raw text -> integer token ids -> learned word vectors
# -> flattened -> dense network -> one sigmoid output.
embedding_dim = 32  # size of each learned word vector

model = models.Sequential()
model.add(encoder)
model.add(layers.Embedding(input_dim=vocabulary_size, output_dim=embedding_dim))
# Flatten the (sequence_length, embedding_dim) matrix of each example into one
# vector. The target size has to equal sequence_length * embedding_dim
# (128 * 32 = 4096 with the settings above); a hard-coded 2048 does not match
# and makes the reshape fail.
model.add(layers.Reshape((sequence_length * embedding_dim,)))
model.add(layers.Dense(32, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))
model.build(input_shape=(None,))
model.summary()

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

#print(model.evaluate(dataset_test))

history = model.fit(
    dataset_train,
    epochs=10,
    validation_data=dataset_validate
)

render_history(history.history)

In [None]:

# Shape of the first weight array of model.layers[1] (the Embedding layer in the model built above).
model.layers[1].get_weights()[0].shape

In [None]:

# Heat-map of the first 100 rows of the embedding matrix (one row per token id).
embedding_matrix = model.layers[1].get_weights()[0]
plt.imshow(embedding_matrix[:100], cmap="inferno")

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Words whose learned embeddings are projected to 2-D.
# Alternative set to try: ['germany', 'france', 'england', 'berlin', 'paris', 'london']
words = [
    'time', 'person', 'year', 'way', 'day', 'thing', 'man', 'world', 'life', 'hand',
    'part', 'child', 'eye', 'woman', 'place', 'work', 'week', 'case', 'point', 'government',
    'company', 'number', 'group', 'problem', 'fact', 'be', 'have', 'do', 'say', 'get'
]

# The vocabulary belongs to the text-vectorization layer (model.layers[0]);
# a token's position in it is used as its row index in the embedding matrix.
vocab = model.layers[0].get_vocabulary()

# Drop words the encoder does not know, so that `words`, `indices` and the
# rows of `X_embedded` stay aligned one-to-one for later labelling.
words = [word for word in words if word in vocab]
indices = [vocab.index(word) for word in words]

# Pick the embedding rows of the selected words.
embeddings = model.layers[1].get_weights()[0]  # Assuming the embedding layer is the second layer
selected_embeddings = embeddings[indices]

# 2-D t-SNE projection with a fixed seed for reproducibility. The iteration
# count is left at its default (1000, the value previously passed explicitly)
# because the keyword was renamed from `n_iter` to `max_iter` across
# scikit-learn releases. NOTE(review): perplexity=1 is very low -- confirm it
# is intentional.
tsne = TSNE(n_components=2, perplexity=1, random_state=42)

# Fit and transform the embeddings
X_embedded = tsne.fit_transform(selected_embeddings)


In [None]:
# Only words found in the vocabulary were embedded, so label the points with
# exactly those words (same order as the rows of X_embedded). Indexing
# X_embedded with the unfiltered list would mislabel points or run past the end.
plotted_words = [word for word in words if word in vocab]

# Scatter plot of the 2-D projection.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_embedded[:, 0], X_embedded[:, 1], color='blue')

# Annotate each point with the corresponding word.
for word, (x_coord, y_coord) in zip(plotted_words, X_embedded):
    ax.annotate(word, (x_coord, y_coord), fontsize=12)

# Title, axis labels and grid.
ax.set_title('t-SNE Visualization of Word Embeddings')
ax.set_xlabel('t-SNE Dimension 1')
ax.set_ylabel('t-SNE Dimension 2')
ax.grid(True)
plt.show()
