In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Load the data
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  7318k      0  0:00:11  0:00:11 --:--:-- 15.3M


In [3]:
!ls aclImdb

imdbEr.txt  imdb.vocab	README	test  train


In [4]:
!ls aclImdb/train

labeledBow.feat  pos	unsupBow.feat  urls_pos.txt
neg		 unsup	urls_neg.txt   urls_unsup.txt


In [5]:
!cat aclImdb/train/pos/6248_7.txt

Being an Austrian myself this has been a straight knock in my face. Fortunately I don't live nowhere near the place where this movie takes place but unfortunately it portrays everything that the rest of Austria hates about Viennese people (or people close to that region). And it is very easy to read that this is exactly the directors intention: to let your head sink into your hands and say "Oh my god, how can THAT be possible!". No, not with me, the (in my opinion) totally exaggerated uncensored swinger club scene is not necessary, I watch porn, sure, but in this context I was rather disgusted than put in the right context.<br /><br />This movie tells a story about how misled people who suffer from lack of education or bad company try to survive and live in a world of redundancy and boring horizons. A girl who is treated like a whore by her super-jealous boyfriend (and still keeps coming back), a female teacher who discovers her masochism by putting the life of her super-cruel "lover" 

In [6]:
!rm -r aclImdb/train/unsup

In [7]:
#Generating training,validation and test datasets
batch_size = 32
raw_train_data = tf.keras.utils.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=1337,
)
raw_val_data = tf.keras.utils.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=1337,
)
raw_test_data = tf.keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size
)

print(f"batches in raw_training_dataset is: {raw_train_data.cardinality()}")
print(f"batches in raw_validation_dataset is: {raw_val_data.cardinality()}")
print(f"batches in raw_test_dataset is: {raw_test_data.cardinality()}")

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
batches in raw_training_dataset is: 625
batches in raw_validation_dataset is: 157
batches in raw_test_dataset is: 782


In [8]:
for text_batch, label_batch in raw_train_data.take(2):
    for i in range(6):
        print(text_batch.numpy()[i])
        print(label_batch.numpy()[i])

b'I\'ve seen tons of science fiction from the 70s; some horrendously bad, and others thought provoking and truly frightening. Soylent Green fits into the latter category. Yes, at times it\'s a little campy, and yes, the furniture is good for a giggle or two, but some of the film seems awfully prescient. Here we have a film, 9 years before Blade Runner, that dares to imagine the future as somthing dark, scary, and nihilistic. Both Charlton Heston and Edward G. Robinson fare far better in this than The Ten Commandments, and Robinson\'s assisted-suicide scene is creepily prescient of Kevorkian and his ilk. Some of the attitudes are dated (can you imagine a filmmaker getting away with the "women as furniture" concept in our oh-so-politically-correct-90s?), but it\'s rare to find a film from the Me Decade that actually can make you think. This is one I\'d love to see on the big screen, because even in a widescreen presentation, I don\'t think the overall scope of this film would receive its

In [9]:
#Preparing the data
from tensorflow.keras.layers import TextVectorization
import string
import re
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, f"[{re.escape(string.punctuation)}]", ""
    )

In [10]:
# Model constants.
max_features = 20000
embedding_dim = 128
sequence_length = 500

vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)
text_ds = raw_train_data.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

In [11]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [12]:
# Vectorizing the data.
train_data = raw_train_data.map(vectorize_text)
val_data = raw_val_data.map(vectorize_text)
test_data = raw_test_data.map(vectorize_text)

In [13]:
# perform  prefetch buffering on the data.
train_data = train_data.cache().prefetch(buffer_size=10)
val_data = val_data.cache().prefetch(buffer_size=10)
test_data = test_data.cache().prefetch(buffer_size=10)


In [14]:
from tensorflow.keras import layers

# A integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])


In [15]:
# A integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
embedding_dim = 64
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
filters = 128
kernel_size = 5
x = layers.Conv1D(filters, kernel_size, padding="valid", activation="relu", strides=1)(x)
x = layers.MaxPooling1D(pool_size=2)(x)
x = layers.Conv1D(filters, kernel_size, padding="valid", activation="relu", strides=1)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
learning_rate = 1e-4
model.compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(lr=learning_rate), metrics=["accuracy"])




In [16]:
#Train the model
epochs = 3

# Fit the model using the train and test datasets.
model.fit(train_data, validation_data=val_data, epochs=epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f580e47fc10>

In [17]:
#Evaluate the model
model.evaluate(test_data)



[0.3285134434700012, 0.871999979019165]

Task 2 : Seq2seq is a family of machine-learning approaches used for natural language processing. Applications include language translation, image captioning, conversational models, and text summarization. "ted_multi_translate" is a multilingual (60 language) data set derived from TED Talk transcripts. Construct a seq2seq model, train it with "tec_multi_translate" training data split, and report your model performance with the test data split. You can use various techniques, such as LSTM, transformers, attention, and pretraining/fine-tuning. This task is open-ended. Useful links:
https://www.tensorflow.org/text/tutorials/nmt_with_attention
https://www.tensorflow.org/datasets/catalog/ted_multi_translate

In [18]:
#Import the packages
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model

In [19]:
!curl -O http://phontron.com/data/ted_talks.tar.gz
!tar -xf ted_talks.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  335M  100  335M    0     0  32.2M      0  0:00:10  0:00:10 --:--:-- 35.4M


In [20]:
# Read the data
training_data = pd.read_csv('all_talks_train.tsv', sep='\t').head(1000)
testing_data = pd.read_csv('all_talks_test.tsv', sep='\t').head(1000)

In [21]:
# Preprocess the data
training_data["en"] = training_data["en"].str.lower()
training_data["de"] = training_data["de"].str.lower()

In [22]:
# Tokenize the text
english_tokenize = Tokenizer()
english_tokenize.fit_on_texts(training_data["en"])
german_tokenize = Tokenizer()
german_tokenize.fit_on_texts(training_data["de"])

In [23]:
# Convert text to sequences
english_seq = english_tokenize.texts_to_sequences(training_data["en"])
german_seq = german_tokenize.texts_to_sequences(training_data["de"])

In [24]:
# Pad the sequences to a fixed length
max_len = 100
fixed_english_seq = pad_sequences(english_seq, maxlen=max_len, padding="post", truncating="post")
fixed_german_seq = pad_sequences(german_seq, maxlen=max_len + 1, padding="post", truncating="post")

In [25]:
# Split the data into training and testing sets
test_size = 0.2
english_train, english_test, german_train, german_test = train_test_split(
    fixed_english_seq, fixed_german_seq, test_size=test_size, random_state=42)

In [26]:
# Define input and output sequence length
input_seq_len = english_train.shape[1]
output_seq_len = english_train.shape[1]

# Define vocabulary size for English and French
vocab_size_english = len(english_tokenize.word_index) + 1
vocab_size_german = len(german_tokenize.word_index) + 1

# Define input layer
input_layer = Input(shape=(input_seq_len,))

# Define embedding size
embedding_size = 256

# Define embedding layer for encoder
encoder_embedding = Embedding(vocab_size_english, embedding_size, mask_zero=True)(input_layer)

# Define LSTM layer for encoder
lstm_layer_encoder = LSTM(256, return_state=True)

# Get encoder outputs and states
encoder_outputs, state_h, state_c = lstm_layer_encoder(encoder_embedding)


In [27]:
#Define input layer for decoder
decoder_inputs = Input(shape=(output_seq_len,))

#Define embedding layer for decoder
decoder_embedding = Embedding(vocab_size_german, 256, mask_zero=True)(decoder_inputs)

#Define LSTM layer for decoder
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)

#Deine decoder outputs and states
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

#define decoder_dense with softmax activation
decoder_dense = Dense(vocab_size_german, activation='softmax')

#define outputs 
outputs = decoder_dense(decoder_outputs)

In [28]:
#define model
model = Model([input_layer, decoder_inputs], outputs)

#compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

#print model summary
print(model.summary())

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 100, 256)     804096      ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, 100, 256)     990720      ['input_4[0][0]']                
                                                                                            

In [29]:
#define batch_size and epochs
batch_size = 256
epochs = 10
#train the model using english and german training sets and validate with english and german testing sets
model.fit([english_train, german_train[:, :-1]],german_train[:, 1:],batch_size=batch_size,epochs=epochs,validation_data=([english_test, german_test[:, :-1]], german_test[:, 1:]))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f57f99697e0>

In [30]:
#evaluate the model
model.evaluate([english_test, german_test[:, :-1]], german_test[:, 1:])



7.1454033851623535

In [31]:
from nltk.translate.bleu_score import corpus_bleu
import numpy as np


# Predict on test data
preds = model.predict([english_test, german_test[:, :-1]])
preds = np.argmax(preds, axis=-1)

# Convert integer-encoded predictions to text
# Define target_word_index
target_word_index = german_tokenize.word_index
reverse_target_word_index = {v: k for k, v in target_word_index.items()}
preds_text = []
for pred in preds:
    pred_text = " ".join([reverse_target_word_index.get(idx, "<OOV>") for idx in pred])
    preds_text.append(pred_text)

# Convert integer-encoded actuals to text
actuals_text = []
for actual in german_test[:, 1:]:
    actual_text = " ".join([reverse_target_word_index.get(idx, "<OOV>") for idx in actual])
    actuals_text.append(actual_text)

# Calculate BLEU score
bleu_score = corpus_bleu([[actual_text] for actual_text in actuals_text], preds_text)

# Calculate accuracy
accuracy = np.mean(np.array(preds_text) == np.array(actuals_text))

# Print results
print("BLEU score:", bleu_score)
print("Accuracy:", accuracy)


BLEU score: 0.8428206297233874
Accuracy: 0.03
