In [None]:
#Importing dependencies
import numpy as np
import tensorflow as tf
import pandas as pd
import tensorflow.keras as keras
#!pip install gensim
#!pip install google.cloud
#import gzip
import gensim
import logging

# Bag of Words

## Sample Data

In [None]:
# Toy corpus: thirteen short sentences used to demonstrate the bag-of-words model.
corpus = [
    'king is a strong man',
    'queen is a wise woman',
    'boy is a young man',
    'girl is a young woman',
    'prince is a young',
    'prince will be strong',
    'princess is young',
    'man is strong',
    'woman is pretty',
    'prince is a boy',
    'prince will be king',
    'princess is a girl',
    'princess will be queen',
]
print(corpus)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vectorizer on the toy corpus and count the words of each sentence.
cv = CountVectorizer()
word_counts = cv.fit_transform(corpus)

# Densify the counts into a DataFrame whose columns are the learned feature names.
DTM = pd.DataFrame(data=word_counts.toarray(), columns=cv.get_feature_names_out())
DTM

## DTM on Review Data

In [None]:
# Ten hand-written movie reviews, each paired with a sentiment label.
reviews = [
    'I loved this movie!',
    'It was okay.',
    'I hated it.',
    'It was amazing!',
    'I was disappointed.',
    'It was a great experience.',
    'I fell asleep during the movie.',
    'It was a total waste of time.',
    'I highly recommend this movie.',
    'I would not recommend this movie.',
]
sentiments = [
    'positive', 'neutral', 'negative', 'positive', 'negative',
    'positive', 'negative', 'negative', 'positive', 'negative',
]

data = {'review': reviews, 'sentiment': sentiments}
df = pd.DataFrame(data)
df

In [None]:
# Turn the review text into a document-term matrix (DTM)
cv = CountVectorizer()
review_counts = cv.fit_transform(df['review'])
dtm = pd.DataFrame(review_counts.toarray(), columns=cv.get_feature_names_out())

# Keep the sentiment label next to the word counts
dtm["y_value"] = df["sentiment"]

# Display the DTM
dtm

# Word Embeddings

In [None]:
# Two-word statements about two unrelated topics, used to train a tiny Word2Vec model.
statements = [
    # statements about trees
    "Trees tall",
    "Trees green",
    "Trees majestic",
    "Trees essential",
    "Trees diverse",
    "Trees oxygen-giving",
    # statements about computers
    "computers fast",
    "computers smart",
    "computers useful",
    "computers powerful",
    "computers everywhere",
    "computers changing",
]

In [None]:
from gensim.parsing.preprocessing import STOPWORDS

# Split every statement on whitespace: one token list per statement.
statements_list = [statement.split() for statement in statements]
print(statements_list)

# Drop every token that appears in gensim's STOPWORDS set.
documents = []
for tokens in statements_list:
    documents.append([token for token in tokens if token not in STOPWORDS])

In [None]:
from gensim.models import Word2Vec

# vector_size: size of each word vector (the hidden layer)
# min_count:   discard words that appear fewer than this many times
# window:      context window size
model = Word2Vec(
    sentences=documents,
    vector_size=3,
    window=3,
    min_count=1,
)

## Hyperparameters

### vector_size
The size of the hidden layer, i.e. the length of the dense vector that represents each token or word (this parameter was called `size` before gensim 4.0). If you have very limited data, then the size should be a much smaller value. If you have lots of data, it's good to experiment with various sizes.

### window
Context window size. The maximum distance between the target word and a neighboring word. A neighbor that lies farther than the window size to the left or right of the target word is not considered related to it. In theory, a smaller window should give you terms that are more related. If you have lots of data, then the window size should not matter too much, as long as it's a decent-sized window.

### min_count
Minimum frequency count of words. The model ignores words that do not satisfy the min_count. Extremely infrequent words are usually unimportant, so it's best to get rid of those. Unless your dataset is really tiny, this does not really affect the model.

## Checking the word2vec output

In [None]:
# Show every vocabulary word next to its learned vector.
vocab_words = model.wv.index_to_key
for position, word in enumerate(vocab_words):
    print(word, model.wv.vectors[position])

In [None]:
import matplotlib.pyplot as plt

# Plot the raw word vectors in 3D. No dimensionality reduction (e.g. PCA) is
# applied: the first three components of each vector are used directly as
# x, y and z. This assumes `model` is the Word2Vec model trained with
# vector_size=3.
vectors = model.wv.vectors

fig = plt.figure(figsize=(15, 10))
# Create the 3D axes exactly once; calling plt.axes() twice would stack a
# second, overlapping axes on the same figure.
ax = fig.add_subplot(projection='3d')

xdata = vectors[:, 0]
ydata = vectors[:, 1]
zdata = vectors[:, 2]
names = model.wv.index_to_key

ax.scatter3D(xdata, ydata, zdata, s=200, c=xdata)
# Label each point with its word. The loop variable is deliberately not
# called `names`, so the list of words is not overwritten by its last entry.
for name, x, y, z in zip(names, xdata, ydata, zdata):
    ax.text(x, y, z, name)

ax.set_xlabel('component 0')
ax.set_ylabel('component 1')
ax.set_zlabel('component 2')
ax.set_title('Word2Vec word vectors')
plt.show()


# Word2Vec Example-2

In [None]:
import urllib.request

# Download the review data set into the working directory.
data_file = "Review_Data.csv"
review_data_url = "https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Amazon_Yelp_Reviews/Review_Data.csv"
urllib.request.urlretrieve(review_data_url, data_file)

In [None]:
def read_input(input_file):
    """Yield each line of ``input_file`` as a list of pre-processed tokens.

    The file is opened in binary mode and every line is passed through
    ``gensim.utils.simple_preprocess``. Once the last line has been consumed
    a completion message is printed.
    """
    with open(input_file, 'rb') as handle:
        for raw_line in handle:
            yield gensim.utils.simple_preprocess(raw_line)
    print("File reading done !!")


# Materialise the generator: one token list per line, i.e. a list of lists.
documents = list(read_input(data_file))

In [None]:
# Preview only the first few tokenized lines; printing the whole corpus
# floods the cell output and bloats the saved notebook.
print(documents[:5])
print(len(documents), "documents in total")

In [None]:
from gensim.models import Word2Vec

# vector_size: size of each word vector (the hidden layer)
# min_count:   discard words that appear fewer than this many times
# window:      context window size (left at the library default here)
model = Word2Vec(
    sentences=documents,
    vector_size=10,
    min_count=1,
)

In [None]:
# Print the vectors for a few words of interest, in vocabulary order
words_of_interest = ["good", "bad", "money"]

for position, word in enumerate(model.wv.index_to_key):
    if word not in words_of_interest:
        continue
    print(word, model.wv.vectors[position])

# Word Embeddings on TensorFlow - Sentiment Analysis Project

In [None]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import GlobalAveragePooling1D, Dropout, Embedding, Dense

## Data Downloading

In [None]:
# Download the archive. The URL is quoted so the shell never treats the '?' in it as a glob character.
!gdown "https://drive.google.com/u/0/uc?id=1DWm8nOX2nIXU4-1trQE--AcqkmcM8J_K" -O aclImdb_v1.tar.gz
!tar -zxf 'aclImdb_v1.tar.gz' # Untar the dataset quietly (-v would print one line per extracted file)
!ls ./aclImdb/ # Display the contents of the folder

## Pre-processing

In [None]:
# Use the same relative path that the dataset loaders use, so the cell does
# not depend on the archive having been extracted under /content.
train_dir = "aclImdb/train"

# Remove the 'unsup' folder, which is not required here. The existence check
# keeps the cell re-runnable: a bare rmtree raises once the folder is gone.
unsup_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(unsup_dir):
    shutil.rmtree(unsup_dir)

# Both subsets are read from the same folder. Passing the same seed and
# validation_split to the two calls makes sure we get the same train and
# test data every time.
split_options = dict(validation_split=0.2, seed=55)  # 80% train and 20% test

train_datagen = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', subset='training', **split_options)

test_datagen = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', subset='validation', **split_options)

# Data pre-processing - also known as "standardizing"
def pre_process(input_data):
  """Lower-case the text, replace '<br />' tags with a space and delete punctuation."""
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')  # HTML line breaks become a space
  text = tf.strings.regex_replace(text, punctuation_pattern, '')  # punctuation is removed
  return text

# Mapping words to numbers - also known as text vectorization
vocab_size = 20000  # Upper limit on the vocabulary size (passed as max_tokens)
max_sequence_length = 250  # Every review becomes a sequence of exactly this many integers

vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_sequence_length,  # longer reviews are truncated, shorter ones padded with 0
    standardize=pre_process,
)

# Fit the vectorization layer on the review text only (the labels are dropped)
train_text = train_datagen.map(lambda review, label: review)
vectorize_layer.adapt(train_text)

def vectorize_text(text, label):
  """Return the integer sequence for ``text`` together with its unchanged label."""
  text_column = tf.expand_dims(text, -1)  # add one trailing dimension before vectorizing
  return vectorize_layer(text_column), label

# Final mapping on train and test data
train_df = train_datagen.map(vectorize_text)
test_df = test_datagen.map(vectorize_text)

## Model building

In [None]:
embedding_dim = 16  # Length of each embedding vector

# input_dim  = 1 + maximum integer index occurring in the input data
# output_dim = length of the embeddings
model = tf.keras.Sequential([
    Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim),
    Dropout(0.3),
    GlobalAveragePooling1D(),
    Dropout(0.3),
    Dense(1),
])
model.summary()

In [None]:
# The last layer is Dense(1) with no activation, so the model outputs raw
# logits (the sigmoid is only applied later, in the prediction model).
# The loss must therefore be told to expect logits, and accuracy must
# threshold the logits at 0.0, which corresponds to a probability of 0.5.
model.compile(
    loss=losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)
model.fit(train_df, validation_data=test_df, epochs=10)

## Saving and loading the model

In [None]:
# Write the model's weights to an .h5 file in the working directory
model.save_weights('Senti_model_word2Vec_10epochs.h5')

## Loading a pre-trained model

In [None]:
!wget raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Saved_models/Senti_model_word2Vec_10epochs.h5
model.load_weights('Senti_model_word2Vec_10epochs.h5')

## The model for final prediction

In [None]:
# The final model for prediction is more than the ANN alone: it also has to
# include the pre-processing, and it turns the ANN output into a probability.
final_model = tf.keras.Sequential()
final_model.add(vectorize_layer)  # Vectorization layer
final_model.add(model)  # ANN model
final_model.add(layers.Activation('sigmoid'))  # Result

final_model.compile(
    optimizer="adam",
    loss=losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

## Prediction using the model

In [None]:
# Three free-text movie reviews used to try out the prediction model.
examples = [
  "Best indian movie ever.an amazing directer SS Rajamouli made a movie in 2015 with a sequel in 2017  that managed to hit theatres all over the world and also   non indians watched it and gave good reviews.all genres are well mixed -action,drama,romance a little bit of comedy makes bahubali the best.Amazing CGI  says IN YOUR FACE to other indian movies.amazing cinematography,story,makeup and BGM.",
  "Might be an expensive movie, but acting was horrible and no plot whatsoever. Very predictable throughout. And on top of that bad direction, acting was extremely bad. Wasted three hours of my life.",
  "What a fantastic performance from all the actors especially Prabhas , putting all his effort and skill in making this fantasy come alive and yet so captivating, I love the wardrobe functions on all the actors , the elegance and pure magic put together just brings this movie to another level. The producers, directors & choreographers and all extra stunt mans have done such an amazing job , HATS OFF TO ALL OF YOU .In a nut shell Baahubali ranks top on all the  fantasy movies listed."]

In [None]:
# Score the example reviews, then label each score by thresholding at 0.5
predictions = final_model.predict(examples)

sentiment_labels = []
for score in predictions:
    sentiment_labels.append("pos" if score > 0.5 else "neg")
print(sentiment_labels)