<a href="https://colab.research.google.com/github/saishshinde15/NLP/blob/main/Sentiment_analysis_Word2Vec(genesis).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
import numpy as np
import tensorflow as tf

In [2]:
dataset=pd.read_csv('/content/movie.csv')

In [3]:
dataset

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [4]:
dataset.isnull().sum()

text     0
label    0
dtype: int64

In [5]:
X=dataset['text']

In [6]:
X

0        I grew up (b. 1965) watching and loving the Th...
1        When I put this movie in my DVD player, and sa...
2        Why do people who do not know what a particula...
3        Even though I have great interest in Biblical ...
4        Im a die hard Dads Army fan and nothing will e...
                               ...                        
39995    "Western Union" is something of a forgotten cl...
39996    This movie is an incredible piece of work. It ...
39997    My wife and I watched this movie because we pl...
39998    When I first watched Flatliners, I was amazed....
39999    Why would this film be so good, but only gross...
Name: text, Length: 40000, dtype: object

In [7]:
y=dataset['label'].values

In [8]:
y

array([0, 0, 0, ..., 0, 1, 1])

In [9]:
X_train_input, X_test_input, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train_input)
X_train = tokenizer.texts_to_sequences(X_train_input)
X_train = pad_sequences(X_train)
x_dim = X_train.shape[1]

In [11]:
X_test = tokenizer.texts_to_sequences(X_test_input)
X_test = pad_sequences(X_test, maxlen=x_dim)

In [12]:
# Convert words to Word2Vec embeddings
sentences = [sentence.split() for sentence in X_train_input]
word2vec_model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, workers=4)
word_vectors = word2vec_model.wv

In [13]:
# Create embedding matrix
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 50))
for word, i in tokenizer.word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]


In [14]:
rnn = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, weights=[embedding_matrix],
                              input_length=x_dim, trainable=False),
    tf.keras.layers.LSTM(units=32, activation='tanh', return_sequences=True),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.LSTM(units=64, activation='tanh'),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=100, activation='relu'),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

In [15]:
# Compile and train your model
rnn.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
history = rnn.fit(X_train, y_train, epochs=1, batch_size=32, validation_data=(X_test, y_test))



In [25]:

word_index = tokenizer.word_index
word_to_index = {word:index for word,index in word_index.items()}
word_to_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'movie': 15,
 'for': 16,
 'with': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'at': 29,
 'all': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'from': 35,
 'who': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'about': 41,
 'out': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'some': 46,
 'there': 47,
 'what': 48,
 'good': 49,
 'more': 50,
 'when': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'my': 55,
 'time': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'really': 60,
 'which': 61,
 'only': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'much': 71,
 'than': 72,
 'we': 73,
 'been': 74,
 'get': 75,
 'bad': 76,
 'other': 77,
 'also': 78,
 'will': 79,
 'great': 80,
 'do': 81,
 'into': 82,
 'p

In [19]:

# Extract the embedding layer
embedding_layer = rnn.layers[0]

# Get the weights of the embedding layer
embedding_weights = embedding_layer.get_weights()[0]

# Get the word index dictionary
word_index = tokenizer.word_index

# Get the most positive and negative words
positive_words = [word for word, index in word_index.items() if embedding_weights[index][0] > 0]
negative_words = [word for word, index in word_index.items() if embedding_weights[index][0] < 0]

# Print the top 10 most positive and negative words
print("Top 10 most positive words:")
print(positive_words[:50])

print("Top 10 most negative words:")
print(negative_words[:50])


Top 10 most positive words:
['of', 'is', 'was', 'are', 'his', 'he', 'by', 'from', 'like', 'her', 'about', 'out', 'some', 'there', 'very', 'even', 'she', 'really', 'had', 'were', 'than', 'been', 'bad', 'other', 'also', 'do', 'made', 'could', 'think', 'seen', 'character', 'many', 'being', 'acting', 'did', 'know', 'does', 'better', 'man', 'say', 'here', 'such', "i'm", 'watching', 'actors', 'actually', 'old', '10', 'makes', 'director']
Top 10 most negative words:
['the', 'and', 'a', 'to', 'in', 'it', 'i', 'this', 'that', 'as', 'movie', 'for', 'with', 'but', 'film', 'on', 'not', 'you', 'have', 'be', 'one', 'at', 'all', 'an', 'they', 'so', 'who', 'or', 'just', 'if', "it's", 'has', 'what', 'good', 'more', 'when', 'up', 'no', 'my', 'time', 'would', 'which', 'only', 'see', 'story', 'their', 'can', 'me', 'well', 'much']
