Project - Sentiment Analysis on IMDB Movie Reviews

# Importing all required libraries

In [3]:
import pandas as pd
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
# Download the punkt tokenizer for sentence splitting
import nltk.data
#nltk.download()   
from gensim.models import word2vec
# Word2Vec learns relationships between words and embeds them in a lower-dimensional vector space.
# The model is used to extract the notion of relatedness across words or products, such as semantic relatedness, synonym detection, concept categorization, selectional preferences, and analogy. It learns meaningful relations and encodes that relatedness as vector similarity.
# Word2Vec is a shallow neural network that generates word embeddings by training on a prediction task built from the raw text itself (predicting a word from its context, or the context from a word), so no hand-labeled data is required.
import logging
# Emit INFO-level log records as "timestamp : level : message" so that
# library progress messages (e.g. model training) show up in the cell output.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Importing data using read_csv

In [4]:
import pandas as pd
# Both files are tab-separated with a header row. quoting=3 is csv.QUOTE_NONE:
# double quotes inside the reviews are kept as ordinary characters.
tsv_options = {"header": 0, "delimiter": "\t", "quoting": 3}
train = pd.read_csv("labeledTraindata.tsv", **tsv_options)
test = pd.read_csv("testData.tsv", **tsv_options)
print(test.shape)

(25000, 2)


In [5]:
# Preview the first rows of the labeled training frame.
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [7]:

# List the column names of the training frame.
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [8]:
# (rows, columns) of the training frame.
train.shape

(25000, 3)

In [None]:
# NOTE(review): bare `print` without a call does no work; this cell looks like
# a leftover and can probably be deleted.
print

In [14]:
# Report how many reviews were loaded from each file. The message ends after
# the second count; there is no third data set to report, so no trailing ", ".
review_counts = (train["review"].size, test["review"].size)
print("Read %d labeled train reviews, %d test reviews" % review_counts)

Read 25000 labeled train reviews, 25000 test reviews, 


# Function to clean the data, using beautiful soup to clean HTML tags

When we scrape data from websites, the text we receive often contains HTML tags (HTML is the markup language used to design, build and maintain web pages). Regular expressions could be used to strip these tags, but Beautiful Soup does the job with less effort and time.

In [4]:
def review_to_wordlist(review, remove_stopwords=False):
    """Convert a raw review (or a single sentence) into a list of lowercase words.

    Args:
        review: Raw text, possibly containing HTML markup.
        remove_stopwords: When True, drop NLTK's English stop words from the
            result. Defaults to False.

    Returns:
        A list of lowercase, letters-only tokens in their original order.
    """
    # 1. Strip HTML. The parser is named explicitly so the result does not
    #    depend on which optional parser is installed on the machine, and so
    #    BeautifulSoup does not warn about having to guess one.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # 2. Replace every non-letter character (digits, punctuation, apostrophes)
    #    with a space, so "i've" becomes the two tokens "i" and "ve".
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Lowercase and split on whitespace.
    words = letters_only.lower().split()

    # 4. Optionally remove stop words; a set keeps each membership test cheap.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # 5. Return the list of words.
    return words

In [5]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [6]:
from gensim.models import word2vec

# Creating a list of tokenized sentences, where each sentence is a list of words

In [15]:

# Load NLTK's English punkt sentence tokenizer from the NLTK data directory.
# NOTE(review): this presumably requires the "punkt" resource to have been
# downloaded already (the nltk.download() call in the imports cell is
# commented out) — confirm on a fresh environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each sentence into a word list.

    The review is stripped of surrounding whitespace and passed to
    ``tokenizer.tokenize`` to obtain the raw sentences. Every non-empty
    sentence is then converted with ``review_to_wordlist``, forwarding
    ``remove_stopwords`` unchanged.

    Returns:
        A list of sentences, where each sentence is itself a list of words
        (i.e. a list of lists).
    """
    return [
        review_to_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(review.strip())
        if len(chunk) > 0  # skip empty sentences
    ]

# Running the functions to clean the data and create a corpus of sentences

In [8]:
# Build one corpus of tokenized sentences from both data sets; the sentiment
# labels are not used when collecting sentences.
sentences = []

for message, frame in (
    ("Parsing sentences from training set", train),
    ("Parsing sentences from unlabeled set", test),
):
    print(message)
    for review in frame["review"]:
        sentences.extend(review_to_sentences(review, tokenizer))

Parsing sentences from training set




Parsing sentences from unlabeled set




In [9]:
# Total number of tokenized sentences collected from both data sets.
print(len(sentences))

527951


In [10]:
# Inspect the first tokenized sentence to check the cleaning steps.
print(sentences[0])

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


sentences (iterable of iterables, optional) – The sentences iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See BrownCorpus, Text8Corpus or LineSentence in the word2vec module for such examples. See also the tutorial on data streaming in Python. If you don’t supply sentences, the model is left uninitialized – use this if you plan to initialize it in some other way.

corpus_file (str, optional) – Path to a corpus file in LineSentence format. You may use this argument instead of sentences to get performance boost. Only one of sentences or corpus_file arguments need to be passed (or none of them, in that case, the model is left uninitialized).
vector_size (int, optional) – Dimensionality of the word vectors.

window (int, optional) – Maximum distance between the current and predicted word within a sentence.

min_count (int, optional) – Ignores all words with total frequency lower than this.

workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).

sample (float, optional) – The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).

### Creating the word2Vec model

In [16]:
# Word2Vec hyperparameters used when the model is trained
num_features = 300     # word vector dimensionality
min_word_count = 40    # minimum word count
num_workers = 4        # number of threads to run in parallel
context = 10           # context window size
downsampling = 1e-3    # downsample setting for frequent words

In [17]:
# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
# Passing the sentences to the constructor builds the vocabulary and trains
# the model in one step, using the hyperparameters set in the previous cell.
model = word2vec.Word2Vec(
    sentences,
    vector_size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

2022-06-02 16:44:25,120 : INFO : collecting all words and their counts
2022-06-02 16:44:25,123 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-06-02 16:44:25,234 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types


Training model...


2022-06-02 16:44:25,337 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2022-06-02 16:44:25,459 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2022-06-02 16:44:25,572 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2022-06-02 16:44:25,677 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2022-06-02 16:44:25,780 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2022-06-02 16:44:25,883 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2022-06-02 16:44:25,987 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2022-06-02 16:44:26,090 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2022-06-02 16:44:26,196 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50

2022-06-02 16:44:34,526 : INFO : EPOCH 0 - PROGRESS: at 4.99% examples, 204136 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:44:35,593 : INFO : EPOCH 0 - PROGRESS: at 8.18% examples, 220003 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:44:36,615 : INFO : EPOCH 0 - PROGRESS: at 11.54% examples, 232208 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:44:37,687 : INFO : EPOCH 0 - PROGRESS: at 14.08% examples, 224953 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:44:38,711 : INFO : EPOCH 0 - PROGRESS: at 16.97% examples, 226452 words/s, in_qsize 8, out_qsize 0
2022-06-02 16:44:39,745 : INFO : EPOCH 0 - PROGRESS: at 19.72% examples, 225362 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:44:40,756 : INFO : EPOCH 0 - PROGRESS: at 22.79% examples, 228473 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:44:41,785 : INFO : EPOCH 0 - PROGRESS: at 26.23% examples, 233502 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:44:42,837 : INFO : EPOCH 0 - PROGRESS: at 29.86% examples, 238998 words/s, in_qsize 

2022-06-02 16:45:48,029 : INFO : EPOCH 2 - PROGRESS: at 15.62% examples, 210315 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:45:49,062 : INFO : EPOCH 2 - PROGRESS: at 18.52% examples, 213448 words/s, in_qsize 8, out_qsize 1
2022-06-02 16:45:50,071 : INFO : EPOCH 2 - PROGRESS: at 21.43% examples, 216481 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:45:51,114 : INFO : EPOCH 2 - PROGRESS: at 24.63% examples, 221008 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:45:52,139 : INFO : EPOCH 2 - PROGRESS: at 27.08% examples, 218076 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:45:53,149 : INFO : EPOCH 2 - PROGRESS: at 29.54% examples, 216666 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:45:54,152 : INFO : EPOCH 2 - PROGRESS: at 32.33% examples, 217917 words/s, in_qsize 8, out_qsize 0
2022-06-02 16:45:55,162 : INFO : EPOCH 2 - PROGRESS: at 35.31% examples, 219925 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:45:56,179 : INFO : EPOCH 2 - PROGRESS: at 38.44% examples, 222475 words/s, in_qsiz

2022-06-02 16:47:01,982 : INFO : EPOCH 4 - PROGRESS: at 21.10% examples, 245511 words/s, in_qsize 8, out_qsize 0
2022-06-02 16:47:02,989 : INFO : EPOCH 4 - PROGRESS: at 23.87% examples, 243716 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:47:04,000 : INFO : EPOCH 4 - PROGRESS: at 27.24% examples, 246759 words/s, in_qsize 5, out_qsize 2
2022-06-02 16:47:05,031 : INFO : EPOCH 4 - PROGRESS: at 30.38% examples, 247398 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:47:06,081 : INFO : EPOCH 4 - PROGRESS: at 32.95% examples, 243106 words/s, in_qsize 8, out_qsize 0
2022-06-02 16:47:07,089 : INFO : EPOCH 4 - PROGRESS: at 35.48% examples, 240334 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:47:08,100 : INFO : EPOCH 4 - PROGRESS: at 38.71% examples, 242230 words/s, in_qsize 7, out_qsize 0
2022-06-02 16:47:09,161 : INFO : EPOCH 4 - PROGRESS: at 41.81% examples, 242452 words/s, in_qsize 8, out_qsize 0
2022-06-02 16:47:10,173 : INFO : EPOCH 4 - PROGRESS: at 44.75% examples, 242526 words/s, in_qsiz

In [22]:
# Finalize the model once no further training is planned.
# NOTE(review): the run log shows gensim 4.2.0, where init_sims() appears to be
# deprecated and no longer needed for memory efficiency (the stray output under
# this cell looks like the tail of that warning). With replace=True it
# presumably still overwrites the vectors with unit-normalized ones, so check
# the gensim 3.x -> 4 migration guide before removing or replacing this call.
model.init_sims(replace=True)

  model.init_sims(replace=True)


In [24]:
# Save the model under a descriptive name so it can be reloaded later with
# Word2Vec.load(). The name encodes the hyperparameters set earlier
# (300 features, minimum word count 40, context window 10); update it if
# those values change.
model_name = "300features_40minwords_10context"
model.save(model_name)

2022-06-02 16:47:30,193 : INFO : Word2Vec lifecycle event {'fname_or_handle': '300features_40minwords_10context', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-06-02T16:47:30.193584', 'gensim': '4.2.0', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'saving'}
2022-06-02 16:47:30,197 : INFO : not storing attribute cum_table
2022-06-02 16:47:30,295 : INFO : saved 300features_40minwords_10context


### Testing the model

In [25]:
# Odd-one-out sanity check: three people words and one household word.
model.wv.doesnt_match("man woman child kitchen".split())

'kitchen'

In [26]:
# Odd-one-out sanity check: three countries and one city.
model.wv.doesnt_match("france england germany berlin".split())

'berlin'

In [27]:
# Words whose vectors are closest to "moon", with their similarity scores.
model.wv.most_similar("moon")

[('jacket', 0.5657969117164612),
 ('sky', 0.5363483428955078),
 ('circle', 0.53548264503479),
 ('sun', 0.5287894010543823),
 ('oz', 0.5033456087112427),
 ('snow', 0.4892703890800476),
 ('snowy', 0.48059624433517456),
 ('spaceship', 0.4780445098876953),
 ('mountain', 0.4759177565574646),
 ('apollo', 0.4758463501930237)]

In [28]:
# Words whose vectors are closest to "queen", with their similarity scores.
model.wv.most_similar("queen")

[('princess', 0.7508115768432617),
 ('bride', 0.6475000381469727),
 ('femme', 0.6413792371749878),
 ('nurse', 0.6368417739868164),
 ('goddess', 0.6298828125),
 ('gypsy', 0.62645423412323),
 ('fatale', 0.6238094568252563),
 ('sultry', 0.6188187003135681),
 ('belle', 0.6071270704269409),
 ('carmen', 0.6012665033340454)]

In [29]:
# Odd-one-out sanity check: three animals and one non-animal.
model.wv.doesnt_match("dog cat elephant moon".split())

'moon'

In [31]:
# Words whose vectors are closest to "action", with their similarity scores.
model.wv.most_similar("action")

[('suspense', 0.5038391947746277),
 ('pacing', 0.45792388916015625),
 ('paced', 0.4573788642883301),
 ('exciting', 0.4536362886428833),
 ('adventure', 0.4525851607322693),
 ('tension', 0.43860137462615967),
 ('gore', 0.4380093812942505),
 ('thrilling', 0.43581557273864746),
 ('thrills', 0.43550166487693787),
 ('explosions', 0.43038100004196167)]

In [32]:
# Similarity score between the vectors of two country names.
model.wv.similarity('france', 'germany')

0.8530463

In [33]:
# Look up the learned embedding vector for a single word.
model.wv["flower"]

array([ 0.04457998,  0.0307649 , -0.0174332 ,  0.06117793,  0.02018438,
       -0.12031978, -0.02739874,  0.10649881, -0.00732701,  0.04741612,
       -0.01702793,  0.03092411,  0.02579388,  0.03395401,  0.02595449,
       -0.04911796, -0.01293617, -0.05300761, -0.03887616,  0.01161817,
       -0.08139922, -0.06123571,  0.10276125,  0.00869878,  0.08679716,
       -0.00914037,  0.07053868, -0.06762625,  0.01005393, -0.07019533,
        0.05286527, -0.08623346, -0.03382185, -0.02542276,  0.04313236,
        0.02382032, -0.00313712,  0.02032855,  0.06043682,  0.07967199,
       -0.07651638,  0.04060363,  0.08804101, -0.05183306, -0.00748405,
        0.05869152, -0.05898987,  0.02436483, -0.1426173 , -0.00806292,
       -0.01102615, -0.00145902, -0.04552931,  0.012781  ,  0.05655881,
        0.08123118,  0.06116894, -0.00588782,  0.03929471,  0.06501327,
       -0.06296386, -0.00251321,  0.0286968 , -0.01467166,  0.05320358,
        0.03404943, -0.08936688,  0.02383353, -0.06103896, -0.04