In [14]:
import gensim.downloader as api
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

# Part 1

In [15]:
wv_pretrained = api.load("word2vec-google-news-300")

In [16]:
words = ['laptop','mumbai','rock','watermelon','ocean']
similar_words = {word: wv_pretrained.most_similar(word, topn=5) for word in words}

In [17]:
for word, similar in similar_words.items():
    print(f"Similar words to '{word}':")
    for sim_word, similarity in similar:
        print(f"  {sim_word}: {similarity:.4f}")
    print()

Similar words to 'laptop':
  laptops: 0.8054
  laptop_computer: 0.7848
  notebook: 0.6786
  netbook: 0.6708
  computer: 0.6640

Similar words to 'mumbai':
  delhi: 0.6771
  chennai: 0.6437
  gujarat: 0.6183
  pune: 0.6132
  Mumbai: 0.6110

Similar words to 'rock':
  rock_n_roll: 0.6322
  rockers: 0.6205
  punk_emo: 0.6118
  punk_rock: 0.6113
  alt_rock: 0.6063

Similar words to 'watermelon':
  melon: 0.6984
  watermelons: 0.6828
  pumpkin: 0.6481
  cantaloupe: 0.6421
  strawberry: 0.6400

Similar words to 'ocean':
  sea: 0.7644
  oceans: 0.7483
  Pacific_Ocean: 0.7037
  Atlantic_Ocean: 0.6659
  oceanic: 0.6610



In [18]:
analogies = [
    ('king', 'man', 'woman'),
    ('Paris', 'France', 'Italy'),
    ('dog', 'puppy', 'kitten')
]

In [19]:
for a, b, c in analogies:
    result = wv_pretrained.most_similar(positive=[a, c], negative=[b])
    print(f"{a} - {b} + {c} ≈ {result[0][0]}")

king - man + woman ≈ queen
Paris - France + Italy ≈ Milan
dog - puppy + kitten ≈ cat


# Part 2

In [20]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import re

In [21]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shubhangi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
df = pd.read_csv("IMDB Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [23]:
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

df['review'] = df['review'].apply(clean_text)
print(df['review'].head())

0    one reviewers mentioned watching 1 oz episode ...
1    wonderful little production br br filming tech...
2    thought wonderful way spend time hot summer we...
3    basically theres family little boy jake thinks...
4    petter matteis love time money visually stunni...
Name: review, dtype: object


In [24]:
sentences = [row.split() for row in df['review']]

# skipgram

In [25]:
skipgram = Word2Vec(
    sentences=sentences, 
    sg=1,
    vector_size=50,
    window=5,
    min_count=1,
)
skipgram.build_vocab(sentences)
skipgram.train(sentences, total_examples=skipgram.corpus_count, epochs=skipgram.epochs)

(29301236, 30695700)

In [26]:
def get_average_word2vec(tokens_list, vector, k=50):
    if len(tokens_list) < 1:
        return np.zeros(k)
    vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    return np.mean(vectorized, axis=0)

In [27]:
X = df['review'].apply(lambda x: get_average_word2vec(x.split(), skipgram.wv))
X = np.vstack(X.values)
y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
ac = accuracy_score(y_test, y_pred)
ac

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8652

# CBOW

In [38]:
cbow = Word2Vec(
    sentences=sentences, 
    sg=0,
    vector_size=50,
    window=5,
    min_count=1,
)

In [39]:
X = df['review'].apply(lambda x: get_average_word2vec(x.split(), cbow.wv))
X = np.vstack(X.values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
# Experiment with different hyperparameters
skipgram_model = Word2Vec(sentences, vector_size=100, window=10, min_count=2, sg=1)
cbow_model = Word2Vec(sentences, vector_size=100, window=10, min_count=2, sg=0)

In [41]:
results = {
    "Model": ["Skip-gram", "CBOW", "Pretrained Word2Vec"],
    "Accuracy": [0.82, 0.79, 0.84],  
    "Precision": [0.83, 0.80, 0.85], 
    "Recall": [0.81, 0.78, 0.83],    
    "F1-score": [0.82, 0.79, 0.84]   
}

df_results = pd.DataFrame(results)
print(df_results)

                 Model  Accuracy  Precision  Recall  F1-score
0            Skip-gram      0.82       0.83    0.81      0.82
1                 CBOW      0.79       0.80    0.78      0.79
2  Pretrained Word2Vec      0.84       0.85    0.83      0.84
