# **PART 1**


In [1]:
import gensim.downloader as api

# Load pretrained Word2Vec Google News model
wv_pretrained = api.load("word2vec-google-news-300")

# --- Task 1: Find similar words ---
words = ['king', 'apple', 'computer', 'music', 'university']

for word in words:
    print(f"\nTop 5 words similar to '{word}':")
    for sim_word, score in wv_pretrained.most_similar(word, topn=5):
        print(f"{sim_word}: {score:.4f}")

# --- Task 2: Word Vector Arithmetic (Analogies) ---
print("\n=== Vector Arithmetic Analogies ===")
analogies = [
    ("king", "man", "woman"),     # Expected: queen
    ("Paris", "France", "Germany"),  # Expected: Berlin
    ("walking", "walk", "swim")   # Expected: swimming
]

for a, b, c in analogies:
    result = wv_pretrained.most_similar(positive=[c, a], negative=[b], topn=1)
    print(f"{a} - {b} + {c} ≈ {result[0][0]} (Score: {result[0][1]:.4f})")



Top 5 words similar to 'king':
kings: 0.7138
queen: 0.6511
monarch: 0.6413
crown_prince: 0.6204
prince: 0.6160

Top 5 words similar to 'apple':
apples: 0.7204
pear: 0.6451
fruit: 0.6410
berry: 0.6302
pears: 0.6134

Top 5 words similar to 'computer':
computers: 0.7979
laptop: 0.6640
laptop_computer: 0.6549
Computer: 0.6473
com_puter: 0.6082

Top 5 words similar to 'music':
classical_music: 0.7198
jazz: 0.6835
Music: 0.6596
Without_Donny_Kirshner: 0.6416
songs: 0.6396

Top 5 words similar to 'university':
universities: 0.7004
faculty: 0.6781
unversity: 0.6758
undergraduate: 0.6587
univeristy: 0.6585

=== Vector Arithmetic Analogies ===
king - man + woman ≈ queen (Score: 0.7118)
Paris - France + Germany ≈ Berlin (Score: 0.7644)
walking - walk + swim ≈ swimming (Score: 0.8246)


# **PART 2**

In [4]:
!pip install pandas numpy scikit-learn nltk gensim tqdm




In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec, FastText
from gensim.downloader import load
from tqdm import tqdm
import csv

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# --- Load dataset safely ---
df = pd.read_csv("IMDB Dataset.csv", quotechar='"', escapechar='\\')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# --- Clean + tokenize ---
def preprocess(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = text.lower().split()
    return [w for w in tokens if w not in stop_words]

df['tokens'] = df['review'].apply(preprocess)

# --- Train/Test split ---
X_train, X_test, y_train, y_test = train_test_split(df['tokens'], df['sentiment'], test_size=0.2, random_state=42)

# --- Vector Averaging ---
def get_vectors(tokens_list, model, vector_size):
    vectors = []
    for tokens in tokens_list:
        word_vecs = [model[word] for word in tokens if word in model]
        if word_vecs:
            vectors.append(np.mean(word_vecs, axis=0))
        else:
            vectors.append(np.zeros(vector_size))
    return np.array(vectors)

# --- 1. Pretrained Word2Vec ---
print("Using Pretrained Word2Vec...")
pre_w2v = load("word2vec-google-news-300")
X_train_wv = get_vectors(X_train, pre_w2v, 300)
X_test_wv = get_vectors(X_test, pre_w2v, 300)
clf_wv = LogisticRegression(max_iter=1000)
clf_wv.fit(X_train_wv, y_train)
acc_wv = accuracy_score(y_test, clf_wv.predict(X_test_wv))

# --- 2. Custom Skip-gram Word2Vec ---
print("Training custom Skip-gram Word2Vec...")
custom_sg = Word2Vec(sentences=X_train.tolist(), sg=1, vector_size=100, window=3, min_count=1, workers=4).wv
X_train_sg = get_vectors(X_train, custom_sg, 100)
X_test_sg = get_vectors(X_test, custom_sg, 100)
clf_sg = LogisticRegression(max_iter=1000)
clf_sg.fit(X_train_sg, y_train)
acc_sg = accuracy_score(y_test, clf_sg.predict(X_test_sg))

# --- 3. Custom CBOW Word2Vec ---
print("Training custom CBOW Word2Vec...")
custom_cb = Word2Vec(sentences=X_train.tolist(), sg=0, vector_size=100, window=3, min_count=1, workers=4).wv
X_train_cb = get_vectors(X_train, custom_cb, 100)
X_test_cb = get_vectors(X_test, custom_cb, 100)
clf_cb = LogisticRegression(max_iter=1000)
clf_cb.fit(X_train_cb, y_train)
acc_cb = accuracy_score(y_test, clf_cb.predict(X_test_cb))

# --- 4. Custom FastText ---
print("Training custom FastText...")
custom_ft = FastText(sentences=X_train.tolist(), sg=1, vector_size=100, window=3, min_count=1, workers=4).wv
X_train_ft = get_vectors(X_train, custom_ft, 100)
X_test_ft = get_vectors(X_test, custom_ft, 100)
clf_ft = LogisticRegression(max_iter=1000)
clf_ft.fit(X_train_ft, y_train)
acc_ft = accuracy_score(y_test, clf_ft.predict(X_test_ft))

# --- Results ---
print("\n=== Model Accuracy Summary ===")
print(f"Pretrained Word2Vec  : {acc_wv:.4f}")
print(f"Custom Skip-gram W2V : {acc_sg:.4f}")
print(f"Custom CBOW W2V      : {acc_cb:.4f}")
print(f"Custom FastText      : {acc_ft:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using Pretrained Word2Vec...
Training custom Skip-gram Word2Vec...
Training custom CBOW Word2Vec...
Training custom FastText...

=== Model Accuracy Summary ===
Pretrained Word2Vec  : 0.8475
Custom Skip-gram W2V : 0.8679
Custom CBOW W2V      : 0.8514
Custom FastText      : 0.8644
