In [47]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file under the Kaggle input directory.
# NOTE: os.walk silently yields nothing for a missing directory, so outside
# Kaggle this loop prints nothing rather than raising.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cell-phones-and-accessories-5-json/Cell_Phones_and_Accessories_5.json
/kaggle/input/sports-and-outdoors-5-json/Sports_and_Outdoors_5.json


## Using gensim for Word2Vec

In [48]:
! pip install gensim



In [49]:
import gensim
import pandas as pd

## Reading and Exploring the Dataset
The dataset we are using here is a subset of Amazon reviews from the Cell Phones & Accessories category. The data is stored as a JSON file and can be read using pandas.

Link to the Dataset: [dataset](http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz)

In [50]:
# The file holds one JSON object per line, hence lines=True.
df = pd.read_json("/kaggle/input/cell-phones-and-accessories-5-json/Cell_Phones_and_Accessories_5.json", lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [51]:
df.shape
# (rows, columns) -- that's a lot of data

(194439, 9)

In [52]:
# This is the text we will be working with: the review body of the first row.

df.reviewText[0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

In [53]:
# Preprocess the review text before feeding it to the Word2Vec model.
# gensim.utils.simple_preprocess does all of this in one call:
#   - splits the text into tokens and strips punctuation
#   - lowercases every token
#   - drops very short tokens ("I" and the "t" of "don't" are gone in the output below)
# It does NOT remove stop words: "they", "and", "the" are still present in the output.

# A single review -> list of tokens.
processed1 = gensim.utils.simple_preprocess(df.reviewText[0])
# or every review at once -> Series of token lists (this is the training corpus).
processed2 = df.reviewText.apply(gensim.utils.simple_preprocess)

processed1, processed2

(['they',
  'look',
  'good',
  'and',
  'stick',
  'good',
  'just',
  'don',
  'like',
  'the',
  'rounded',
  'shape',
  'because',
  'was',
  'always',
  'bumping',
  'it',
  'and',
  'siri',
  'kept',
  'popping',
  'up',
  'and',
  'it',
  'was',
  'irritating',
  'just',
  'won',
  'buy',
  'product',
  'like',
  'this',
  'again'],
 0         [they, look, good, and, stick, good, just, don...
 1         [these, stickers, work, like, the, review, say...
 2         [these, are, awesome, and, make, my, phone, lo...
 3         [item, arrived, in, great, time, and, was, in,...
 4         [awesome, stays, on, and, looks, great, can, b...
                                 ...                        
 194434    [works, great, just, like, my, original, one, ...
 194435    [great, product, great, packaging, high, quali...
 194436    [this, is, great, cable, just, as, good, as, t...
 194437    [really, like, it, becasue, it, works, well, w...
 194438    [product, as, described, have, wasted

In [54]:
# Word2Vec learns from (context words, target word) pairs taken from a sliding
# window over each review -- they play the role of features and targets.
# No corpus is passed here, so the vocabulary build and the training are run
# as separate, explicit steps in the cells that follow.

model = gensim.models.Word2Vec(
    window=10,  # words on each side of the target that count as context
    min_count=2,  # ignore words that occur fewer than 2 times in the corpus
    workers=4,  # number of training threads
)

In [55]:
# One pass over the tokenized reviews to collect the vocabulary.
# progress_per controls how often gensim reports scan progress (per the gensim
# docs; only visible if logging is configured).
model.build_vocab(processed2, progress_per=1000)

In [56]:
model.epochs # number of training passes over the corpus; not set above, so this is the default (5)

5

In [57]:
# Train on the same corpus the vocabulary was built from. corpus_count and
# epochs are read back from the model so they stay consistent with build_vocab.
# Per the gensim docs the returned tuple is (effective words trained, raw words seen).
model.train(processed2, total_examples = model.corpus_count, epochs = model.epochs)

(61507360, 83868975)

In [58]:
# Persist the trained model (relative path, so it lands in the current working directory).
model.save("Word2Vec-amazon-cell-accessories-reviews-short.model")

In [59]:
model.wv.most_similar("bad")

# Nearest neighbours of "bad" in the embedding space. Mostly negative words, as
# hoped -- but note that "good" and "okay" also rank highly: words that occur in
# the same contexts get similar vectors even when their meanings are opposite.

[('terrible', 0.6710687875747681),
 ('shabby', 0.6225409507751465),
 ('good', 0.5879166722297668),
 ('horrible', 0.5873714685440063),
 ('crappy', 0.5809982419013977),
 ('okay', 0.5523601770401001),
 ('awful', 0.5496503710746765),
 ('legit', 0.5381524562835693),
 ('cheap', 0.5346452593803406),
 ('mad', 0.5259444713592529)]

### Remember, always test your model

In [60]:
# Synonym pair: expect a clearly positive score.
model.wv.similarity(w1='cheap', w2='inexpensive')

0.53510606

In [61]:
# Antonym pair: still scores high, because both words appear in the same kinds of sentences.
model.wv.similarity(w1='good', w2='bad')

0.5879166

In [62]:
# Near-synonyms used interchangeably in reviews: expect the highest score of these checks.
model.wv.similarity(w1='great', w2='awesome')

0.73271734

In [63]:
# Adjective vs. noun with different roles in a sentence: expect a score near 0.
model.wv.similarity(w1='great', w2='product')

-0.033031918

## 1. Concepts and Rationale

The entire goal of this code is to represent text—specifically Amazon reviews—as **numerical vectors** that capture the **semantic meaning and relationship** between words.

| Concept | Explanation | Why We Use It |
| :--- | :--- | :--- |
| **Word Embeddings** | A set of feature learning techniques where words or phrases are mapped to vectors of real numbers. Words that appear in similar contexts have vectors that are closer together in the vector space (e.g., the vector for "great" is close to the vector for "good"). | **Machine Learning models only understand numbers.** Embeddings convert complex textual data into a mathematically usable format while preserving meaning. |
| **Word2Vec** | An efficient predictive model that learns word embeddings. It comes in two main flavors: **Continuous Bag of Words (CBOW)** and **Skip-gram**. It works by training a shallow neural network to predict a word given its context (CBOW) or predict the context given a word (Skip-gram). | It's one of the most popular and effective algorithms for generating high-quality word vectors that capture analogies (e.g., "king - man + woman = queen"). |
| **Tokenization** | The process of breaking down a stream of text into smaller units (tokens), which are usually words. | It separates the text into the basic units that the Word2Vec model will learn vectors for. |

***

## 2. Explanation of the Code Steps

### A. Reading and Exploring the Dataset

| Code | Concept | Explanation |
| :--- | :--- | :--- |
| `import gensim`, `import pandas as pd` | **Imports** | Loads the necessary libraries: **`pandas`** for data manipulation and **`gensim`** for NLP tasks, especially Word2Vec. |
| `df = pd.read_json(..., lines=True)` | **Data Loading** | Reads a large JSON file where each line is a separate JSON object. |
| `df.shape` | **Data Exploration** | Confirms the dataset size (`194439` reviews), ensuring the entire file was loaded correctly. |


### B. Simple Preprocessing & Tokenization

The goal here is to prepare the raw text into a format the Word2Vec algorithm can consume: a list of sentences, where each sentence is a list of individual, cleaned words.

| Code | Concept | Explanation |
| :--- | :--- | :--- |
| `processed2 = df.reviewText.apply(...)` | **Preprocessing Application** | Selects the `reviewText` column (the content) and applies a cleaning function to every review. |
| `gensim.utils.simple_preprocess` | **Gensim Tokenizer** | A built-in Gensim utility that performs minimal, necessary preprocessing: **tokenization** (splitting into words), **lowercasing**, and **punctuation removal**. It also drops very short tokens (e.g., "I"), but it does **not** remove stop words. |
| `processed1, processed2` | **Verification** | Confirms the output: the raw string is now a list of lowercase tokens, ready for the Word2Vec model. |



### C. Training the Word2Vec Model

| Code | Concept | Explanation | Why We Use It |
| :--- | :--- | :--- | :--- |
| `model = gensim.models.Word2Vec(...)` | **Model Initialization** | Creates an instance of the Word2Vec model, setting its training hyperparameters. | No corpus is passed to the constructor, so building the vocabulary and training happen as separate, explicit steps below. |
| `window=10` | **Context Window Size** | Defines how many words before and after the current word are considered its context. A size of 10 means the model looks at up to 20 words total (10 back, 10 forward) to understand the meaning of the center word. | **Larger window captures broader, thematic relationships.** |
| `min_count=2` | **Vocabulary Filtering** | Ignores all words that appear fewer than 2 times in the entire dataset. | **Filtering rare words** reduces model size, speeds up training, and improves the quality of vectors for common words (since rare words don't have enough context to learn from). |
| `workers=4` | **Parallel Processing** | Specifies the number of CPU threads to use for training. | **Faster training.** Word2Vec is computationally intensive, and using multiple workers (cores) significantly speeds up the process. |
| `model.build_vocab(processed2, ...)` | **Vocabulary Creation** | Scans the input text once to collect all unique words and assign them an index. | This step is necessary to **define the size of the vocabulary** (one vector per retained word) before training begins. |
| `model.train(processed2, total_examples=..., epochs=...)` | **Training** | Feeds the preprocessed data to the model for iterative training. | The model updates the weights (the actual word vectors) based on the context window, minimizing the prediction error over multiple passes (**epochs**) to refine the embeddings. |



### D. Finding Similar Words and Similarity

| Code | Concept | Explanation | Why We Use It |
| :--- | :--- | :--- | :--- |
| `model.wv.most_similar("bad")` | **Vector Similarity** | Finds the words whose vectors are closest to the vector for "bad" in the embedding space. | This is the primary way to **validate the quality of the embeddings**. If the model is good, words like "terrible" and "horrible" should be closest. |
| `model.wv.similarity(w1, w2)` | **Cosine Similarity** | Calculates the cosine of the angle between two word vectors. The result is a number between $-1$ (vectors pointing in opposite directions) and $1$ (same direction); higher values mean the two words occur in more similar contexts. Note that antonyms often score high for this reason (here, "good" vs. "bad" ≈ 0.59). | It provides a **quantifiable measure** of how semantically related two words are, demonstrating that the model has successfully learned their relationship from the review context. |

## Exercise
Train a word2vec model on [the Sports & Outdoors Reviews Dataset](http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz). Once you have trained the model, find the words most similar to 'awful' and compute the similarity for each of the following word pairs: ('good', 'great'), ('slow', 'steady').

In [64]:
# Exercise data: Sports & Outdoors reviews, again one JSON object per line.
# NOTE: this rebinds `df`, so the cell-phone DataFrame loaded earlier is no longer reachable.
df = pd.read_json("/kaggle/input/sports-and-outdoors-5-json/Sports_and_Outdoors_5.json", lines=True)

In [65]:
# Peek at the first rows to confirm the columns match the cell-phone dataset.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [66]:
# (rows, columns)
df.shape

(296337, 9)

In [67]:
# The review bodies are the training corpus for the exercise. (The column names
# are already visible in the df.head() output, so no separate df.columns lookup
# is needed; a bare expression that is not last in a cell displays nothing.)
df.reviewText

0         This came in on time and I am veru happy with ...
1         I had a factory Glock tool that I was using fo...
2         If you don't have a 3/32 punch or would like t...
3         This works no better than any 3/32 punch you w...
4         I purchased this thinking maybe I need a speci...
                                ...                        
296332    This is a water bottle done right. It is a ver...
296333    If you're looking for an insulated water bottl...
296334    This Hydracentials Sporty 25 OZ, double insula...
296335    As usual I received this item free in exchange...
296336    Hydracentials insulated 25 oz water bottle.Thi...
Name: reviewText, Length: 296337, dtype: object

In [68]:
# Tokenize every review: lowercase, punctuation stripped, very short tokens
# dropped (e.g. the "I" of "I am" is gone in the output); stop words are kept.
review = df.reviewText.apply(gensim.utils.simple_preprocess)
review

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

In [69]:
# Fresh, untrained model for the Sports & Outdoors exercise.
# NOTE: this rebinds `model`; the cell-phone model is now only available from
# the file it was saved to earlier.
# NOTE(review): window=1000 is 100x the window used for the cell-phone model
# and is presumably longer than almost any review, so effectively every word in
# a review becomes context for every other word -- confirm this is intentional
# and not a typo for 10.
model = gensim.models.Word2Vec(
    window=1000,
    min_count=2,
    workers=4
)
model

<gensim.models.word2vec.Word2Vec at 0x7f4e91aef710>

In [70]:
# One pass over the tokenized sports reviews to collect the vocabulary.
model.build_vocab(review)

In [71]:
# Train on the same corpus the vocabulary was built from; corpus_count and
# epochs are read back from the model itself.
model.train(review, epochs=model.epochs, total_examples=model.corpus_count)

(91338805, 121496535)

In [72]:
# Persist the exercise model (relative path, so it lands in the current working directory).
model.save("excersize-word2vec.model")

In [73]:
# Exercise part 1: words whose vectors are closest to "awful".
model.wv.most_similar('awful')

[('terrible', 0.5757178664207458),
 ('horribly', 0.5617771744728088),
 ('horrible', 0.5424771308898926),
 ('joke', 0.5214943885803223),
 ('useless', 0.5196235775947571),
 ('bad', 0.49755391478538513),
 ('basically', 0.4808139204978943),
 ('horrid', 0.47051772475242615),
 ('sucks', 0.46662724018096924),
 ('irritated', 0.4641328752040863)]

In [74]:
# Exercise part 2: similarity for the first requested pair.
# TODO: the exercise also asks for ('slow', 'steady'); add it if no later cell covers it.
model.wv.similarity('good', 'great')

0.51705235