In [26]:
import gensim
import pandas as pd

In [27]:
# Load the critic reviews from a CSV located in the current working directory.
df = pd.read_csv('rotten_tomatoes_critic_reviews.csv')

# Preview the first rows to check which columns are available.
df.head()

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...


In [22]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(1130017, 8)

In [23]:
# Show the full text of the first review. `.iloc[0]` selects by position, so
# the cell still works when the index does not contain the label 0 (e.g. if
# it is re-run after rows have been filtered out).
df['review_content'].iloc[0]

'A fantasy adventure that fuses Greek mythology to contemporary American places and values. Anyone around 15 (give or take a couple of years) will thrill to the visual spectacle'

### Preprocessing

In [55]:
# Keep only the rows whose review_content is present, so that every remaining
# row has text to tokenize.
df = df[df['review_content'].notna()]

# Row count after removing reviews without text.
df.shape

(1064211, 8)

In [56]:
# Turn each review string into a list of tokens using gensim's
# simple_preprocess; the result is a Series holding one token list per review.
reviews = df['review_content'].map(gensim.utils.simple_preprocess)

# One entry per remaining review.
reviews.shape

(1064211,)

In [57]:
# Spot-check the first few tokenized reviews.
reviews.head()

Unnamed: 0,review_content
0,"[fantasy, adventure, that, fuses, greek, mytho..."
1,"[uma, thurman, as, medusa, the, gorgon, with, ..."
2,"[with, top, notch, cast, and, dazzling, specia..."
3,"[whether, audiences, will, get, behind, the, l..."
4,"[what, really, lacking, in, the, lightning, th..."


### Training the Word2Vec Model

In [59]:
# Create the Word2Vec model without passing a corpus; the vocabulary is built
# and the model is trained explicitly in the cells that follow.
# Parameter meanings below are taken from gensim's Word2Vec documentation.
model = gensim.models.Word2Vec(
    window=10,  # maximum distance between the current and the predicted word
    min_count=5,  # ignore words whose total frequency is lower than this
    workers=5,  # number of worker threads used for training
)

In [60]:
# Scan the tokenized reviews to build the model's vocabulary.
# progress_per controls how often gensim reports progress during the scan.
model.build_vocab(reviews, progress_per=1000)

In [61]:
# Train on the tokenized reviews, taking the example count and the number of
# epochs from the values already stored on the model. The call's return value
# is displayed as the cell output.
model.train(
    reviews,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(86610376, 110383870)

In [64]:
# Ten vocabulary words most similar to "fantasy", with their similarity scores.
model.wv.most_similar(positive=['fantasy'], topn=10)

[('adventure', 0.7082133889198303),
 ('fairytale', 0.692206859588623),
 ('fable', 0.6436342000961304),
 ('folklore', 0.6385085582733154),
 ('fantasia', 0.6222707033157349),
 ('mysticism', 0.5907087326049805),
 ('extravaganza', 0.5804046988487244),
 ('romance', 0.572221040725708),
 ('fantasies', 0.5602487921714783),
 ('romp', 0.5587716102600098)]

In [63]:
# Ten vocabulary words most similar to "nostalgic", with their similarity scores.
model.wv.most_similar(positive=['nostalgic'], topn=10)

[('nostalgia', 0.7646150588989258),
 ('retro', 0.5993145108222961),
 ('bygone', 0.5735481977462769),
 ('yesteryear', 0.5720280408859253),
 ('cheerful', 0.5562708377838135),
 ('wholesome', 0.5501629710197449),
 ('tuneful', 0.5490938425064087),
 ('funky', 0.5481659173965454),
 ('fond', 0.538236141204834),
 ('euphoric', 0.5339121222496033)]

In [83]:
# Similarity score between the learned vectors of the two words.
model.wv.similarity('sorrowful', 'melancholic')

0.685845

In [47]:
# Write the trained model to a file in the current working directory.
model.save('./word2vec-movie-review.model')