In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from gensim.models import KeyedVectors, Word2Vec
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import PCA
from matplotlib import pyplot

In [None]:
# Location of the pretrained GoogleNews word2vec binary that is later merged
# into the forum-trained model.
embedding_path = (
    "../input/word2vec-google/"
    "GoogleNews-vectors-negative300.bin"
)

In [None]:
# Load only the "Message" column of the forum dump as strings.
# Rows with a missing message are dropped *before* the cast: astype('str') on a
# NaN yields the literal text "nan", which would otherwise be tokenized and
# (because the model below uses min_count=1) end up in the vocabulary as a
# spurious word.
forum_posts = (
    pd.read_csv("../input/meta-kaggle/ForumMessages.csv", usecols=["Message"])["Message"]
    .dropna()
    .astype('str')
)

In [None]:
# Lower-case every post, then split it into runs of word characters (\w+);
# anything that is not a word character acts as a separator and is dropped.
tokenizer = RegexpTokenizer(r'\w+')
data_tokenized = [
    tokenizer.tokenize(post.lower())
    for post in forum_posts.tolist()
]

In [None]:
# Fine-tune word vectors on the forum corpus, starting from the pretrained
# GoogleNews embeddings for every word the two vocabularies share.

# 300 dimensions to match the pretrained vectors; min_count=1 keeps every
# token that appears in the corpus, however rare.
model_2 = Word2Vec(
    size=300,
    min_count=1,
)

# The vocabulary must exist before pretrained vectors can be merged into it.
model_2.build_vocab(data_tokenized)
total_examples = model_2.corpus_count

# Overwrite the initial vectors of words that also occur in the pretrained
# file; lockf=1.0 leaves those imported vectors trainable.
model_2.intersect_word2vec_format(
    embedding_path,
    binary=True,
    lockf=1.0,
)

model_2.train(
    data_tokenized,
    total_examples=total_examples,
    epochs=5,
)

# TODO: filter down to Kaggle-specific words somehow and filter out stopwords.

In [None]:
# 3-D t-SNE projection of the leading vocabulary entries, drawn as an
# interactive plotly scatter plot with each word as hover text.
from plotly import offline
# NOTE(review): `py` is not used in this cell, and `plotly.plotly` no longer
# exists in plotly >= 4 -- drop this import if nothing else relies on it.
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from sklearn.manifold import TSNE

num_words_tsne = 1000  # number of leading vocabulary entries fed to t-SNE
num_words = 200        # number of those entries actually drawn
n_iters = 5000         # t-SNE optimisation iterations
tsne_seed = 42         # fixed seed so the projection is reproducible between runs

# Leading entries of the model's vocabulary index (presumably the most
# frequent words, given gensim's default vocabulary ordering -- confirm).
top_words = list(model_2.wv.index2word)[:num_words_tsne]

# Look the vectors up through `.wv`: indexing the Word2Vec model directly is a
# deprecated pass-through in gensim 3.x and is removed in gensim 4.
word_vecs = model_2.wv[top_words]

reducer = TSNE(n_components=3, n_iter=n_iters, random_state=tsne_seed)
reduced_dimensions = reducer.fit_transform(word_vecs)

init_notebook_mode(connected=True)

# t-SNE is fitted on `num_words_tsne` words, but only the first `num_words`
# rows are plotted to keep the figure readable.
# NOTE(review): the trace name "Original Question" looks carried over from
# another notebook -- confirm the intended label.
embeds = go.Scatter3d(
    name="Original Question",
    x=reduced_dimensions[0:num_words, 0],
    y=reduced_dimensions[0:num_words, 1],
    z=reduced_dimensions[0:num_words, 2],
    mode='markers',
    text=top_words[:num_words],
    marker=dict(
        size=12,
        line=dict(
            color='rgba(255, 0, 0, 0.14)',
            width=0.1
        ),
        opacity=1.0
    )
)
data = [embeds]
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='simple-3d-scatter')

In [None]:
# Query via the KeyedVectors (`.wv`); Word2Vec.most_similar is deprecated in
# gensim 3.x and removed in gensim 4.
model_2.wv.most_similar("rnn")

In [None]:
# Query via the KeyedVectors (`.wv`); Word2Vec.most_similar is deprecated in
# gensim 3.x and removed in gensim 4.
model_2.wv.most_similar("augmentation")

In [None]:
# Query via the KeyedVectors (`.wv`); Word2Vec.most_similar is deprecated in
# gensim 3.x and removed in gensim 4.
model_2.wv.most_similar("kaggle")

In [None]:
# Query via the KeyedVectors (`.wv`); Word2Vec.most_similar is deprecated in
# gensim 3.x and removed in gensim 4.
model_2.wv.most_similar("gm")

In [None]:
# Export only the word vectors (not the full trainable model) in the binary
# word2vec format, written to the notebook's working directory.
model_2.wv.save_word2vec_format(
    "kaggleword2vec.bin",
    binary=True,
)