In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec


### Load the text8 corpus and train our Word2Vec model

In [None]:
# Fetch the `text8` corpus through gensim's downloader, then fit a Word2Vec
# model on it. No hyperparameters are passed, so the library defaults apply.
corpus = api.load('text8')
model = Word2Vec(sentences=corpus)

In [None]:
# Vocabulary size of the trained model.
# `wv.vectors` holds one row per vocabulary word, so its length is the word
# count. It is used instead of `wv.vocab` because that attribute was removed
# in gensim 4, while `wv.vectors` is available in both gensim 3.x and 4.x.
len(model.wv.vectors)

In [None]:
# Display the learned embedding vector for the word "woman".
model.wv["woman"]

In [None]:
# Shape of a single word embedding, i.e. the dimensionality of the vectors.
model.wv["woman"].shape

In [None]:
# Classic analogy test: which single word is closest to woman + king - man?
analogy_query = {
    "positive": ['woman', 'king'],
    "negative": ['man'],
    "topn": 1,
}
result = model.wv.most_similar(**analogy_query)
print(result)

In [None]:
from sklearn.manifold import TSNE

# Words to plot, plus the analogy vector: the mean of the three signed terms
# woman, king and -man.
plot_words = ["king", "queen", "woman", "man"]
analogy_vec = (model.wv["woman"] + model.wv["king"] - model.wv["man"]) / 3
high_dim = np.array([model.wv[word] for word in plot_words] + [analogy_vec])

# t-SNE requires perplexity to be strictly smaller than the number of samples
# (recent scikit-learn versions raise otherwise), so cap it at n_samples - 1.
tsne = TSNE(perplexity=min(5, len(high_dim) - 1), random_state=55)
embedded = tsne.fit_transform(high_dim)

# `vecs` is assigned exactly once, as the final 2-D frame used for plotting.
vecs = pd.DataFrame(embedded, columns=["x", "y"])
vecs["colors"] = plot_words + ["woman+king-man"]
vecs

### Visualize our vectors

In [None]:
import altair as alt

# Scatter layer: one labelled point per projected vector.
base = alt.Chart(vecs).mark_circle(size=100).encode(
    x='x',
    y='y',
    color='colors',
    tooltip=["colors"]
)

# Overlay one line per vector, running from its point to the origin.
# The two-row frame is built directly instead of with `DataFrame.append`,
# which was deprecated in pandas 1.4 and removed in pandas 2.0.
for _, pos in vecs.iterrows():
    segment = pd.DataFrame({
        "x": [pos["x"], 0],
        "y": [pos["y"], 0],
        "colors": [pos["colors"], pos["colors"]],
    })
    line = alt.Chart(segment).mark_line().encode(
        x='x',
        y='y',
        color='colors',
    )
    base += line

base.interactive()

### Train on Turkish data

#### Preprocess (for Word2Vec training)

In [None]:
# Read the review dataset; the file is decoded as UTF-16.
reviews_csv = "../input/duygu-analizi-icin-urun-yorumlari/magaza_yorumlari_duygu_analizi.csv"
df = pd.read_csv(reviews_csv, encoding="utf-16")
df.head()

In [None]:
import string

# Build the Word2Vec training corpus: one list of lowercase tokens per review.
sents = []
for review in df["Görüş"]:
    try:
        # Drop ASCII punctuation character by character.
        cleaned = "".join(ch for ch in review if ch not in string.punctuation)
    except TypeError:
        # Non-iterable cells (e.g. a missing review) cannot be tokenized; skip.
        continue
    sents.append([token.lower() for token in cleaned.split(" ")])

Here we removed punctuation from each review, split it into tokens on spaces, and lowercased every token. Rows without a usable review text are skipped.

In [None]:
# Peek at the first tokenized review to sanity-check the preprocessing.
sents[0]

In [None]:
# Train Word2Vec on the tokenized Turkish reviews.
# The embedding dimension is left at the library default of 100. Its keyword
# is `size` in gensim 3.x and `vector_size` in gensim 4.x, so omitting it
# keeps the same 100-d vectors while letting the cell run on either version.
# To change the dimension, pass the keyword that matches the installed gensim.
tr_model = Word2Vec(sentences=sents, window=4, min_count=5, workers=4)


Size of the resulting vocabulary:

In [None]:
# Vocabulary size of the Turkish model.
# `wv.vectors` holds one row per vocabulary word, so its length is the word
# count. It is used instead of `wv.vocab` because that attribute was removed
# in gensim 4, while `wv.vectors` is available in both gensim 3.x and 4.x.
len(tr_model.wv.vectors)

#### Preprocess (into vectors)

In [None]:
# Turn every review into one fixed-length feature vector: the mean of the
# embeddings of its tokens. Tokens missing from the vocabulary contribute a
# zero vector.

# Select the rows that actually contain review text up front. Filtering with
# a mask yields a new `labels` Series, so `df` is never mutated and the cell
# can be re-run safely.
has_text = df["Görüş"].map(lambda value: isinstance(value, str))
labels = df.loc[has_text, "Durum"]

# Take the dimensionality from the model rather than hard-coding it.
embedding_dim = tr_model.wv.vector_size

X_data = []
for review in df.loc[has_text, "Görüş"]:
    cleaned = "".join(ch for ch in review if ch not in string.punctuation)
    # Lowercase the tokens exactly as the Word2Vec training corpus was built;
    # otherwise any capitalised word would miss the vocabulary.
    tokens = [token.lower() for token in cleaned.split(" ")]
    word_vecs = [
        tr_model.wv[token] if token in tr_model.wv else np.zeros(embedding_dim)
        for token in tokens
    ]
    X_data.append(np.mean(word_vecs, axis=0))

X_data = np.array(X_data)
print(X_data.shape, labels.shape)

#### Split into train/test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation. The seed is fixed so the split,
# and therefore the reported accuracy, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, labels, test_size=0.2, random_state=55
)

#### Classify

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the averaged word vectors. The seed is fixed so
# repeated runs build the same forest and give comparable scores.
clf = RandomForestClassifier(random_state=55).fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Score the classifier on the held-out reviews.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {test_accuracy}")