In [2]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

from unidecode import unidecode

#import spacy 

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [3]:
# Load the raw dataset, drop rows with missing values and exact duplicates,
# renumber the index and expose the `description` column under the name `title`.
file = pd.read_csv("polish_sentiment_dataset.csv")
file_cleaned = (
    file
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'description': 'title'})
)

In [5]:
# Share of each `rate` value among all rows of the cleaned frame.
file_cleaned["rate"].value_counts().div(len(file_cleaned))

 1.0    0.984766
-1.0    0.015233
 0.0    0.000002
Name: rate, dtype: float64

In [6]:
# Inspect the rows whose rate is exactly 0.
file_cleaned.loc[file_cleaned["rate"] == 0]

Unnamed: 0,title,length,rate
3,0,0.0,0.0


In [7]:
# Keep only rows with a non-zero rate.
# .copy() makes the result an independent frame: a later cell assigns to
# file_cleaned.title, and doing that on a boolean-mask slice of the previous
# frame triggers pandas' SettingWithCopyWarning.
file_cleaned = file_cleaned[file_cleaned.rate != 0].copy()

In [8]:
# Share of each remaining `rate` value, now that the rate == 0 rows are gone.
file_cleaned["rate"].value_counts().div(len(file_cleaned))

 1.0    0.984767
-1.0    0.015233
Name: rate, dtype: float64

In [10]:
def text_to_word_list(text, remove_polish_letters):
    '''Pre-process a text and convert it to a list of lower-case tokens.

    Method inspired by the one from the eliorc GitHub repo:
    https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

    Parameters
    ----------
    text : object
        Raw text. It is passed through ``str()`` first, so non-string values
        (e.g. numbers) are accepted as well.
    remove_polish_letters : callable
        Function applied to the stringified text before any other cleaning
        step; its result is stringified again before further processing.

    Returns
    -------
    list of str
        Tokens obtained by splitting the cleaned text on whitespace.
        ``!`` and ``?`` become separate tokens, ``+`` becomes ``plus``, and
        commas, periods and apostrophes act as separators.
    '''
    # Stringify BEFORE transliterating so the callable always receives a str,
    # then stringify its result as well (kept from the original behaviour).
    text = str(remove_polish_letters(str(text)))
    text = text.lower()

    # Clean the text: every character outside the whitelist becomes a space.
    # This also removes ':', so no separate colon handling is needed afterwards.
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)

    # split() without arguments already collapses runs of whitespace.
    return text.split()

In [11]:
# Replace every title with its token list; unidecode is passed as the
# `remove_polish_letters` step of text_to_word_list.
file_cleaned["title"] = file_cleaned["title"].apply(
    lambda raw_title: text_to_word_list(raw_title, unidecode)
)

In [12]:
# Work on a copy of the cleaned frame and keep only the titles that contain
# more than one token (title holds token lists, so .str.len() counts tokens).
has_several_tokens = file_cleaned["title"].str.len() > 1
file_model = file_cleaned.copy().loc[has_several_tokens]

In [13]:
# Detect bigrams in the tokenised titles and build the bigram-transformed
# corpus that the Word2Vec cells consume.
sent = list(file_model["title"])
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]

# Show one transformed title as a sanity check.
sentences[1]

INFO - 20:51:06: collecting all words and their counts
INFO - 20:51:06: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 20:51:07: PROGRESS: at sentence #50000, processed 611860 words and 169257 word types
INFO - 20:51:08: PROGRESS: at sentence #100000, processed 1200585 words and 265494 word types
INFO - 20:51:10: PROGRESS: at sentence #150000, processed 1811398 words and 347364 word types
INFO - 20:51:11: PROGRESS: at sentence #200000, processed 2470241 words and 436886 word types
INFO - 20:51:12: PROGRESS: at sentence #250000, processed 3058090 words and 509631 word types
INFO - 20:51:13: PROGRESS: at sentence #300000, processed 3656884 words and 571393 word types
INFO - 20:51:14: PROGRESS: at sentence #350000, processed 4268981 words and 638483 word types
INFO - 20:51:15: PROGRESS: at sentence #400000, processed 5028018 words and 758229 word types
INFO - 20:51:17: PROGRESS: at sentence #450000, processed 5756675 words and 869938 word types
INFO - 20:51:18: PROGRE

['bardzo',
 'dobra_komunikacja',
 'sms',
 'i',
 'telefoniczna',
 'zamowiony',
 'towar',
 'wyslany',
 'w',
 'terminie',
 'dobrze_zabezpieczony',
 'polecam',
 'ten',
 'sklep']

- `min_count = 3` - ignore words that occur fewer than 3 times in the corpus; this keeps rare noise such as 'ssssuuuuuuuppppppeeeeeerrrr' (an elongated spelling of 'super') out of the vocabulary, since such tokens do not need embeddings of their own
- `window = 4` - the Word2Vec model learns to predict a given word from up to 4 words to its left and up to 4 words to its right
- `size = 300` - size of the hidden layer used to predict the surroundings of the embedded word, which is also the dimensionality of the trained embeddings
- `sample = 1e-5` - threshold for subsampling: the most frequent words are randomly dropped from the surroundings of the embedded word, more aggressively the further their frequency exceeds this baseline
- `negative = 20` - number of negative words (ones that should not have been predicted for the selected pair of words) whose weights are updated on each training example, along with the weights of the positive word

In [14]:
# Leave one core free for the rest of the system, but never request fewer than
# one worker thread: on a single-core machine cpu_count() - 1 would be 0.
n_workers = max(1, multiprocessing.cpu_count() - 1)

# Hyperparameters are explained in the notes above; alpha / min_alpha are the
# initial and final learning rates.
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=n_workers)

start = time()

# Scan the bigram-transformed corpus once to build the vocabulary.
w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 20:51:47: collecting all words and their counts
INFO - 20:51:47: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 20:51:49: PROGRESS: at sentence #50000, processed 523298 words, keeping 32119 word types
INFO - 20:51:51: PROGRESS: at sentence #100000, processed 1028955 words, keeping 46859 word types
INFO - 20:51:54: PROGRESS: at sentence #150000, processed 1551617 words, keeping 57721 word types
INFO - 20:51:56: PROGRESS: at sentence #200000, processed 2114830 words, keeping 69151 word types
INFO - 20:51:58: PROGRESS: at sentence #250000, processed 2617292 words, keeping 78351 word types
INFO - 20:51:59: PROGRESS: at sentence #300000, processed 3121588 words, keeping 85531 word types
INFO - 20:52:01: PROGRESS: at sentence #350000, processed 3640071 words, keeping 93225 word types
INFO - 20:52:03: PROGRESS: at sentence #400000, processed 4287249 words, keeping 107299 word types
INFO - 20:52:06: PROGRESS: at sentence #450000, processed 4905897 words, keepin

Time to build vocab: 0.7 mins


In [15]:
# Train for 30 epochs on the bigram-transformed corpus, logging progress
# every second (report_delay=1), and report the wall-clock time in minutes.
start = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_minutes = round((time() - start) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_minutes))

# NOTE(review): per the gensim 3.x docs, replace=True keeps only the normalised
# vectors, so the model cannot be trained further afterwards - confirm intended.
w2v_model.init_sims(replace=True)

INFO - 20:52:29: training model with 3 workers on 63643 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4
INFO - 20:52:30: EPOCH 1 - PROGRESS: at 3.56% examples, 60557 words/s, in_qsize 0, out_qsize 0
INFO - 20:52:31: EPOCH 1 - PROGRESS: at 7.68% examples, 58097 words/s, in_qsize 1, out_qsize 0
INFO - 20:52:32: EPOCH 1 - PROGRESS: at 11.27% examples, 52567 words/s, in_qsize 0, out_qsize 0
INFO - 20:52:33: EPOCH 1 - PROGRESS: at 14.34% examples, 50268 words/s, in_qsize 0, out_qsize 0
INFO - 20:52:35: EPOCH 1 - PROGRESS: at 17.69% examples, 49875 words/s, in_qsize 0, out_qsize 0
INFO - 20:52:36: EPOCH 1 - PROGRESS: at 21.74% examples, 50974 words/s, in_qsize 0, out_qsize 0
INFO - 20:52:37: EPOCH 1 - PROGRESS: at 25.73% examples, 51718 words/s, in_qsize 1, out_qsize 0
INFO - 20:52:38: EPOCH 1 - PROGRESS: at 29.33% examples, 52630 words/s, in_qsize 1, out_qsize 0
INFO - 20:52:39: EPOCH 1 - PROGRESS: at 33.35% examples, 53334 words/s, in_qsize 0, out_qsize 0
INF

Time to train the model: 16.1 mins


In [17]:
# Sanity check: the 10 nearest neighbours of one vocabulary word.
w2v_model.wv.most_similar(
    positive=["chinszczyzna"],
    topn=10,
)

[('brudzi', 0.7345784306526184),
 ('cienki', 0.725319504737854),
 ('smierdzial', 0.7246689796447754),
 ('chemia', 0.7086781859397888),
 ('jablko', 0.7015652656555176),
 ('ozdoba', 0.6934407949447632),
 ('nadruk', 0.6909717321395874),
 ('rozpuszczalnosc', 0.690258264541626),
 ('matowa', 0.6873468160629272),
 ('popekana', 0.6872731447219849)]

In [21]:
# Persist the trained model to disk; as the log below shows, the large numpy
# arrays are written to separate .npy files next to word2vec.model.
w2v_model.save("word2vec.model")

INFO - 21:11:36: saving Word2Vec object under word2vec.model, separately None
INFO - 21:11:36: storing np array 'vectors' to word2vec.model.wv.vectors.npy
INFO - 21:11:36: not storing attribute vectors_norm
INFO - 21:11:36: storing np array 'syn1neg' to word2vec.model.trainables.syn1neg.npy
INFO - 21:11:37: not storing attribute cum_table
INFO - 21:11:37: saved word2vec.model


Exporting the preprocessed dataset for further steps (detected bigrams are merged into single tokens, e.g. `dobra_komunikacja`)

In [29]:
# Build the export frame in one step:
#   old_title - the original tokens joined with spaces,
#   title     - the tokens after the bigram transformation, joined with spaces.
file_export = file_model.assign(
    old_title=file_model["title"].str.join(' '),
    title=file_model["title"].apply(lambda tokens: ' '.join(bigram[tokens])),
)

In [31]:
# Write only the columns needed downstream, without the index.
file_export.to_csv(
    'cleaned_dataset.csv',
    columns=['title', 'old_title', 'rate'],
    index=False,
)