In [2]:
import pickle
import re

import numpy as np
from collections import Counter
from collections import defaultdict  # For word frequency

from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from gensim.models.callbacks import CallbackAny2Vec
import spacy

import datetime
from time import time
import multiprocessing


import logging
# Grab the root logger and lower its threshold to DEBUG so that debug-level
# records are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Emit one record as a smoke test of the logging setup.
logging.debug("test")

DEBUG:root:test


In [3]:
# NOTE(review): `tf` is never imported in the visible cells, so this raises
# NameError on a fresh kernel — presumably `import tensorflow as tf` was run
# in a since-removed cell; confirm, or drop this cell.
tf. __version__

'2.8.0'

In [None]:
# Load the pickled dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('AMM_A350.pkl', 'rb') as f:
    data = pickle.load(f)
    
#sentences = data['PARAfull'][:20]
#data
# 1226552 rows

# Keep only the rows whose 'POStxt' value is not missing, then display the result.
data = data[data['POStxt'].notna()]
data

In [None]:
# Attach to every row the number of rows that share its 'POStxt' value.
# `assign` returns a new frame instead of writing a column into `data` in place:
# `data` is the result of a boolean filter, and in-place column assignment on
# such a frame triggers pandas' SettingWithCopyWarning.
data = data.assign(Count=data.groupby("POStxt")["POStxt"].transform('count'))

# Keep one row per distinct 'POStxt' value and extract the text and its
# occurrence count as two parallel lists.
data_reduced = data.drop_duplicates(['POStxt'])
sentences = list(data_reduced['POStxt'])
count = list(data_reduced['Count'])

In [None]:
# Load a (sentences, counter) pair from a pickle file.
# NOTE(review): this rebinds `sentences`, discarding the list built from
# data_reduced['POStxt'] in the previous cell — confirm that is intended.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('sentences_with_eos_and_unk.pkl', 'rb') as f:
    sentences, counter = pickle.load(f)

In [None]:
# Corpus summary: sorted vocabulary, its size, and the total number of word
# occurrences (the sum of all values stored in `counter`).
vocab = sorted(counter)
vocab_size = len(vocab)
num_of_words = sum(counter.values())

for message, value in (
    ('There are %d sentences in our dataset.', len(sentences)),
    ('There are %d total words in our dataset.', num_of_words),
    ('There are %d unique words in our dataset.', vocab_size),
):
    print(message % value)

## Preprocessing data: keep stop words

In [None]:
def cleaning_keep_stopwords(doc):
    """Lemmatize a document while keeping its stopwords.

    Args:
        doc: an iterable of tokens exposing a ``lemma_`` attribute
            (a spaCy ``Doc`` object).

    Returns:
        The lemmas joined by single spaces, or ``None`` when the document
        has two or fewer tokens. Word2Vec learns a word's vector from its
        context words, so such short sentences add very little to training.
    """
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)

    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

# Load spaCy's 'en_core_web_sm' pipeline with the 'ner' and 'parser' components disabled.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

In [None]:
# TODO: consider adapting the preprocessing so that <eos> and <unk> are handled properly,
# either with a custom pipeline component (https://spacy.io/api/language#add_pipe) or by
# re-running the sentences without the '<' and '>' characters.
# Because <eos> and <unk> must be kept, the regex-based brief cleaning below is disabled
# for now. Note also that nlp.pipe no longer uses the n_threads argument.
#brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in sentences)
#txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

t = time()

# Lower-case every sentence lazily, stream the result through spaCy in batches,
# and lemmatize each parsed document. Entries are None wherever
# cleaning_keep_stopwords rejects a sentence as too short.
brief_cleaning = (str(row).lower() for row in sentences)
docs_keep_stopwords = nlp.pipe(brief_cleaning, batch_size=5000)
txt_keep_stopwords = [cleaning_keep_stopwords(doc) for doc in docs_keep_stopwords]

elapsed_mins = round((time() - t) / 60, 2)
print('Time to clean up everything: {} mins'.format(elapsed_mins))

In [None]:
# Tokenize each cleaned sentence on whitespace.
# cleaning_keep_stopwords returns None for sentences with two or fewer tokens;
# those entries are skipped here, otherwise `.split()` raises AttributeError on None.
txt_split_keep_stopwords = [row.split() for row in txt_keep_stopwords if row is not None]

In [None]:
#print(txt_split[:10])

In [None]:
#import logging
#logger = logging.getLogger()
#logger.setLevel(logging.DEBUG)
#logging.debug("test")

# Collect phrase (bigram) statistics over the tokenized keep-stopwords sentences.
# Per the gensim docs, min_count=30 ignores words and bigrams whose total count
# is below 30; progress_per presumably sets how often progress is logged — TODO confirm.
phrases_keep_stopwords = Phrases(txt_split_keep_stopwords, min_count=30, progress_per=10000)
print(phrases_keep_stopwords)

In [None]:
# Wrap the fitted Phrases model in a Phraser, the object used below to
# transform the sentences.
bigram_keep_stopwords = Phraser(phrases_keep_stopwords)

In [None]:
# Apply the bigram model to the whole tokenized keep-stopwords corpus.
txt_keep_stopwords_final = bigram_keep_stopwords[txt_split_keep_stopwords]

In [None]:
# Peek at the first three processed sentences, then report the corpus size.
for idx in range(3):
    print(txt_keep_stopwords_final[idx])
print(len(txt_keep_stopwords_final))

In [None]:
# Tally how often every token occurs in the processed keep-stopwords corpus.
word_freq_keep_stopwords = defaultdict(int)
for sentence in txt_keep_stopwords_final:
    for token in sentence:
        word_freq_keep_stopwords[token] += 1

# Number of distinct tokens seen fewer than 5 times.
count_less_freq_words = sum(1 for freq in word_freq_keep_stopwords.values() if freq < 5)

print('There are %d words in our dataset after preprocessing (keep stopwords, lemmatize and merge bigrams),' % len(word_freq_keep_stopwords))
print('and now there are %d words appearing less than 5 times.' % count_less_freq_words)

## Preprocessing data: remove stop words

In [None]:
def cleaning_remove_stopwords(doc):
    """Lemmatize a document and drop its stopwords.

    Args:
        doc: an iterable of tokens exposing ``lemma_`` and ``is_stop``
            attributes (a spaCy ``Doc`` object).

    Returns:
        The lemmas of the non-stopword tokens joined by single spaces, or
        ``None`` when two or fewer tokens remain. Word2Vec learns a word's
        vector from its context words, so such short sentences add very
        little to training.
    """
    kept_lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)

    if len(kept_lemmas) <= 2:
        return None
    return ' '.join(kept_lemmas)

# Load spaCy's 'en_core_web_sm' pipeline with the 'ner' and 'parser' components disabled.
# NOTE(review): identical to the `nlp` load in the keep-stopwords section; this
# reload looks redundant when the notebook is run top to bottom — confirm.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

In [None]:
# TODO: consider adapting the preprocessing so that <eos> and <unk> are handled properly,
# either with a custom pipeline component (https://spacy.io/api/language#add_pipe) or by
# re-running the sentences without the '<' and '>' characters.
# Because <eos> and <unk> must be kept, the regex-based brief cleaning below is disabled
# for now. Note also that nlp.pipe no longer uses the n_threads argument.
#brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in sentences)
#txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

t = time()

# Lower-case every sentence lazily, stream the result through spaCy in batches,
# and lemmatize each parsed document without its stopwords. Entries are None
# wherever cleaning_remove_stopwords rejects a sentence as too short.
brief_cleaning = (str(row).lower() for row in sentences)
docs_remove_stopwords = nlp.pipe(brief_cleaning, batch_size=5000)
txt_remove_stopwords = [cleaning_remove_stopwords(doc) for doc in docs_remove_stopwords]

elapsed_mins = round((time() - t) / 60, 2)
print('Time to clean up everything: {} mins'.format(elapsed_mins))

In [None]:
# Tokenize each cleaned sentence on whitespace.
# cleaning_remove_stopwords returns None when two or fewer tokens remain;
# those entries are skipped here, otherwise `.split()` raises AttributeError on None.
txt_split_remove_stopwords = [row.split() for row in txt_remove_stopwords if row is not None]

In [None]:
# Show the first ten tokenized sentences as a sanity check.
print(txt_split_remove_stopwords[:10])

In [None]:
#import logging
#logger = logging.getLogger()
#logger.setLevel(logging.DEBUG)
#logging.debug("test")

# Collect phrase (bigram) statistics over the tokenized remove-stopwords sentences.
# Per the gensim docs, min_count=30 ignores words and bigrams whose total count
# is below 30; progress_per presumably sets how often progress is logged — TODO confirm.
phrases_remove_stopwords = Phrases(txt_split_remove_stopwords, min_count=30, progress_per=10000)
print(phrases_remove_stopwords)

In [None]:
# Wrap the fitted Phrases model in a Phraser, the object used below to
# transform the sentences.
bigram_remove_stopwords = Phraser(phrases_remove_stopwords)

In [None]:
# Apply the bigram model to the whole tokenized remove-stopwords corpus.
txt_remove_stopwords_final = bigram_remove_stopwords[txt_split_remove_stopwords]

In [None]:
# Peek at the first three processed sentences, then report the corpus size.
for idx in range(3):
    print(txt_remove_stopwords_final[idx])
print(len(txt_remove_stopwords_final))

In [None]:
# Tally how often every token occurs in the processed remove-stopwords corpus.
word_freq_remove_stopwords = defaultdict(int)
for sentence in txt_remove_stopwords_final:
    for token in sentence:
        word_freq_remove_stopwords[token] += 1

# Number of distinct tokens seen fewer than 5 times.
count_less_freq_words = sum(1 for freq in word_freq_remove_stopwords.values() if freq < 5)

print('There are %d words in our dataset after preprocessing (remove stopwords, lemmatize and merge bigrams).' % len(word_freq_remove_stopwords))
#print('and now there are %d words appearing less than 5 times.' % count_less_freq_words)

In [None]:
# Tokens produced by the remove-stopwords pipeline that are absent from the
# original vocabulary. The set is built straight from word_freq_remove_stopwords
# because set_remove_stop is only defined in a later cell, so referencing it
# here fails with NameError when the notebook is run top to bottom.
set(word_freq_remove_stopwords) - set(vocab)

## Compare both vocabularies

In [None]:
# Order both frequency tables from most to least frequent token and collect
# the corresponding vocabularies as sets.
dict_keep_stop_words = dict(
    sorted(word_freq_keep_stopwords.items(), key=lambda item: item[1], reverse=True)
)
set_keep_stop = set(dict_keep_stop_words)

dict_remove_stop_words = dict(
    sorted(word_freq_remove_stopwords.items(), key=lambda item: item[1], reverse=True)
)
set_remove_stop = set(dict_remove_stop_words)

In [None]:
# Compare the two vocabularies: their sizes and what each has that the other lacks.
only_with_stopwords = set_keep_stop - set_remove_stop
only_without_stopwords = set_remove_stop - set_keep_stop

print('There are %d words in vocab if we keep stop, and %d if we remove them.' % (len(set_keep_stop), len(set_remove_stop)) )
print('In particular, %d words appear in the set WITH stopwords that disappear when we remove the stopwords'
      % len(only_with_stopwords) )
print('However, %d new words appear in the set without stopwords. These are mostly bigrams obtained after removing the stopwords.'
      % len(only_without_stopwords) )

In [None]:
# set_keep_stop - set_remove_stop

# Without stop words there are many bigrams that do not appear in the set with stop words:
#set_remove_stop - set_keep_stop

## Train model: keep stop words

In [None]:
# 1) Define model

#loss_logger = LossLogger()

cores = multiprocessing.cpu_count()

# Untrained Word2Vec model for the keep-stopwords corpus (vocabulary and
# training follow in the next cells). Parameter meanings per the gensim docs:
#   min_count   - ignore words with total frequency below this value
#   window      - maximum distance between the current and predicted word
#   vector_size - dimensionality of the word vectors
#   sample      - threshold for down-sampling high-frequency words
#   alpha       - initial learning rate; min_alpha - rate it decays to
#   negative    - number of noise words drawn for negative sampling
# One core is left free, but the worker count is clamped to at least 1:
# on a single-core machine `cores - 1` would be 0, leaving no training threads.
w2v_model_keep_stopwords = Word2Vec(min_count=5,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007,
                     negative=20,
                     workers=max(1, cores - 1))

In [None]:
# 2) Build vocab


t = time()

# Scan the keep-stopwords corpus once to build the model's vocabulary.
w2v_model_keep_stopwords.build_vocab(txt_keep_stopwords_final, progress_per=10000)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_mins))

In [None]:
# Report how many words the keep-stopwords model holds in its vocabulary.
vocab_size_keep_stopwords = len(w2v_model_keep_stopwords.wv.key_to_index)
print('There are %d words in our dataset after building vocab with min_count = 5.'
      % vocab_size_keep_stopwords)
#print(w2v_model)
#print(list(w2v_model.wv.key_to_index.keys()))

In [None]:
# 3) Train model

t = time()

# Train for 30 epochs on the keep-stopwords corpus, using the sentence count
# recorded by build_vocab as the number of examples.
w2v_model_keep_stopwords.train(
    txt_keep_stopwords_final,
    total_examples=w2v_model_keep_stopwords.corpus_count,
    epochs=30,
    report_delay=1,
)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))

## Train model: remove stop words

In [None]:
# 1) Define model

#loss_logger = LossLogger()

cores = multiprocessing.cpu_count()

# Untrained Word2Vec model for the remove-stopwords corpus (vocabulary and
# training follow in the next cells). Parameter meanings per the gensim docs:
#   min_count   - ignore words with total frequency below this value
#   window      - maximum distance between the current and predicted word
#   vector_size - dimensionality of the word vectors
#   sample      - threshold for down-sampling high-frequency words
#   alpha       - initial learning rate; min_alpha - rate it decays to
#   negative    - number of noise words drawn for negative sampling
# One core is left free, but the worker count is clamped to at least 1:
# on a single-core machine `cores - 1` would be 0, leaving no training threads.
w2v_model_remove_stopwords = Word2Vec(min_count=5,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007,
                     negative=20,
                     workers=max(1, cores - 1))

In [None]:
# 2) Build vocab


t = time()

# Scan the remove-stopwords corpus once to build the model's vocabulary.
w2v_model_remove_stopwords.build_vocab(txt_remove_stopwords_final, progress_per=10000)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_mins))

In [None]:
# Report how many words the remove-stopwords model holds in its vocabulary.
vocab_size_remove_stopwords = len(w2v_model_remove_stopwords.wv.key_to_index)
print('There are %d words in our dataset after building vocab with min_count = 5.'
      % vocab_size_remove_stopwords)
#print(w2v_model)
#print(list(w2v_model.wv.key_to_index.keys()))

In [None]:
# 3) Train model

t = time()

# Train for 30 epochs on the remove-stopwords corpus, using the sentence count
# recorded by build_vocab as the number of examples.
w2v_model_remove_stopwords.train(
    txt_remove_stopwords_final,
    total_examples=w2v_model_remove_stopwords.corpus_count,
    epochs=30,
    report_delay=1,
)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))

## Evaluate model

In [None]:
#w2v_model_keep_stopwords.wv.most_similar(positive=["install"])

In [None]:
# Look up the learned vector for "install" in the remove-stopwords model.
w2v_model_remove_stopwords.wv['install']

In [None]:
# Query the remove-stopwords model for the words most similar to "install".
w2v_model_remove_stopwords.wv.most_similar(positive=["install"])

In [None]:
#w2v_model_keep_stopwords.wv.most_similar(positive=["press"])

In [None]:
# Query the remove-stopwords model for the words most similar to "press".
w2v_model_remove_stopwords.wv.most_similar(positive=["press"])

In [None]:
# Similarity score between "put" and "tool" according to the keep-stopwords model.
# NOTE(review): presumably fails if either word was dropped by min_count — confirm both are in the vocabulary.
w2v_model_keep_stopwords.wv.similarity("put", 'tool')

In [None]:
# Similarity score between "cabin" and "cockpit" according to the remove-stopwords model.
# NOTE(review): presumably fails if either word was dropped by min_count — confirm both are in the vocabulary.
w2v_model_remove_stopwords.wv.similarity("cabin", 'cockpit')

In [None]:
# Ask the keep-stopwords model which of the three words does not fit with the others.
w2v_model_keep_stopwords.wv.doesnt_match(['put', 'install', 'spacer'])