In [None]:
import os
import sys
import pandas as pd
import numpy as np

In [None]:
# Make the project's local packages importable: resolve both script
# directories to absolute paths and put them at the end of the module
# search path (TextProcessingScripts first, then Scripts).
text_processing_scripts_path = os.path.abspath('TextProcessingScripts')
scripts_path = os.path.abspath('Scripts')
sys.path.extend([text_processing_scripts_path, scripts_path])

In [None]:
from FeatureExtraction.VocabDict import create_vocab_dict

In [None]:
# Absolute path of the 'data' directory, resolved against the current working directory.
data_path = os.path.abspath('data')

In [None]:
# Read the training set, discard exact duplicate rows, and renumber the
# surviving rows 0..n-1 so that label-based and positional lookups agree.
train_data = (
    pd.read_csv(os.path.join(data_path, 'train.csv'))
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Mask the two company names inside every snippet with the fixed placeholders
# 'company1' / 'company2', then lower-case the text.
#
# The three columns are walked in lockstep with zip(), i.e. purely by position.
# This avoids one label lookup per column per row (train_data[col][i]) and no
# longer requires the frame's index to be exactly 0..n-1.
#
# NOTE(review): the two replacements run one after the other, so when the
# company1 name is contained in the company2 name, the company2 mention is
# partially rewritten as 'company1' first. Confirm whether this occurs in the data.
aliased_snippet = [
    snippet.replace(company1, 'company1').replace(company2, 'company2')
    for snippet, company1, company2 in zip(
        train_data['snippet'], train_data['company1'], train_data['company2']
    )
]
train_data['snippet'] = aliased_snippet

train_data['snippet'] = train_data['snippet'].str.lower()

In [None]:
# One row per (company1, company2, is_parent) combination; its 'snippet' cell
# holds the list of all snippets that belong to that combination.
grouped_train_data = (
    train_data
    .groupby(['company1', 'company2', 'is_parent'])['snippet']
    .apply(list)
    .to_frame()
    .reset_index()
)

In [None]:
def word_tokenizer(string):
    """Split ``string`` on runs of whitespace and return the tokens as a list.

    Leading and trailing whitespace is ignored, so an empty or
    all-whitespace input yields an empty list.
    """
    tokens = string.split()
    return tokens

In [None]:
# Tokenize every snippet exactly once; these ordered token lists are the
# sentence-level view of the corpus.
all_documents_tokenized_sentences_tokenized_words = [word_tokenizer(snippet) for snippet in train_data['snippet']]

# Unique tokens per snippet (each word appears at most once per snippet).
# sorted(set(...)) is used instead of list(set(...)) because the iteration
# order of a set of strings changes between interpreter runs (hash
# randomization); sorting keeps the lists identical on every
# Restart & Run All.
all_documents_tokenized_words = [sorted(set(tokens)) for tokens in all_documents_tokenized_sentences_tokenized_words]

In [None]:
# Sentence length = number of tokens in a snippet. The estimate is the value
# found 90% of the way through the ascending list of lengths.
sent_lens = sorted(len(sent) for sent in all_documents_tokenized_sentences_tokenized_words)
estimated_sent_len = sent_lens[int(len(sent_lens)*0.90)]

# Document length = number of snippets in a grouped row; same 90% rule.
doc_lens = sorted(map(len, grouped_train_data['snippet']))
estimated_doc_len = doc_lens[int(len(doc_lens)*0.90)]

In [None]:
# Build the word -> id mapping and its id -> word reverse from the per-snippet
# unique-token lists.
# NOTE(review): min_doc_count=50 presumably drops tokens that occur in fewer
# than 50 snippets -- confirm against create_vocab_dict.
vocab_dict, rev_vocab_dict = create_vocab_dict(all_documents_tokenized_words, min_doc_count=50)

In [None]:
from FeatureExtraction.UnknownWordsProcessing import UnknownWordsProcessing


In [None]:
from FeatureExtraction.Word2VecUtilities import create_word2vector_model, create_embeddings_matrix, save_word2vector_model, load_word2vector_model

In [None]:
# Run the tokenized snippets through the unknown-word processor (configured
# with the vocabulary keys) and train word vectors on the result.
# NOTE(review): replace=False presumably makes the processor drop
# out-of-vocabulary words rather than substitute a placeholder -- confirm in
# UnknownWordsProcessing.
unknown_words_processing = UnknownWordsProcessing(vocab_list=vocab_dict.keys(), replace=False)
w2v_training_sentences = unknown_words_processing.remove_or_replace_unkown_word_from_sentences(all_documents_tokenized_sentences_tokenized_words)
# NOTE(review): wv_size=50 is presumably the dimensionality of the word vectors -- confirm.
w2v_model = create_word2vector_model(w2v_training_sentences, wv_size=50)

In [None]:
from TensorflowInputProcessing.SentenceProcessing import SentenceProcessing
from TensorflowInputProcessing.DocumentProcessing import DocumentProcessing

In [None]:
# Helper objects whose pad_truncate_* methods are used below to bring sentences
# and documents to the estimated lengths.
sentence_processing = SentenceProcessing()
document_processing = DocumentProcessing()

In [None]:
# Tokenize each snippet of the first grouped document (row label 0).
tokenized_sentences_tokenized_words = list(map(word_tokenizer, grouped_train_data['snippet'][0]))

In [None]:
# Prepare the sample document for the model:
#   1) apply the same unknown-word handling that was applied to the word2vec
#      training sentences,
#   2) pad/truncate every sentence using estimated_sent_len, with 'my_dummy'
#      as the dummy token,
#   3) pad/truncate the document using estimated_doc_len (presumably the
#      number of sentences kept -- confirm in DocumentProcessing).
# NOTE(review): 'my_dummy' must be the same string that is later registered in
# vocab_dict, otherwise the padding token cannot be mapped to an id.
tokenized_sentences_tokenized_words = unknown_words_processing.remove_or_replace_unkown_word_from_sentences(tokenized_sentences_tokenized_words)
preprocessed_sentences_of_document = sentence_processing.pad_truncate_sent(tokenized_sentences_tokenized_words, estimated_sent_len,  dummy_token='my_dummy')
preprocessed_document = document_processing.pad_truncate_document(preprocessed_sentences_of_document, estimated_doc_len, estimated_sent_len)

In [None]:
# Build the embedding matrix from the trained word2vec model and the
# id -> word mapping, then append one all-zero row (presumably the vector of
# the 'my_dummy' padding token).
embedding_matrix = create_embeddings_matrix(w2v_model, rev_vocab_dict)
embedding_matrix = np.vstack((embedding_matrix, np.zeros((1, embedding_matrix.shape[1]))))

# Register the padding token under the next free id in both mappings.
# NOTE(review): this assumes embedding_matrix had exactly len(vocab_dict) rows
# before the zero row was appended, so that the new id indexes that row --
# confirm against create_embeddings_matrix.
# NOTE(review): this cell is not idempotent. Running it a second time moves
# 'my_dummy' to a new id in vocab_dict and adds a second reverse entry for it;
# re-run from the create_vocab_dict cell instead.
vocab_dict['my_dummy']=len(vocab_dict)
rev_vocab_dict[len(rev_vocab_dict)]='my_dummy'

In [None]:
from TensorflowInputProcessing.MapWordToID  import MapWordToID 

In [None]:
# Word -> id mapper built on vocab_dict (which at this point also contains 'my_dummy').
map_word_to_id = MapWordToID(vocab_dict)

In [None]:
# Convert the padded sample document from words to ids and pack it into a NumPy array.
# NOTE(review): presumably of shape (estimated_doc_len, estimated_sent_len) -- confirm.
id_array = np.asarray(map_word_to_id.word_lists_to_id_lists(preprocessed_document))

In [None]:
# Display the id array of the sample document for visual inspection.
id_array