# Word2Vec für nicht übersetzte Daten



Dieses Notebook erstellt die Word Embeddings für die nicht übersetzten Daten.
Schlussendlich werden die umgewandelten Dateneinträge in einem DataFrame gespeichert.

In [1]:
import os
import pandas as pd
import numpy as np
import nltk
import re
import gensim.models.word2vec as w2v
import spacy
import multiprocessing
from nltk.stem import WordNetLemmatizer
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

In [2]:
PATH = 'data/'

In [3]:
df_train = pd.read_csv(os.path.join(PATH, 'nlp_train.csv'))
df_val = pd.read_csv(os.path.join(PATH, 'nlp_valid.csv'))
df_test = pd.read_csv(os.path.join(PATH, 'test.csv'))

In [4]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ramon.koller\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ramon.koller\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
#Wir laden den Tokenizer und den Lemmatizer ins Notebook

In [6]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [7]:
lemmatizer = WordNetLemmatizer()

In [8]:
def sentence_to_wordlist(text):
    sentence = nlp(text)
    output = []
    for word in sentence:
      output.append(lemmatizer.lemmatize(word.text))
    return output

In [9]:
#Testen der Funktion
print(df_train.premise[0])
print(sentence_to_wordlist(df_train.premise[0]))

and these comments were considered in formulating the interim rules.
['and', 'these', 'comment', 'were', 'considered', 'in', 'formulating', 'the', 'interim', 'rule', '.']


In [10]:
#Wir fügen alle Prämissen und Hypothesen in einen Corpus zusammen

In [11]:
sentences = df_train.premise.values.tolist()
sentences.extend(df_train.hypothesis.values.tolist())
sentences.extend(df_val.premise.values.tolist())
sentences.extend(df_val.hypothesis.values.tolist())
sentences.extend(df_test.premise.values.tolist())
sentences.extend(df_test.hypothesis.values.tolist())

In [None]:
len(sentences)

34630

In [None]:
sentences[:10]

['and these comments were considered in formulating the interim rules.',
 'These are issues that we wrestle with in practice groups of law firms, she said. ',
 "Des petites choses comme celles-là font une différence énorme dans ce que j'essaye de faire.",
 "you know they can't really defend themselves like somebody grown uh say my age you know yeah",
 'ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสดงออกและได้เล่นหลายบทบาทไปพร้อมกัน ๆ อาจช่วยให้เด็กจับความคล้ายคลึงและความแตกต่างระหว่างผู้คนในด้านความปรารถนา ความเชื่อ และความรู้สึกได้',
 'Bir çiftlikte birisinin, ağıla kapatılmış bu öküzleri kesmeliyiz dediğini duyabilirsiniz bu muhtemelen şu anlama gelir, yüklenecek olanları ayırın.',
 'ریاست ہائے متحدہ امریکہ واپس آنے پر، ہج ایف بی آئی کے ایجنٹوں کے ذریعے ہوائی اڈے پر ملاقات کی، تحقیقات کی، اور اگلے دن وفاقی گرین جوری سے پہلے اسامہ بن لادن کی تحقیقات سے ملاقات کی.',
 "From Cockpit Country to St. Ann's Bay",
 "Look, it's your skin, but you're going to be in trouble if you don't get busy.

In [None]:
corpus = []
for sentence in sentences:
  corpus.append(sentence_to_wordlist(sentence))

In [None]:
token_count = sum([len(sentence) for sentence in corpus])
print('Unser Corpus enthält:', token_count,  'tokens')

Unser Corpus enthält: 537655 tokens


In [None]:
#In einem Vergleich haben wir das Embedding mit einem vortrainierten Word2Vec ausprobiert. Dann kann man das Training üerspringen
#nlp2vec = api.load("glove-wiki-gigaword-300")

In [None]:
#Nun können wir das W2V trainieren. Weiter unten kann man jedoch auch unsere bereits erstellten W2V einlesen

Training

In [None]:
nlp2vec = w2v.Word2Vec(
    sg=1,
    seed=1,
    workers=multiprocessing.cpu_count(),
    size=300,
    min_count=3,
    window=10,
    sample=1e-3
)

In [None]:
nlp2vec.build_vocab(corpus)

In [None]:
print("Word2Vec Vokabular:", len(nlp2vec.wv.vocab))

Word2Vec Vokabular: 21309


In [None]:
#train model on sentneces
nlp2vec.train(corpus, total_examples=nlp2vec.corpus_count, epochs=50)

(19524498, 26882750)

In [None]:
#Das Word2Vec wird gespeichert
#nlp2vec.save('W2V/nlp2vec_own.w2v')

In [14]:
len(nlp2vec.wv['computer'])

300

In [None]:
vocab = nlp2vec.wv.vocab.keys()

In [None]:
len(vocab)

21309

Umwandeln der Daten

In [None]:
#Nun tokenizen wir alle Prämissen/Hypothesen, wandeln jedes Wort in einen Vektor um und speichern die Sequenz als Liste

In [None]:
train_premises_embedded = []
for sentence in df_train.premise.values:
  train_premises_embedded.append(sentence_to_wordlist(sentence))

In [None]:
X_train_premises = []
for sentences in train_premises_embedded:
  sentence = []
  for i in sentences:
    if i in vocab:
      sentence.append(nlp2vec.wv[i])
  X_train_premises.append(np.array(sentence))

In [None]:
train_hypothesis_embedded = []
for sentence in df_train.hypothesis.values:
  train_hypothesis_embedded.append(sentence_to_wordlist(sentence))

In [None]:
X_train_hypothesis = []
for sentences in train_hypothesis_embedded:
  sentence = []
  for i in sentences:
    if i in vocab:
      sentence.append(nlp2vec.wv[i])
  X_train_hypothesis.append(np.array(sentence))

In [None]:
val_premises_embedded = []
for sentence in df_val.premise.values:
  val_premises_embedded.append(sentence_to_wordlist(sentence))

In [None]:
X_val_premises = []
for sentences in val_premises_embedded:
  sentence = []
  for i in sentences:
    if i in vocab:
      sentence.append(nlp2vec.wv[i])
  X_val_premises.append(np.array(sentence))

In [None]:
val_hypothesis_embedded = []
for sentence in df_val.hypothesis.values:
  val_hypothesis_embedded.append(sentence_to_wordlist(sentence))

In [None]:
X_val_hypothesis = []
for sentences in val_hypothesis_embedded:
  sentence = []
  for i in sentences:
    if i in vocab:
      sentence.append(nlp2vec.wv[i])
  X_val_hypothesis.append(np.array(sentence))

In [None]:
test_premises_embedded = []
for sentence in df_test.premise.values:
  test_premises_embedded.append(sentence_to_wordlist(sentence))

In [None]:
X_test_premises = []
for sentences in test_premises_embedded:
  sentence = []
  for i in sentences:
    if i in vocab:
      sentence.append(nlp2vec.wv[i])
  X_test_premises.append(np.array(sentence))

In [None]:
test_hypothesis_embedded = []
for sentence in df_test.hypothesis.values:
  test_hypothesis_embedded.append(sentence_to_wordlist(sentence))

In [None]:
X_test_hypothesis = []
for sentences in test_hypothesis_embedded:
  sentence = []
  for i in sentences:
    if i in vocab:
      sentence.append(nlp2vec.wv[i])
  X_test_hypothesis.append(np.array(sentence))

In [None]:
#Wir generieren ein NP-Array aus den Listen

In [None]:
X_train_premises_vect = np.array(X_train_premises)
X_val_premises_vect = np.array(X_val_premises)
X_test_premises_vect = np.array(X_test_premises)
X_train_hypothesis_vect = np.array(X_train_hypothesis)
X_val_hypothesis_vect = np.array(X_val_hypothesis)
X_test_hypothesis_vect = np.array(X_test_hypothesis)

  X_train_premises_vect = np.array(X_train_premises)
  X_val_premises_vect = np.array(X_val_premises)
  X_test_premises_vect = np.array(X_test_premises)
  X_train_hypothesis_vect = np.array(X_train_hypothesis)
  X_val_hypothesis_vect = np.array(X_val_hypothesis)
  X_test_hypothesis_vect = np.array(X_test_hypothesis)


In [None]:
X_train_premises_vect.shape

(10908,)

In [None]:
#Auch die Labels speichern wir

In [None]:
y_train = df_train.label.values
y_val = df_val.label.values

In [None]:
#Wir fassen die Arrays in einem DataFrame zusammen, welches wir dann abspeichern

In [None]:
X_train = pd.DataFrame(columns=["premises", "hypothesis"])
for premise, hypothesis in zip(X_train_premises_vect, X_train_hypothesis_vect):
  X_train = X_train.append({'premises':premise, 'hypothesis':hypothesis}, ignore_index=True)

In [None]:
X_val = pd.DataFrame(columns=["premises", "hypothesis"])
for premise, hypothesis in zip(X_val_premises_vect, X_val_hypothesis_vect):
  X_val = X_val.append({'premises':premise, 'hypothesis':hypothesis}, ignore_index=True)

In [None]:
X_test = pd.DataFrame(columns=["premises", "hypothesis"])
for premise, hypothesis in zip(X_test_premises_vect, X_test_hypothesis_vect):
  X_test = X_test.append({'premises':premise, 'hypothesis':hypothesis}, ignore_index=True)

In [None]:
X_train.to_pickle('data_lstm/X_train.pkl')
X_val.to_pickle('data_lstm/X_val.pkl')
X_test.to_pickle('data_lstm/X_test.pkl')

In [None]:
np.save('data_lstm/y_train.npy', y_train, allow_pickle=True)
np.save('data_lstm/y_val.npy', y_val, allow_pickle=True)


In [None]:
X_train.head()

Unnamed: 0,premises,hypothesis
0,"[[-0.16239001, 0.4711363, 0.045901172, -0.0624...","[[0.17375958, 0.38324487, -0.18143673, -0.1660..."
1,"[[-0.20772755, 0.39440167, 0.19650969, -0.3755...","[[0.048674084, 0.34099966, -0.07691175, -0.388..."
2,"[[-0.23557667, 0.048206203, -0.07953441, -0.16...","[[-0.31674537, 0.024199734, 0.14628309, 0.0123..."
3,"[[-0.10270183, 0.06397351, 0.0023827245, 0.132...","[[0.14381126, 0.1928164, 0.7238733, 0.34675923..."
4,"[[-0.40571034, -0.07702727, 0.07719418, 0.2914...",[]


Checkpoints

In [13]:
#Laden des WordEmbeddings ohne Übersetzung
nlp2vec = w2v.Word2Vec.load('W2V/nlp2vec_own.w2v')

In [None]:
#Laden des WordEmbeddings mit Übersetzung
#nlp2vec = w2v.Word2Vec.load('W2V/nlp2vec_own_en.w2v')

Analyse

In [None]:
#Im Notebook vom Word2Vec_translate