In [0]:
%tensorflow_version 2.x

In [2]:
cd 'drive/My Drive/colab_data'

/content/drive/My Drive/colab_data


In [0]:
import tensorflow as tf
import pandas as pd
from collections import Counter
from string import punctuation
from sklearn.model_selection import train_test_split
import gensim
import numpy as np

In [0]:
quora = pd.read_csv('quora.csv')

In [0]:
def tokenize(text: str) -> list:
  tokens = text.lower().split()
  return [token.strip(punctuation) for token in tokens]

def filter_dict(d: dict, func: callable) -> dict:
    new_d = dict()
    for key, value in d.items():
        if func((key, value)):
            new_d[key] = value
    return new_d


In [0]:
quora['tokenized'] = quora.question_text.apply(tokenize)

In [0]:
def build_vocab(texts: list, min_count: int) -> set:
  
  vocab = Counter()

  for text in texts:
    vocab.update(text)

  return set(filter_dict(vocab, lambda x: x[1] > min_count))

In [0]:
def index_words(words: set) -> dict:
  d = {'UNK': 1, 'PAD': 0}
  for word in words:
    d[word] = len(d)
  return d

In [0]:
def index_text(text: list) -> list:
  return [word2id.get(token, 1) for token in text]

In [0]:
vocab = build_vocab(quora.tokenized.values, 100)

In [0]:
word2id = index_words(vocab)

In [0]:
id2word = {i: word for word, i in word2id.items()}

In [0]:
quora['indexed'] = quora.tokenized.apply(index_text)

In [0]:
max_len = max(quora.indexed.str.len())

In [0]:
X = tf.keras.preprocessing.sequence.pad_sequences(quora.indexed.values, maxlen=max_len)

In [0]:
y = quora.target.values

1) На одной из задач сравните, что лучше: а) конкатенировать эмбединги в один большой эмбединг, б) усреднять эмбединги, в) складывать эмбединги. Зафиксируйте разбиение на train_test через random_seed, чтобы обучаться и оцениваться на одних и тех же данных. Параметры нейронок не нужно делать одинаковыми. Просто попробуйте добиться максимума от каждого из подходов.

In [0]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, random_state=1)

In [0]:
def get_model(emb_transform: str):
  inputs = tf.keras.layers.Input(shape=(max_len,))
  embeddings = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=100)(inputs)
  if emb_transform == 'mean':
    transformed = tf.keras.layers.Lambda(lambda x: tf.keras.backend.mean(x,  axis=1))(embeddings)
  elif emb_transform == 'sum':
    transformed = tf.keras.layers.Lambda(lambda x: tf.keras.backend.sum(x,  axis=1))(embeddings)
  else:
    transformed = tf.keras.layers.Flatten()(embeddings)
    

  dense = tf.keras.layers.Dense(64, activation='relu')(transformed)
  outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)
  model = tf.keras.Model(inputs=inputs, outputs=outputs)
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
  return model

In [0]:
model_1 = get_model('mean')

In [70]:
%%time

model_1.fit(X_train, y_train, 
            validation_data=(X_valid, y_valid),
            batch_size=20000,
            epochs=10,
            callbacks=[tf.keras.callbacks.EarlyStopping(patience=2)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 6min 43s, sys: 7.91 s, total: 6min 51s
Wall time: 5min 7s


<tensorflow.python.keras.callbacks.History at 0x7f35fe00b240>

In [0]:
model_2 = get_model('sum')

In [72]:
%%time

model_2.fit(X_train, y_train, 
            validation_data=(X_valid, y_valid),
            batch_size=20000,
            epochs=10,
            callbacks=[tf.keras.callbacks.EarlyStopping(patience=2)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 6min 43s, sys: 7.37 s, total: 6min 50s
Wall time: 4min 57s


<tensorflow.python.keras.callbacks.History at 0x7f35fdffe9b0>

In [0]:
model_3 = get_model('flat')

In [74]:
%%time

model_3.fit(X_train, y_train, 
            validation_data=(X_valid, y_valid),
            batch_size=20000,
            epochs=10,
            callbacks=[tf.keras.callbacks.EarlyStopping(patience=2)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
CPU times: user 4min 47s, sys: 5.98 s, total: 4min 53s
Wall time: 3min 50s


<tensorflow.python.keras.callbacks.History at 0x7f35fe4c6550>

В датасете очень большой перевес классов, поэтому на метрику *accuracy* полагаться не стоит.

In [75]:
quora.target.value_counts()

0    1225312
1      80810
Name: target, dtype: int64

In [0]:
from sklearn.metrics import classification_report, f1_score

In [0]:
preds_1 = model_1.predict(X_valid).reshape(-1)
preds_2 = model_2.predict(X_valid).reshape(-1)
preds_3 = model_3.predict(X_valid).reshape(-1)

In [88]:
print(classification_report(y_valid, (preds_1 > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97     61198
           1       0.66      0.39      0.49      4109

    accuracy                           0.95     65307
   macro avg       0.81      0.69      0.73     65307
weighted avg       0.94      0.95      0.94     65307



In [89]:
print(classification_report(y_valid, (preds_2 > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97     61198
           1       0.73      0.38      0.50      4109

    accuracy                           0.95     65307
   macro avg       0.84      0.69      0.74     65307
weighted avg       0.95      0.95      0.94     65307



In [90]:
print(classification_report(y_valid, (preds_3 > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97     61198
           1       0.64      0.56      0.60      4109

    accuracy                           0.95     65307
   macro avg       0.80      0.77      0.79     65307
weighted avg       0.95      0.95      0.95     65307



In [91]:
f1_score(y_valid, (preds_1 > 0.5).astype(int))

0.4883792048929664

In [92]:
f1_score(y_valid, (preds_2 > 0.5).astype(int))

0.5002391200382592

In [93]:
f1_score(y_valid, (preds_3 > 0.5).astype(int))

0.5954079647165651

Видим, что лучший результат показывает сплющивание вектора. Еще нужно отметить, что перед тем, как я подкрутил какие-то параметры, у меня получалась лучше сумма, но мне не удалось понять в чем именно было дело.

2) В одной из задач используйте сразу две предобученные эмбединг модели (можно фастекст и ворд2век, а можно просто 2 фастекста с разными параметрами) в одной нейронке. Попробуйте усреднять эмбединги и конкатенировать (общие эмбединги потом в один вектор собирайте любым способом - например, тем который оказался лучше в первой задаче). Подсказка: используйте functional api, сделайте два Embedding слоя и к их выходам примените слой Concatenate([emb1, emb2)) или Average([emb1, emb2))

In [0]:
dim = 100

In [0]:
ft_1 = gensim.models.FastText(quora.tokenized.values, size=dim, iter=2, window=3)

In [0]:
# # # Используем CBOW, суммируем контекстные вектора, задаем минимальные нграммы, минимальную частоту
# ft_1 = gensim.models.FastText(quora.tokenized.values, size=dim, iter=5, window=3,
#                               min_count=30, workers=-1, sg=0, cbow_mean=0, min_n=3)

In [0]:
# # Используем Skipgram, меньшая минимальная частота, большее окно, negative sampling
# ft_2 = gensim.models.FastText(quora.tokenized.values, size=dim, iter=5, window=7,
#                               min_count=10, negative=10, workers=-1)

In [0]:
w2v = gensim.models.Word2Vec(quora.tokenized.values, size=dim, window=3, min_count=1, workers=-1)

In [0]:
def get_weights(word2id, model):

  dim = model.vector_size
  weights = np.zeros((len(word2id), dim))

  for word, i in word2id.items():
    if word == 'PAD':
        continue
    if word == 'UNK':
        weights[i] = np.random.normal(0, 2, dim) #поменять
    try:
        weights[i] = model.wv.get_vector(word)
    except KeyError:
      weights[i] = np.random.normal(0, 2, dim)
  
  return weights

    

In [0]:
inputs = tf.keras.layers.Input(shape=(max_len,))

embeddings_1 = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=100, trainable=False,
                                      weights=[get_weights(word2id, ft_1)])(inputs)
embeddings_2 = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=100, trainable=False,
                                      weights=[get_weights(word2id, w2v)])(inputs)


avg_emb = tf.keras.layers.Average()([embeddings_1, embeddings_2])
mean = tf.keras.layers.Lambda(lambda x: tf.keras.backend.sum(x,  axis=1))(avg_emb)
dense = tf.keras.layers.Dense(64, activation='relu')(mean)
dense = tf.keras.layers.Dense(64, activation='relu')(dense)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model_4 = tf.keras.Model(inputs=inputs, outputs=outputs)
model_4.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [48]:
%%time

model_4.fit(X_train, y_train, 
            validation_data=(X_valid, y_valid),
            batch_size=20000,
            epochs=3,
            callbacks=[tf.keras.callbacks.EarlyStopping(patience=2)])

Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 4min 54s, sys: 5.12 s, total: 4min 59s
Wall time: 1min 21s


<tensorflow.python.keras.callbacks.History at 0x7fa4b6361278>

In [0]:
preds_4 = model_4.predict(X_valid).reshape(-1)

In [52]:
print(classification_report(y_valid, (preds_4 > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97     61198
           1       0.62      0.31      0.42      4109

    accuracy                           0.94     65307
   macro avg       0.79      0.65      0.69     65307
weighted avg       0.93      0.94      0.94     65307



In [0]:
inputs = tf.keras.layers.Input(shape=(max_len,))

embeddings_1 = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=100, trainable=False,
                                      weights=[get_weights(word2id, ft_1)])(inputs)
embeddings_2 = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=100, trainable=False,
                                      weights=[get_weights(word2id, w2v)])(inputs)


avg_emb = tf.keras.layers.Concatenate()([embeddings_1, embeddings_2])
mean = tf.keras.layers.Lambda(lambda x: tf.keras.backend.sum(x,  axis=1))(avg_emb)
dense = tf.keras.layers.Dense(64, activation='relu')(mean)
dense = tf.keras.layers.Dense(64, activation='relu')(dense)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model_5 = tf.keras.Model(inputs=inputs, outputs=outputs)
model_5.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [65]:
%%time

model_5.fit(X_train, y_train, 
            validation_data=(X_valid, y_valid),
            batch_size=20000,
            epochs=15,
            callbacks=[tf.keras.callbacks.EarlyStopping(patience=2)])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
CPU times: user 37min 22s, sys: 40.8 s, total: 38min 3s
Wall time: 10min 23s


<tensorflow.python.keras.callbacks.History at 0x7fa4b0b19278>

In [0]:
preds_5 = model_5.predict(X_valid).reshape(-1)

In [67]:
print(classification_report(y_valid, (preds_5 > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     61198
           1       0.60      0.40      0.48      4109

    accuracy                           0.95     65307
   macro avg       0.78      0.69      0.73     65307
weighted avg       0.94      0.95      0.94     65307



**Эксперименты**

Бонусный балл можно получить за эксперименты с параметрами в Embbeding, Dense (поизучайте другие существующие параметры и попробуйте комбинировать несколько слоев с разным активациями) слоях, а также за эксперименты с улучшением нормализации.

In [0]:
# Регуляризация в эмбеддинговом слое
inputs = tf.keras.layers.Input(shape=(max_len,))
embeddings = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=100,
                                       activity_regularizer=tf.keras.regularizers.l2())(inputs)
transformed = tf.keras.layers.Lambda(lambda x: tf.keras.backend.sum(x,  axis=1))(embeddings)
dense = tf.keras.layers.Dense(64, activation='relu')(transformed)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
model_6 = tf.keras.Model(inputs=inputs, outputs=outputs)

model_6.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [40]:
model_6.fit(X_train, y_train, 
            validation_data=(X_valid, y_valid),
            batch_size=20000,
            epochs=10,
            callbacks=[tf.keras.callbacks.EarlyStopping(patience=2)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa4a1c18cf8>

In [0]:
preds_6 = model_6.predict(X_valid).reshape(-1)

In [43]:
print(classification_report(y_valid, (preds_6 > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98     61198
           1       0.71      0.41      0.52      4109

    accuracy                           0.95     65307
   macro avg       0.84      0.70      0.75     65307
weighted avg       0.95      0.95      0.95     65307



Качество немного повысилось по сравнению с аналогичной моделью `model_2`

In [0]:
# Регуляризация в полносвязных слоях, дополнительный слой, другие функции активации
inputs = tf.keras.layers.Input(shape=(max_len,))
embeddings = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=100,)(inputs)
transformed = tf.keras.layers.Lambda(lambda x: tf.keras.backend.sum(x,  axis=1))(embeddings)
dense = tf.keras.layers.Dense(64, activation='tanh',
                              activity_regularizer=tf.keras.regularizers.l2())(transformed)
dense = tf.keras.layers.Dense(64, activation='tanh',
                              activity_regularizer=tf.keras.regularizers.l2())(dense)
dense = tf.keras.layers.Dense(64, activation='relu',
                              activity_regularizer=tf.keras.regularizers.l2())(dense)
                              
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
model_7 = tf.keras.Model(inputs=inputs, outputs=outputs)

model_7.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [85]:
model_7.fit(X_train, y_train, 
            validation_data=(X_valid, y_valid),
            batch_size=40000,
            epochs=10,
            callbacks=[tf.keras.callbacks.EarlyStopping(patience=2)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa48ddbe048>

In [0]:
preds_7 = model_7.predict(X_valid).reshape(-1)

In [87]:
print(classification_report(y_valid, (preds_7 > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98     61198
           1       0.66      0.53      0.59      4109

    accuracy                           0.95     65307
   macro avg       0.81      0.76      0.78     65307
weighted avg       0.95      0.95      0.95     65307

