In [None]:
"""
Build, fit and run all models. The BERT-based models are handled separately in the BERT_based_models notebook.
"""

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.pipeline import make_pipeline
from loader import loader_train, loader_test, train_validation_split
from transforms import MinEditDistance, Vectorizer, LemmaTransform, FuncTransform, Mixor, \
                        SynonymTransform, POSTransform, BaseTransform, ProcessingVocab, \
                        EmbeddingTransform, TokenTransform, EncoderTransform, BERTProcessing
from models import BowModel, CosineSimilarity, build_matrix, SiameseLSTM, BERTModel, MLPEstimator
from evaluation import evaluate_model, spearman_measure
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
import joblib
import tensorflow as tf

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Load data

In [2]:
# Labelled training data (features + targets) and the unlabelled test features.
dfX, y = loader_train()
dfX_test = loader_test()

# Vocabulary computed from the training features only.
vocab = ProcessingVocab(dfX).voc()

# Pretrained word-vector file handed to build_matrix together with the vocabulary.
# (The file name suggests 300-d GloVe wiki+gigaword vectors - TODO confirm.)
fname = "data/glove-wiki-gigaword-300.txt"
df_embeddings = build_matrix(fname=fname, vocab=vocab)

# Create models

In [4]:
# --- Bag-of-words baselines (simple tokenization) ---------------------------
# Same model, weighted either by raw counts or by TF-IDF.
bow_model_counts = BowModel(ponderation='count')
bow_model_tfidf = BowModel(ponderation='tfidf')

# --- Part-of-speech model ---------------------------------------------------
# Compare the POS tags only, vectorized with TF-IDF weights.
POSModel = make_pipeline(
    POSTransform(tags_only=True, join_labels=True),
    Vectorizer(ponderation='tfidf'),
    CosineSimilarity(),
)

# --- Synonym-based models ---------------------------------------------------
# Process the data with synonyms, then vectorize with counts / TF-IDF weights.
synonyms_model_counts = make_pipeline(
    SynonymTransform(),
    Vectorizer(ponderation='count'),
    CosineSimilarity(),
)
synonyms_model_tfidf = make_pipeline(
    SynonymTransform(),
    Vectorizer(ponderation='tfidf'),
    CosineSimilarity(),
)

# --- Lemma-based models -----------------------------------------------------
# Process the data with lemmas, then vectorize with counts / TF-IDF weights.
lemmas_model_counts = make_pipeline(
    LemmaTransform(),
    Vectorizer(ponderation='count'),
    CosineSimilarity(),
)
lemmas_model_tfidf = make_pipeline(
    LemmaTransform(),
    Vectorizer(ponderation='tfidf'),
    CosineSimilarity(),
)

# --- Inverse minimum-edit-distance ------------------------------------------
# Map a distance d >= 0 to the score 1 / (1 + sqrt(d)), which lies in (0, 1].
# The pipeline ends with a transformer, so it is used through .transform().
inverse_med_model = make_pipeline(
    MinEditDistance(),
    FuncTransform(lambda x: 1 / (1 + np.sqrt(x))),
)

# --- Dense embeddings -------------------------------------------------------
# Tokenize, turn tokens into dense vectors (GloVe), compare by cosine similarity.
# NOTE(review): `method=sum` passes the builtin function, whereas Mixor below
# receives the string 'sum' - confirm EmbeddingTransform expects a callable.
embeddings_model = make_pipeline(
    TokenTransform(),
    EmbeddingTransform(method=sum),
    CosineSimilarity(),
)

# --- Trainable models -------------------------------------------------------
# TF-IDF features -> Mixor -> truncated SVD (1000 components) -> perceptron.
model_MLP = make_pipeline(
    Vectorizer(ponderation='tfidf'),
    Mixor(method='sum'),
    TruncatedSVD(n_components=1000),
    MLPEstimator(),
)

# Tokens encoded against the vocabulary (l_seq=10), fed to a siamese LSTM
# initialised with the embedding matrix built in the data-loading cell.
model_LSTM = make_pipeline(
    TokenTransform(),
    EncoderTransform(vocab=vocab, l_seq=10),
    SiameseLSTM(df_embeddings),
)

# Fit models

In [None]:
# Create the training split used by the trainable models. dfX_train / y_train
# are defined here (from dfX, y loaded above) so this cell runs on a fresh
# kernel; the fixed random_state keeps the split reproducible.
# Assumes dfX and y are row-aligned and of equal length - TODO confirm.
# NOTE(review): the 80/20 ratio is a placeholder, and the project helper
# loader.train_validation_split is also imported - confirm which split the
# reported results were obtained with.
dfX_train, dfX_val, y_train, y_val = train_test_split(
    dfX, y, test_size=0.2, random_state=42
)

model_MLP.fit(dfX_train, y_train)   # Train the Multi-Layer Perceptron
model_LSTM.fit(dfX_train, y_train)  # Train the siamese LSTM model

# Make predictions

In [None]:
# Score the test set with every model; each result is kept in its own variable.
# NOTE(review): only model_MLP and model_LSTM are fitted in the visible cells;
# confirm that the other models' predict/transform do not need a prior fit.
# Bag-of-words baselines (counts / TF-IDF).
y_bow_counts = bow_model_counts.predict(dfX_test)
y_bow_tfidf = bow_model_tfidf.predict(dfX_test)
# POS-tag, synonym and lemma pipelines.
y_POS = POSModel.predict(dfX_test)
y_synonyms_counts = synonyms_model_counts.predict(dfX_test)
y_synonyms_tfidf = synonyms_model_tfidf.predict(dfX_test)
y_lemmas_counts = lemmas_model_counts.predict(dfX_test)
y_lemmas_tfidf = lemmas_model_tfidf.predict(dfX_test)
# This pipeline ends with a transformer (FuncTransform), hence .transform() instead of .predict().
y_inverse_med = inverse_med_model.transform(dfX_test)
# Dense-embedding cosine similarity.
y_embeddings = embeddings_model.predict(dfX_test)
# Trained models (fitted in the "Fit models" cell).
y_MLP_test = model_MLP.predict(dfX_test)
y_LSTM_test = model_LSTM.predict(dfX_test)

In [5]:
# Export this notebook to HTML via the shell; the relative path means the
# command must run from the directory that contains Models.ipynb.
!jupyter nbconvert --to html Models.ipynb

[NbConvertApp] Converting notebook Models.ipynb to html
[NbConvertApp] Writing 293068 bytes to Models.html
