# Disaster Tweets Classification

This kernel demonstrates data-wrangling techniques for tweets and a multi-input LSTM model built with the Keras Functional API. The techniques used here give a leaderboard score of about 0.80 (0.79987 to be precise).

I plan to update it as I figure out better ways to preprocess and model the data. If you are reading this, please share your feedback or questions in the comments.

* [Data Preprocessing](#Data-preprocessing)
* [LSTM Model](#LSTM-Model)

In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import statistics
import pickle
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

import spacy
import en_core_web_sm   #spacy model
nlp = en_core_web_sm.load(disable=['parser', 'ner'])

from tensorflow import keras
from tensorflow.keras import models, initializers, regularizers, Input
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Embedding, concatenate, LSTM, Bidirectional
import tensorflow as tf; tf.random.set_seed(420)

In [None]:
# Read the competition's train and test CSV files into DataFrames.
df_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
df_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
# Print the training frame's dimensions and column names, then preview its first rows.
print(df_train.shape, "\n")
print(df_train.columns, "\n")
df_train.head()

In [None]:
# Peek at the first ten distinct values of the 'keyword' and 'location' columns.
print(df_train['keyword'].unique()[:10])
print(df_train['location'].unique()[:10])

In [None]:
# Class balance: percentage of training rows for each 'target' value.
(df_train['target'].value_counts()*100)/df_train.shape[0]

57% not real disaster and 43% real disaster tweet

# Data preprocessing

In [None]:
# Strip links from a tweet.
# Input  - @bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
# Output - @bbcmtd Wholesale Markets ablaze   (the space before the link is kept)
def remove_http_links(string):
    """Return str(string) with every 'http...' run removed, then any bare 'http' left over."""
    cleaned = str(string)
    # First pass drops "http" followed by at least one non-space character;
    # second pass drops an "http" that stands alone.
    for pattern in (r'http\S+', r'http'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

In [None]:
#remove_http_links('RT @HuffPostComedy: We should build a wall that keeps Burning Man attendees from coming home http://t.co/xwVW1sft4I http://t.co/j7HUKhWmal')

In [None]:
# Strip mentions of other accounts from a tweet.
# Input  - @bbcmtd Wholesale Markets ablaze
# Output -  Wholesale Markets ablaze   (surrounding spaces are left in place)

def remove_mentions(string):
    """Return str(string) without any '@' followed by one or more non-space characters."""
    mention = re.compile(r'@\S+')
    return mention.sub('', str(string))

In [None]:
#remove_mentions('RT @HuffPostComedy: We should build a wall that keeps Burning Man attendees from coming home ')

In [None]:
def replace_html_symbols(string):
    """Decode the HTML entities &amp;, &gt; and &lt; into 'and', '>' and '<'."""
    # Applied in this order, one after another, on the running result.
    for entity, plain in (("&amp;", "and"), ("&gt;", ">"), ("&lt;", "<")):
        string = string.replace(entity, plain)
    return string

In [None]:
#replace_html_symbols("abcd &amp; and are &gt; <")

In [None]:
# Turn URL-encoded spaces into real spaces and ampersands into the word "and".
# Input  - blew%20up
# Output - blew up
def replace_with_space(string):
    """Collapse runs of '%20' to one space and runs of '&amp' / '&AMP' / '&' to ' and '."""
    substitutions = (
        (r'(%20)+', ' '),
        (r'(&amp)+|(&AMP)+|(&)+', ' and '),
    )
    result = string
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, str(result))
    return result

In [None]:
# Collect the hashtags of a tweet so they can be stored in a separate column.
# Input  - You just got GIF bombed #AfricansInSF #BeyondGPS
# Output - AfricansInSF , BeyondGPS
def capture_hashtags(string):
    """Return the tweet's hashtags joined by ' , ' with every '#' removed ('' if none)."""
    found = re.findall(r'#[\S]*', str(string))
    if not found:
        return ''
    joined = ' , '.join(found)
    return joined.replace('#', '')

In [None]:
# Map laughter tokens to the word "laugh": words starting like "haha..." or "lol...".
def convert_haha_lol(string):
    """Replace each word-initial 'haha...' or 'lol...' run with ' laugh '."""
    laughter = re.compile(r'\b(ha+h+a\w*)|\b(lo+l\w*)')
    return laughter.sub(' laugh ', string)

In [None]:
# convert contractions to their expanded form (https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions)
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [None]:
#contractions.get("I'll")
def contractions_to_expansions(string):
    """Expand English contractions in a space-separated string.

    Each token (split on single spaces) is looked up in the module-level
    `contractions` table. An exact match wins; otherwise the lower-cased and
    then the capitalised form of the token is tried, so that "i'm" or "DON'T"
    still find the keys "I'm" / "don't". This matters because
    perform_all_preprocessing lower-cases the tweet before calling this
    function, which made the capitalised "I'..." keys unreachable.

    # Arguments
        string: str, text whose tokens are separated by single spaces.

    # Returns
        str, the text with every recognised contraction replaced. When an
        all-lower-case token only matches through the capitalised key, the
        expansion is lower-cased as well to keep the text's casing consistent.
    """
    words = string.split(' ')
    for i, word in enumerate(words):
        expansion = contractions.get(word)
        if expansion is None:
            expansion = contractions.get(word.lower())
        if expansion is None:
            expansion = contractions.get(word.capitalize())
            if expansion is not None and word.islower():
                expansion = expansion.lower()
        if expansion is not None:
            words[i] = expansion
    return " ".join(words)

In [None]:
# Quick manual check: expand the contractions in a sample sentence.
contractions_to_expansions("y'all'd've asked where's food?") #Test

In [None]:
# Stop-word removal, based on the stop-word set shipped with the loaded spaCy pipeline.
all_stopwords = nlp.Defaults.stop_words

def remove_stopwords(sentence):
    """Drop every space-separated token of `sentence` that is in `all_stopwords`."""
    kept = []
    for token in sentence.split(" "):
        if token in all_stopwords:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
#remove_stopwords("hello how are you ? how's everything")

In [None]:
# Map text emoticons to a sentiment word ("laugh" or "sad").
emoticon_dict = {":)": "laugh", ":-)": "laugh", ";)": "laugh", ";-)": "laugh", ":P": "laugh", ":-P": "laugh", ":D": "laugh", 
                 ":-D": "laugh", ":(": "sad", ":-(": "sad", ":~(": "sad", ":-|": "sad", ">:-(": "sad", "8-)": "laugh", 
                 ":-O": "sad", "8-O": "sad", ">8-O": "sad"}
def replace_emoticons(string):
    """Replace every emoticon listed in `emoticon_dict` by its word, padded with spaces.

    Emoticons are tried longest first. Some keys contain shorter keys
    (">:-(" contains ":-(", ">8-O" contains "8-O"); replacing the short one
    first would leave a stray ">" behind and the long key could never match.

    # Arguments
        string: str, text to scan (matching is case-sensitive).

    # Returns
        str, the text with each emoticon replaced by " laugh " or " sad ".
    """
    # sorted() is stable, so keys of equal length keep their dict order.
    for key in sorted(emoticon_dict, key=len, reverse=True):
        if key in string:
            string = string.replace(key, " " + emoticon_dict[key] + " ")
    return string

In [None]:
#replace_emoticons("@LauradeHolanda I have the Forrest version from '83 that's bloody awful as well :))) xxx")

In [None]:
# Lemmatization with the loaded spaCy pipeline.
def lemma(string):
    """Return the space-joined lemmas of `string`.

    A token whose lemma is the literal "-PRON-" is kept in its original
    surface form instead (presumably spaCy's pronoun placeholder - confirm
    against the installed spaCy version).
    """
    pieces = [
        str(tok) if tok.lemma_ == "-PRON-" else tok.lemma_
        for tok in nlp(string)
    ]
    return " ".join(pieces)

In [None]:
# Spell out '@' and '&', then blank every character that is not a lower-case
# ASCII letter or a space (digits, punctuation, upper-case letters and emojis included).
def remove_special_chars(string):
    """Return str(string) with '@' -> 'at', '&' -> 'and', and all other non [a-z ] characters turned into spaces."""
    text = str(string).replace('@', 'at')
    text = text.replace('&', 'and')  # in case any ampersand is still left
    letters_only = re.sub(r'[^a-z ]', ' ', text)
    return letters_only

In [None]:
# Normalise whitespace: trim both ends and squeeze inner runs to one space.
def remove_extra_blanks(string):
    """Return str(string) stripped, with every run of whitespace replaced by a single space."""
    trimmed = str(string).strip()
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', trimmed)

In [None]:
def perform_all_preprocessing(string):
    """Run the full tweet-cleaning pipeline on one raw tweet.

    Order matters: each step may destroy characters that a later step would
    otherwise look for.

    # Arguments
        string: str, the raw tweet text.

    # Returns
        str, lower-cased text with links, mentions, stop words and all
        non-letter characters removed, contractions expanded, emoticons and
        laughter mapped to words, tokens lemmatised and whitespace normalised.
    """
    # NOTE(review): lower-casing first means the upper-case keys of
    # emoticon_dict (":P", ":D", ":-O", ...) can never match below - confirm
    # whether that is intended.
    string = string.lower()
    string = remove_http_links(string)
    string = remove_mentions(string)
    string = contractions_to_expansions(string)
    # HTML entities have to be decoded while their '&' is still present:
    # replace_with_space rewrites every '&' to ' and ', after which "&gt;" and
    # "&lt;" would survive as stray "gt" / "lt" tokens. "&amp;" is expanded
    # here with padding so that "a&amp;b" still becomes two separate words.
    string = string.replace("&amp;", " and ")
    string = replace_html_symbols(string)
    string = replace_with_space(string)
    string = replace_emoticons(string)
    string = convert_haha_lol(string)
    string = remove_stopwords(string)
    string = lemma(string)
    string = remove_special_chars(string)
    string = remove_extra_blanks(string)
    return string

In [None]:
# Replace missing values by empty strings. Assigning the result back (instead
# of calling fillna(inplace=True) on a column selection) guarantees that the
# frame itself is updated.
for col in ['text', 'keyword', 'location']:
    df_train[col] = df_train[col].fillna('')

# Create the hashtag column from the RAW tweet: perform_all_preprocessing
# blanks every character outside a-z (including '#'), so capturing hashtags
# afterwards would always yield an empty string.
df_train['hashtags'] = df_train['text'].apply(capture_hashtags)

df_train['text'] = df_train['text'].apply(perform_all_preprocessing)

# Light cleaning for the two categorical text columns.
for col in ['keyword', 'location']:
    df_train[col] = df_train[col].apply(str.lower)
    df_train[col] = df_train[col].apply(replace_with_space)
    df_train[col] = df_train[col].apply(remove_special_chars) #maybe not do this
    df_train[col] = df_train[col].apply(remove_extra_blanks)

In [None]:
# Preview the training frame after preprocessing.
df_train.head()

In [None]:
# Replace missing values by empty strings. Assigning the result back (instead
# of calling fillna(inplace=True) on a column selection) guarantees that the
# frame itself is updated.
for col in ['text', 'keyword', 'location']:
    df_test[col] = df_test[col].fillna('')

# Create the hashtag column from the RAW tweet: perform_all_preprocessing
# blanks every character outside a-z (including '#'), so capturing hashtags
# afterwards would always yield an empty string.
df_test['hashtags'] = df_test['text'].apply(capture_hashtags)

df_test['text'] = df_test['text'].apply(perform_all_preprocessing)

# Light cleaning for the two categorical text columns.
for col in ['keyword', 'location']:
    df_test[col] = df_test[col].apply(str.lower)
    df_test[col] = df_test[col].apply(replace_with_space)
    df_test[col] = df_test[col].apply(remove_special_chars) #maybe not do this
    df_test[col] = df_test[col].apply(remove_extra_blanks)

print(df_test.shape, "\n")
df_test.head()

In [None]:
# Apparently there are also duplicate texts with conflicting target labels;
# show the cleaned text of two example rows (ids 1409 and 1420).
df_train[df_train['id'].isin([1409, 1420])]['text'].values

In [None]:
# Compare the frame's shape before and after dropping rows with a repeated 'text'.
print(df_train.shape)
df_train.drop_duplicates(['text']).shape

In [None]:
# id1: every row id; id2: ids of the rows that survive drop_duplicates on 'text'.
id1 = df_train['id'].values.tolist()
id2 = df_train.drop_duplicates(subset='text')['id'].values.tolist()
# id_del: ids of the rows that drop_duplicates would discard.
id_del = list(set(id1) - set(id2))
# text_del: the distinct texts of those discarded rows, i.e. every text that occurs more than once.
text_del = df_train[df_train['id'].isin(id_del)]['text'].unique().tolist()

In [None]:
# Number of discarded duplicate rows vs. number of distinct duplicated texts.
print(len(id_del), len(text_del))

In [None]:
# Count how often each duplicated text occurs in the training frame ...
text_rep = Counter(df_train[df_train['text'].isin(text_del)]['text'].values.tolist())
# ... and turn the counts into (text, count) pairs, most frequent first.
text_rep = sorted(text_rep.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Collapse every duplicated tweet into a single row.
# The list names below (including `id`, which shadows the builtin) are read by
# the cell that builds df_train_2, so they must stay as they are.
id = []; keyword = []; location = []; tweets = []; hashtags = []; target = []
error_texts = []


def _majority_or_none(values):
    """Return the single most frequent value, or None if `values` is empty or the top count is tied."""
    ranked = Counter(values).most_common(2)
    if not ranked:
        return None
    if len(ranked) == 2 and ranked[0][1] == ranked[1][1]:
        return None
    return ranked[0][0]


for t in text_del:
    df_temp = df_train[df_train['text'] == t]

    # A tie between target labels means the duplicates contradict each other.
    # statistics.mode() only signalled that (by raising) before Python 3.8;
    # newer versions silently return the first mode, so the tie is detected
    # explicitly here and the text is set aside in error_texts.
    tgt = _majority_or_none(df_temp['target'].values)
    if tgt is None:
        error_texts.append(t)
        continue
    target.append(tgt)

    # keyword / location: majority value, falling back to the first row's value on a tie.
    for column, collected in (('keyword', keyword), ('location', location)):
        column_values = df_temp[column].values
        majority = _majority_or_none(column_values)
        collected.append(column_values[0] if majority is None else majority)

    # hashtags and id are simply taken from the first occurrence.
    hashtags.append(df_temp['hashtags'].values[0])
    id.append(df_temp['id'].values[0])
    tweets.append(t)

In [None]:
# One row per duplicated text, built from the lists filled in the previous cell.
df_train_2 = pd.DataFrame.from_dict({'id':id, 'keyword':keyword, 'location':location, 'text': tweets, 'target':target, 'hashtags':hashtags})

In [None]:
#train_df = df_train_1.append(df_train_2, ignore_index=True)
# Keep only the rows whose text is not duplicated anywhere in the training frame.
train_df = df_train[~df_train['text'].isin(text_del)]
train_df.head()

In [None]:
# Share of each target class among the non-duplicated rows.
train_df['target'].value_counts()/train_df.shape[0]

Adding single entry for repeated tweets

In [None]:
# Append the collapsed duplicate rows (df_train_2) and renumber the index.
train_df = pd.concat([train_df, df_train_2], ignore_index=True)

In [None]:
# Share of each target class after the collapsed duplicates were added back.
train_df['target'].value_counts()/train_df.shape[0]

**Let's ignore `error_texts` for now, will handle them later**

## Feature engineering/tokenization

In [None]:
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Limit on the number of features.
# NOTE: ngram_vectorize takes a parameter of the same name, which shadows this
# module-level value inside the function.
TOP_K_ngram = 5000

# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Passed to the vectorizer as min_df (minimum document frequency of a term).
MIN_DOCUMENT_FREQUENCY = 2

def ngram_vectorize(train_texts, train_labels, TOP_K_ngram, artifact_path='./vectorizer_selector.pkl'):
    """Fit a TF-IDF n-gram vectorizer plus a top-k feature selector and vectorize the texts.

    # Arguments
        train_texts: iterable of str, the training documents.
        train_labels: labels aligned with train_texts, used to score features.
        TOP_K_ngram: int, maximum number of features to keep.
        artifact_path: str, file the fitted vectorizer and selector are
            pickled to so that validation/test data can be transformed later.

    # Returns
        Dense float32 matrix (result of `.todense()`), one row per text and at
        most TOP_K_ngram columns.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=NGRAM_RANGE,  # Use 1-grams + 2-grams.
        # TF-IDF weights are fractional. The previous 'int32' is not a dtype
        # the vectorizer supports; scikit-learn warned and used float64
        # instead, so float64 is now requested explicitly.
        dtype=np.float64,
        strip_accents='unicode',
        decode_error='replace',
        analyzer=TOKEN_MODE,  # Split text into word tokens.
        min_df=MIN_DOCUMENT_FREQUENCY,
    )

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Select top 'k' of the vectorized features (never more than exist).
    selector = SelectKBest(f_classif, k=min(TOP_K_ngram, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')

    # Persist the fitted transformers for later use on unseen data.
    with open(artifact_path, 'wb') as vect_sel:
        vect_select_pkl = {'vectorizer': vectorizer, 'selector': selector}
        pickle.dump(vect_select_pkl, vect_sel, pickle.HIGHEST_PROTOCOL)

    return x_train.todense()

# LSTM Model

In [None]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip
#!ls

In [None]:
# Vocabulary size limit handed to the Keras tokenizer (top 5K words).
TOP_K = 5000

# Upper bound for the padded sequence length; longer sequences are truncated.
MAX_SEQUENCE_LENGTH = 30

def sequence_vectorize(train_texts):
    """Fit a tokenizer on the texts and turn them into padded index sequences.

    The fitted tokenizer and the chosen length are pickled to
    './tokenizer_max_length.pkl' so other data can be encoded the same way.

    # Arguments
        train_texts: iterable of str, the training documents.

    # Returns
        (x_train, word_index): the padded sequences and the tokenizer's
        word -> index mapping.
    """
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)
    sequences = tokenizer.texts_to_sequences(train_texts)

    # Pad to the longest sequence, capped at MAX_SEQUENCE_LENGTH.
    longest = max(len(seq) for seq in sequences)
    max_length = min(longest, MAX_SEQUENCE_LENGTH)
    x_train = sequence.pad_sequences(sequences, maxlen=max_length)

    payload = {'tokenizer': tokenizer, 'max_length': max_length}
    with open('./tokenizer_max_length.pkl', 'wb') as handle:
        pickle.dump(payload, handle, pickle.HIGHEST_PROTOCOL)

    return x_train, tokenizer.word_index

In [None]:
def _get_embedding_matrix(word_index, embedding_dim, fname='../input/glove6b50dtxt/glove.6B.50d.txt'):
    """Builds the embedding matrix for our vocabulary from a pre-trained embedding file.

    # Arguments
        word_index: dict, word to index map that was generated from the data.
        embedding_dim: int, dimension of the embedding vectors.
        fname: str, path of the embedding text file; every line holds a word
            followed by its vector values. Defaults to the 50d GloVe file.

    # Returns
        numpy array of shape (min(len(word_index) + 1, TOP_K), embedding_dim);
        row i holds the vector of the word with index i. Rows of words that
        are missing from the embedding file stay all-zeros.
    """
    num_words = min(len(word_index) + 1, TOP_K)
    embedding_matrix = np.zeros((num_words, embedding_dim))

    # Explicit encoding: the platform default may not be able to decode the
    # non-ASCII tokens in the file.
    with open(fname, encoding='utf-8') as f:
        for line in f:  # Every line contains word followed by the vector value
            values = line.split()
            if not values:
                continue
            # Only words from our own vocabulary are kept, so the complete
            # pre-trained vocabulary never has to be held in memory.
            i = word_index.get(values[0])
            if i is None or i >= num_words:
                continue
            embedding_matrix[i] = np.asarray(values[1:], dtype='float32')
    return embedding_matrix

In [None]:
def lstm_model(embedding_dim, dropout_rate, input_shape, keyw_shape, num_classes,
                 num_features, use_pretrained_embedding=False, is_embedding_trainable=False, embedding_matrix=None):
    """Builds the (uncompiled) two-input model: BiLSTM over the text plus a dense keyword branch.

    # Arguments
        embedding_dim: int, dimension of the embedding vectors.
        dropout_rate: float, rate used by every Dropout layer.
        input_shape: tuple, shape of one text sample; input_shape[0] is used
            as the sequence length.
        keyw_shape: tuple, shape of one keyword sample; keyw_shape[0] is used
            as the number of keyword features.
        num_classes: int, currently unused (the model always ends in a single
            sigmoid unit).
        num_features: int, input dimension of the embedding layer.
        use_pretrained_embedding: bool, initialise the embedding layer from
            embedding_matrix.
        is_embedding_trainable: bool, whether pre-trained embedding weights
            may be updated during training.
        embedding_matrix: array of pre-trained weights, required when
            use_pretrained_embedding is True.

    # Returns
        keras.Model taking [text_sequences, keyword_features] and returning
        one sigmoid probability per sample.

    # Raises
        ValueError: use_pretrained_embedding is True but no matrix was given.
    """
    if use_pretrained_embedding and embedding_matrix is None:
        raise ValueError('embedding_matrix is required when use_pretrained_embedding is True')

    # `shape` has to be a tuple: "(n)" without the trailing comma is just the int n.
    main_input = Input(shape=(input_shape[0],))
    keyword_input = Input(shape=(keyw_shape[0],))

    if use_pretrained_embedding:
        model = Embedding(input_dim=num_features, output_dim=embedding_dim, input_length=input_shape[0],
                            weights=[embedding_matrix], trainable=is_embedding_trainable)(main_input)
    else:
        # output_dim is a required argument; it was missing in this branch.
        model = Embedding(input_dim=num_features, output_dim=embedding_dim,
                            input_length=input_shape[0])(main_input)

    # Text branch: two stacked bidirectional LSTMs, the second one returning
    # only its final output.
    model = Bidirectional(LSTM(32, return_sequences=True))(model)
    model = Bidirectional(LSTM(32, return_sequences=False))(model)

    model = Dense(32, activation='relu')(model)
    keyword = Dense(32, activation='relu')(keyword_input)

    # Merge both branches and classify.
    model_ = concatenate([model, keyword], axis=1)
    # NOTE(review): two Dropout layers in a row - kept to preserve the
    # trained behaviour; confirm whether the duplicate is intentional.
    model_ = Dropout(rate=dropout_rate)(model_)
    model_ = Dropout(rate=dropout_rate)(model_)
    model_ = Dense(32, activation='relu')(model_)
    model_ = Dropout(rate=dropout_rate)(model_)

    model_pred = Dense(1, activation='sigmoid')(model_)
    model_1 = keras.Model(inputs=[main_input, keyword_input], outputs=[model_pred])

    return model_1

In [None]:
# Stratified 70/30 train/validation split over the 'text' and 'keyword' columns.
X_train, X_val, y_train, y_val = train_test_split(train_df[['text', 'keyword']], train_df['target'], test_size=0.3
                                                  , stratify=train_df['target'], random_state=42)

# Separate the two inputs: column 0 is 'text', column 1 is 'keyword'.
X_train_keyw = X_train.iloc[:,1]
X_train = X_train.iloc[:,0]
X_val_keyw = X_val.iloc[:,1]
X_val = X_val.iloc[:,0]

In [None]:
# Sizes of the training and validation text inputs.
X_train.shape, X_val.shape

In [None]:
# Fit the tokenizer on the training text and the TF-IDF vectorizer/selector
# (top 200 features) on the training keywords; both functions pickle what they fitted.
X_train, word_index = sequence_vectorize(X_train)
X_train_keyw = ngram_vectorize(X_train_keyw, y_train, 200)

# Reload the fitted tokenizer and padding length written by sequence_vectorize.
with open('./tokenizer_max_length.pkl', 'rb') as test_transformation:
  test_transformation_dict = pickle.load(test_transformation)
  tokenizer = test_transformation_dict['tokenizer']
  max_length = test_transformation_dict['max_length']

#tokenizer.texts_to_sequences('fire burn')
# Encode and pad the validation text exactly like the training text.
X_val = tokenizer.texts_to_sequences(X_val)
X_val = sequence.pad_sequences(X_val, maxlen=max_length)

# Reload the fitted vectorizer and selector written by ngram_vectorize.
with open('./vectorizer_selector.pkl', 'rb') as test_transformation:
  test_transformation_dict = pickle.load(test_transformation)
  _vectorizer_ = test_transformation_dict['vectorizer']
  _selector_ = test_transformation_dict['selector']

# Transform the validation keywords with the training-time transformers.
X_val_keyw = _vectorizer_.transform(X_val_keyw)
X_val_keyw = _selector_.transform(X_val_keyw)
X_val_keyw = X_val_keyw.todense()

In [None]:
# Shapes of the two vectorized training inputs (text sequences, keyword features).
X_train.shape, X_train_keyw.shape

In [None]:
# Build one model instance to inspect its architecture.
embedding_matrix = _get_embedding_matrix(word_index, 50)

# The embedding layer's input dimension has to equal the number of rows of the
# pre-trained matrix, which is min(len(word_index) + 1, TOP_K) and therefore
# smaller than TOP_K for a small vocabulary.
model = lstm_model(embedding_dim=50, dropout_rate=0.4,
                     input_shape=X_train.shape[1:], keyw_shape=X_train_keyw.shape[1:],
                     num_classes=2, num_features=embedding_matrix.shape[0], use_pretrained_embedding=True,
                     is_embedding_trainable=False, embedding_matrix=embedding_matrix)

model.summary()

In [None]:
#keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True, dpi=80)

In [None]:
# Functional API Model

def train_fine_tuned_functional_api_model(data, learning_rate=7e-5, epochs=200, batch_size=32, dropout_rate=0.5
                                          , embedding_dim=50):
    """Trains the two-input LSTM model in two phases and saves the result.

    Phase 1 trains the network with the pre-trained embedding frozen; its
    weights are written to disk. Phase 2 builds the same network with a
    trainable embedding, loads those weights and continues training.

    # Arguments
        data: tuple of the form
            ((x_train, x_train_key, train_labels),
             (x_val, x_val_key, val_labels),
             word_index).
        learning_rate: float, learning rate of the Adam optimizer.
        epochs: int, maximum number of epochs per phase.
        batch_size: int, number of samples per batch.
        dropout_rate: float, rate used by the model's Dropout layers.
        embedding_dim: int, dimension of the embedding vectors.

    # Returns
        dict, the Keras history (metrics per epoch) of the second phase.
    """
    # Get the data.
    (x_train, x_train_key, train_labels), (x_val, x_val_key, val_labels), word_index = data

    # Number of features will be the embedding input dimension. Add 1 for the
    # reserved index 0.
    num_features = min(len(word_index) + 1, TOP_K)

    embedding_matrix = _get_embedding_matrix(word_index, embedding_dim)

    weights_path = './fine_tuned_lstm_model_with_pre_trained_embedding.h5'

    def build_and_compile(is_embedding_trainable):
        """Create a fresh model, compiled with its own optimizer instance."""
        new_model = lstm_model(embedding_dim=embedding_dim,
                               dropout_rate=dropout_rate,
                               input_shape=x_train.shape[1:],
                               # Taken from the data passed in, not from the
                               # notebook-global X_train_keyw.
                               keyw_shape=x_train_key.shape[1:],
                               num_classes=2,
                               num_features=num_features,
                               use_pretrained_embedding=True,
                               is_embedding_trainable=is_embedding_trainable,
                               embedding_matrix=embedding_matrix)
        # The model ends in a sigmoid layer, i.e. it outputs probabilities,
        # so the loss must not treat the outputs as logits.
        loss = BinaryCrossentropy(from_logits=False)
        # A new optimizer per model: no optimizer state is shared between
        # the two phases. `lr` is a deprecated alias of `learning_rate`.
        optimizer = Adam(learning_rate=learning_rate)
        new_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
        return new_model

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [EarlyStopping(monitor='val_loss', patience=2)]

    # Phase 1: embedding layer weights frozen.
    model = build_and_compile(is_embedding_trainable=False)
    model.fit([x_train, x_train_key],
              train_labels,
              epochs=epochs,
              callbacks=callbacks,
              validation_data=([x_val, x_val_key], val_labels),
              verbose=2,  # Logs once per epoch.
              batch_size=batch_size)

    # Save the weights of the first phase.
    model.save_weights(weights_path)

    # Phase 2: same architecture with the embedding layer unfrozen, started
    # from the weights saved above.
    model = build_and_compile(is_embedding_trainable=True)
    model.load_weights(weights_path)

    history = model.fit([x_train, x_train_key],
                        train_labels,
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=([x_val, x_val_key], val_labels),
                        verbose=2,  # Logs once per epoch.
                        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(acc=history['val_accuracy'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('./disaster_tweets_lstm_fine_tuned_model')
    return history

In [None]:
# Bundle the inputs in the layout train_fine_tuned_functional_api_model unpacks:
# (train triple), (validation triple), word_index.
data = (X_train, X_train_keyw, y_train), (X_val, X_val_keyw, y_val), word_index

In [None]:
# Run both training phases with the default hyper-parameters; keeps the returned history dict.
history = train_fine_tuned_functional_api_model(data)

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    """Plot a training metric and its validation counterpart against the epoch number.

    `history` maps metric names to per-epoch values; the validation series is
    looked up under 'val_' + metric.
    """
    val_metric = 'val_'+metric
    plt.plot(history[metric])
    plt.plot(history[val_metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])

In [None]:
# Two plots side by side: accuracy on the left, loss on the right.
plt.figure(figsize=(16,8))
plt.subplot(1,2,1)
plot_graphs(history, 'accuracy')
plt.ylim(None,1)  # cap the accuracy axis at 1
plt.subplot(1,2,2)
plot_graphs(history, 'loss')
plt.ylim(0,None)  # start the loss axis at 0

In [None]:
# Load the fine-tuned model that the training function saved to disk.
model = keras.models.load_model('./disaster_tweets_lstm_fine_tuned_model')

Predict on provided test set for Kaggle leaderboard

In [None]:
# The two model inputs for the test set: keywords and cleaned tweet text.
X_test_keyw = df_test['keyword']
X_test = df_test['text']

In [None]:
# Reload the tokenizer and padding length fitted on the training text.
with open('./tokenizer_max_length.pkl', 'rb') as test_transformation:
  test_transformation_dict = pickle.load(test_transformation)
  tokenizer = test_transformation_dict['tokenizer']
  max_length = test_transformation_dict['max_length']

# Reload the TF-IDF vectorizer and feature selector fitted on the training keywords.
with open('./vectorizer_selector.pkl', 'rb') as test_transformation:
  test_transformation_dict = pickle.load(test_transformation)
  _vectorizer_ = test_transformation_dict['vectorizer']
  _selector_ = test_transformation_dict['selector']

# Encode and pad the test text with the training-time tokenizer.
X_test = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(X_test, maxlen=max_length)

# Transform the test keywords with the training-time vectorizer and selector.
X_test_keyw = _vectorizer_.transform(X_test_keyw)
X_test_keyw = _selector_.transform(X_test_keyw)

X_test_keyw = X_test_keyw.todense()

In [None]:
# Predict on the test set and threshold the model output at 0.5 to get 0/1 labels.
y_test = (model.predict([X_test,X_test_keyw]) > 0.5).astype("int32")

In [None]:
#output = pd.DataFrame.from_dict({'id':df_test['id'].values.tolist(), 'target':y_test.ravel()})
#output.to_csv('./LSTM_emb_tuned_v4.csv', index=False)