In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import re
import string
import os
os.environ["KMP_SETTINGS"] = "false"

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras

# Both competition CSVs live in the same Kaggle input directory.
DATA_DIR = '../input/quora-insincere-questions-classification'

raw_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# Last expression of the cell: render the training frame.
raw_df

In [None]:
batch_size = 512

# Keep only the text column and its label.
raw_df = raw_df[['question_text', 'target']]#.sample(frac=1).reset_index(drop=True)

# Pair every question string with its int64 target.
dataset = tf.data.Dataset.from_tensor_slices((raw_df['question_text'], tf.convert_to_tensor(raw_df['target'], dtype=tf.int64)))

# The train/validation split is carved out of this dataset with take()/skip().
# By default shuffle() draws a new order on every iteration, so the two halves
# would overlap and change from epoch to epoch (validation rows leaking into
# training). A fixed seed with reshuffle_each_iteration=False keeps the order
# identical on every pass, so take()/skip() form a true partition.
# Trade-off: training batches are no longer reshuffled between epochs.
dataset = dataset.shuffle(10000, seed=42, reshuffle_each_iteration=False).batch(batch_size)

In [None]:
# Split the batched dataset: the first fifth of the batches becomes the
# validation set, everything after it is used for training.
ds_batches = tf.data.experimental.cardinality(dataset)
val_batches = ds_batches // 5

val_ds = dataset.take(val_batches)
train_ds = dataset.skip(val_batches)

# Report how many batches ended up on each side of the split.
train_batch_count = tf.data.experimental.cardinality(train_ds)
val_batch_count = tf.data.experimental.cardinality(val_ds)
print('Number of train batches: %d' % train_batch_count)
print('Number of validation batches: %d' % val_batch_count)

In [None]:
def custom_standardization(input_data):
    """Lower-case and clean question text using TensorFlow string ops.

    The input is lower-cased, then a fixed sequence of regex substitutions is
    applied in order: HTML-like tags and ``http`` links are removed, runs of
    non-ASCII characters are removed, and punctuation, newlines, non-word
    characters and whitespace runs are each replaced by a single space.

    Args:
        input_data: string tensor (or value accepted by ``tf.strings.lower``).

    Returns:
        The cleaned string tensor.
    """
    # (pattern, replacement) pairs; the order matters because later patterns
    # operate on the output of earlier ones. Patterns containing regex escapes
    # are raw strings so Python does not treat `\S`, `\W`, `\s` as (invalid)
    # string escape sequences.
    substitutions = (
        ('<.*?>', ''),  # html
        (r'http\S+', ''),  # links
        ('[^\x00-\x7F]+', ''),  # non-ascii
        ('[%s]' % re.escape(string.punctuation), ' '),  # punctuation
        ('\n', ' '),  # newlines
        (r'\W+', ' '),  # non-word characters
        (r'\s+', ' '),  # collapse whitespace runs
    )

    s = tf.strings.lower(input_data)
    for pattern, replacement in substitutions:
        s = tf.strings.regex_replace(s, pattern, replacement)
    return s

# Preview: standardize a single batch and print its first cleaned question.
preview = dataset.take(1).map(lambda text, label: custom_standardization(text))
for cleaned_batch in preview:
    print(cleaned_batch[0].numpy().decode('ascii'))

In [None]:
max_features = 30000#len(vocab)
sequence_length = 64

# Maps raw strings to fixed-length integer sequences: text is cleaned with
# custom_standardization, the vocabulary is capped at max_features tokens and
# every output is padded/truncated to sequence_length ids.
vectorize_layer = keras.layers.experimental.preprocessing.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# Drop the labels and learn the vocabulary from ALL training batches.
# Adapting on `next(iter(text))` would only see a single batch of
# `batch_size` questions, leaving the vocabulary far below max_features and
# mapping most words to the out-of-vocabulary id.
text = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text)

In [None]:
import zipfile

# Archive holding the pre-trained embedding files shipped with the competition.
local_zip = "/kaggle/input/quora-insincere-questions-classification/embeddings.zip"
zip_ref = zipfile.ZipFile(file=local_zip, mode='r')

# Last expression of the cell: list the archive members.
zip_ref.namelist()

In [None]:
# Unpack every member of the embeddings archive into the current working
# directory (extractall() is called without a target path). The relative
# './...' embedding paths used later in the notebook point at this location.
zip_ref.extractall()

In [None]:
_glove = './glove.840B.300d/glove.840B.300d.txt'
_paragram =  './paragram_300_sl999/paragram_300_sl999.txt'
_wiki_news = './wiki-news-300d-1M/wiki-news-300d-1M.vec'
#_google_news = './GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

embeddings = [{'name': 'glove', 'path': _glove},
              {'name': 'paragram', 'path': _paragram},
              {'name': 'fasttext', 'path': _wiki_news}]

def load_embed(file):
    """Load a text embedding file into a ``{word: vector}`` dictionary.

    Every line is split on single spaces; the first field is the word and the
    remaining fields are converted to a float32 NumPy array.

    Args:
        file: path to the embedding file. When its final path component is
            'wiki-news-300d-1M.vec', lines of 100 characters or fewer are
            skipped (presumably the header line - confirm against the file)
            and the platform default encoding is used; any other file is read
            with the 'latin' encoding.

    Returns:
        dict mapping each word to its float32 vector.
    """
    def get_coefs(word, *arr):
        # First field is the token, the rest are its coefficients.
        return word, np.asarray(arr, dtype='float32')

    # The files are opened with context managers so the handles are closed
    # as soon as the dictionary is built instead of leaking until GC.
    if file.split('/')[-1] == 'wiki-news-300d-1M.vec':
        with open(file) as handle:
            embeddings_index = dict(get_coefs(*o.split(" ")) for o in handle if len(o) > 100)
    else:
        with open(file, encoding='latin') as handle:
            embeddings_index = dict(get_coefs(*o.split(" ")) for o in handle)

    return embeddings_index

In [None]:
def create_emb_matrix(vocab_size, embed_size):
    """Return a zero-filled float32 matrix of shape (vocab_size, embed_size)."""
    shape = (vocab_size, embed_size)
    return np.zeros(shape, dtype=np.float32)
        
def fill_emb_matrix(word_idx, emb_matrix, emb_index):
    """Copy known word vectors into their rows of ``emb_matrix``.

    Args:
        word_idx: iterable of (word, row_index) pairs.
        emb_matrix: matrix written in place, one row per word index.
        emb_index: mapping from word to vector, queried with ``.get``.

    Returns:
        The same ``emb_matrix`` object; rows of words missing from
        ``emb_index`` are left untouched.
    """
    for token, row in word_idx:
        vector = emb_index.get(token)
        if vector is None:
            # Word has no pre-trained vector: keep the existing row.
            continue
        emb_matrix[row] = vector
    return emb_matrix

def add_lower(embedding, vocab):
    """Add lower-cased aliases to ``embedding`` for words in ``vocab``.

    For each word that is present in ``embedding`` while its lower-cased form
    is not, the lower-cased key is added with the same vector. ``embedding``
    is modified in place and the number of added keys is printed.
    """
    count = 0
    for word in vocab:
        if word not in embedding:
            continue
        lowered = word.lower()
        if lowered in embedding:
            continue
        embedding[lowered] = embedding[word]
        count += 1
    print(f"Added {count} words to embedding")

In [None]:
# Show the size of the vocabulary held by the vectorization layer.
vectorize_layer.vocabulary_size()

In [None]:
def vocab_to_integer(vocab):
    """Map each word in ``vocab`` to its zero-based position.

    If a word occurs more than once, its last position is kept.
    """
    lookup = {}
    for position, token in enumerate(vocab):
        lookup[token] = position
    return lookup

# Vocabulary learned by the vectorization layer and its word -> row lookup.
vocab = vectorize_layer.get_vocabulary()
vocab_index = vocab_to_integer(vocab)
vocab_len = len(vocab) #+ 1 

# Build one embedding matrix per pre-trained source and concatenate them
# column-wise, so each vocabulary row carries the vectors of all sources.
conc_embedding = None
embedding_dim = 0

for embedding in embeddings:
    emb_name = embedding['name']
    emb_path = embedding['path']
    print("Running procedure on {}".format(emb_name))
    
    print("Loading {}".format(emb_name))
    emb_index = load_embed(emb_path)

    # Width contributed by this source to the concatenated embedding.
    emb_size = 300
    embedding_dim += emb_size
    
    # Rows of words missing from the source stay all-zero.
    emb_matrix = create_emb_matrix(vocab_len, emb_size)
    print(emb_matrix.size)
    print(emb_matrix.shape)
    emb_matrix = fill_emb_matrix(vocab_index.items(), emb_matrix, emb_index)

    # The full word -> vector dictionary is no longer needed once the matrix
    # is filled. Dropping the reference here lets it be freed before the next
    # source is loaded, instead of keeping two dictionaries alive at once.
    del emb_index
    
    if conc_embedding is not None:
        conc_embedding = np.concatenate((conc_embedding, emb_matrix), axis=1)
        print("Concatenated! New shape: {}".format(conc_embedding.shape))
    else:
        conc_embedding = emb_matrix
    print("=================================================")

In [None]:
# Layer stack, listed in the order the data flows through it: raw strings are
# vectorized, looked up in the frozen pre-trained embedding, passed through
# two bidirectional recurrent layers, max-pooled over the sequence axis and
# reduced to a single sigmoid output.
layer_stack = [
    keras.layers.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
    keras.layers.Embedding(
        vocab_len,
        embedding_dim,
        input_length=sequence_length,
        weights=[conc_embedding],
        trainable=False,
    ),
    keras.layers.SpatialDropout1D(0.3),
    keras.layers.Bidirectional(keras.layers.LSTM(156, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.GRU(156, return_sequences=True)),
    keras.layers.GlobalMaxPooling1D(),
    #keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid'),
]

model = tf.keras.Sequential(layer_stack)

model.summary()

In [None]:
# Stop training after one epoch without improvement and roll the model back
# to the best weights seen so far.
early_stop = keras.callbacks.EarlyStopping(restore_best_weights=True, patience=1)

# Binary classification: sigmoid output trained with binary cross-entropy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.fit(
    train_ds,
    epochs=5,
    validation_data=val_ds,
    callbacks=[early_stop],
)

In [None]:
# Score the raw test questions; the model vectorizes the strings itself, so
# the text column is passed in directly.
test_questions = test_df['question_text']
y_pred = model.predict(x=test_questions)

In [None]:
# Display the raw model outputs for the test questions.
y_pred

In [None]:
# The model ends in Dense(1), so y_pred is a column vector with one score per
# question. Flatten it explicitly before assigning it as a DataFrame column
# instead of relying on pandas to accept a 2-D array for a single column.
# Scores of 0.5 or more are labelled 1, all others 0.
test_df['prediction'] = np.where(np.ravel(y_pred) >= 0.5, 1, 0)

# Submission frame: question id plus predicted label.
predictions = test_df[['qid', 'prediction']]
predictions.head()

In [None]:
# Write the predictions to submission.csv without the DataFrame index column.
predictions.to_csv('submission.csv', index=False)