In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

In [None]:
# Read the labelled training set and the unlabelled test set from the Kaggle
# input mount of the Quora insincere-questions competition.
train_csv = "/kaggle/input/quora-insincere-questions-classification/train.csv"
test_csv = "/kaggle/input/quora-insincere-questions-classification/test.csv"
train_df, test_df = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Print (rows, columns) for each frame as a quick sanity check on the load.
for split_name, split_frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} shape: {split_frame.shape}")

In [None]:
# Rows that lack the question text (or, for the training set, the label) are
# dropped. A missing qid does not remove the row; it is replaced with the
# "_nan_" placeholder instead.
qid_fill = {"qid": "_nan_"}

train_df = (
    train_df
    .dropna(subset=["question_text", "target"])
    .fillna(value=qid_fill)
)
test_df = (
    test_df
    .dropna(subset=["question_text"])
    .fillna(value=qid_fill)
)

In [None]:
# Report the frame shapes again now that the NaN handling has run; a single
# print call emits the same three lines, one per argument.
print(
    "Shapes after NaN valus hadled",
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    sep="\n",
)

In [None]:
# Hold out 10% of the labelled rows for validation. The split is stratified on
# `target` so the training and validation partitions keep the same class ratio;
# `random_state` pins the shuffle so the split is reproducible.
# NOTE: this rebinds `train_df` to the 90% partition, so re-running this cell
# on its own splits the already-reduced frame again. Re-run from the load cell.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=2000,
    stratify=train_df["target"],
)

# Raw question strings for each partition; these feed the tokenizer.
training_sentences = list(train_df["question_text"].values)
val_sentences = val_df["question_text"].values
test_sentences = test_df["question_text"].values

# Hyper-parameters shared by the tokenizer, the padding step and the model.
embed_size = 300       # width of each embedding vector
max_features = 50000   # tokenizer `num_words` and embedding input dimension
maxlen = 100           # token ids kept per question after padding/truncation

In [None]:
def _encode_and_pad(texts):
    """Convert raw texts into a 2-D array of token ids of width ``maxlen``.

    Relies on the module-level ``tokenizer`` having been fitted before the
    call. Sequences are padded and truncated at the end ("post").
    """
    id_sequences = tokenizer.texts_to_sequences(texts)
    return tf.keras.preprocessing.sequence.pad_sequences(
        id_sequences,
        maxlen=maxlen,
        padding="post",
        truncating="post",
    )


# The vocabulary is learned from the training questions only; words outside it
# are mapped to the "<oov>" token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_features, oov_token="<oov>")
tokenizer.fit_on_texts(training_sentences)

X_train = _encode_and_pad(training_sentences)
X_val = _encode_and_pad(val_sentences)
X_test = _encode_and_pad(test_sentences)

In [None]:
# Label arrays taken from the same frames that produced X_train and X_val.
y_train, y_val = train_df["target"].values, val_df["target"].values

In [None]:
layers = tf.keras.layers

# Network: token ids -> embedding -> bidirectional GRU (per-step outputs)
# -> max over the sequence axis -> small dense head with dropout -> sigmoid.
inputs = layers.Input(shape=(maxlen,))

hidden_stack = [
    layers.Embedding(max_features, embed_size),
    layers.Bidirectional(layers.GRU(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(16, activation="relu"),
    layers.Dropout(0.1),
]

# Thread the tensor through the stack in order.
x = inputs
for layer in hidden_stack:
    x = layer(x)

outputs = layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

In [None]:
# Train for 15 epochs in batches of 4096, evaluating on the held-out
# validation arrays after every epoch; `hist` keeps the per-epoch metrics.
hist = model.fit(
    x=X_train,
    y=y_train,
    batch_size=4096,
    epochs=15,
    validation_data=(X_val, y_val),
)

# Score the test questions; the sigmoid output gives one value per row.
y_pred = model.predict(x=X_test, batch_size=1024)

In [None]:
# Sanity-check the dimensions of the prediction array before thresholding it.
print(np.shape(y_pred))

In [None]:
# Turn the predicted probabilities (first column of y_pred) into hard 0/1
# labels using a 0.5 cut-off.
# `np.int` was deprecated in NumPy 1.20 and removed in 1.24, where it raises
# AttributeError. It was only an alias of the builtin `int`, so `astype(int)`
# produces the same dtype on every NumPy version.
y_te = (y_pred[:, 0] > 0.5).astype(int)

# Pair each test qid with its predicted label and write the submission file
# without the DataFrame index column.
submit_df = pd.DataFrame({"qid": test_df["qid"], "prediction": y_te})
submit_df.to_csv("submission.csv", index=False)