# Part 1: Exploratory Data Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and the
# legacy "seaborn" alias was removed in later releases, so fall back to the new
# name whenever the old one is not available.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [None]:
# Read the training sample CSV and preview its first rows.
df = pd.read_csv(
    filepath_or_buffer="../input/predict-closed-questions-on-stack-overflow/train-sample.csv"
)
df.head()

In [None]:
# Report the dimensions of the loaded frame.
print("Total rows:", len(df))
print("Total columns:", len(df.columns))

The **PostId** and **OwnerUserId** columns are ID columns and do not have any predictive power so they can be dropped.

In [None]:
# Identifier columns are removed before any modelling.
df = df.drop(columns=["PostId", "OwnerUserId"])

Analysing the number of unique values in various columns

In [None]:
# Count the distinct values in each column.
df.nunique()

Analysing the percentage of missing values in various columns:

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows.
100 * df.isna().sum() / len(df)

There are many missing values in the columns **Tag2** to **Tag5** and **PostClosedDate**, but only a small fraction of values (about 160) are missing in the **Tag1** column. This suggests that almost every question has at least one tag associated with it. Moreover, some questions do not have any **BodyMarkdown**, so the content of such questions must be contained in the **Title**.

In [None]:
# Series.mode() returns a Series (indexed 0, 1, ...), not a scalar. Assigning
# that Series through .loc aligns on the index, so the missing rows would stay
# NaN. Take the first (most frequent) value and fill with that scalar instead.
most_common_tag = df["Tag1"].mode().iloc[0]
df["Tag1"] = df["Tag1"].fillna(most_common_tag)

# Columns that are mostly empty are dropped rather than imputed.
drop_cols = ["Tag2", "Tag3", "Tag4", "Tag5", "PostClosedDate"]
df.drop(drop_cols, axis=1, inplace=True)

Renaming some columns with very long names

In [None]:
# Short aliases for the long column names; the rest of the notebook uses these.
name_map = dict(
    PostCreationDate="PostDate",
    OwnerCreationDate="OwnrDate",
    ReputationAtPostCreation="OwnrRep",
    OwnerUndeletedAnswerCountAtPostTime="AnsCount",
)

df = df.rename(columns=name_map)

# Part 2: Feature Engineering

We can extract several date and time features from the columns **PostDate** and **OwnrDate** (originally **PostCreationDate** and **OwnerCreationDate**) and binary encode them. Further, we can create a new feature signifying how old the owner account is by taking the difference between these two datetime columns in seconds.

In [None]:
import category_encoders as ce


def datetime_features(df):
    """Derive binary-encoded calendar features and the account age.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``PostDate`` and ``OwnrDate`` columns in a form that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``PostDate``/``OwnrDate`` that additionally holds
        ``AccAge`` (``PostDate - OwnrDate`` expressed in seconds) and the
        binary-encoded day-of-week, month, year, hour and minute of both
        timestamps. The frame passed in is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect
    # (columns used to be added to and deleted from the argument itself).
    df = df.copy()

    df["PostDate"] = pd.to_datetime(df["PostDate"])
    df["OwnrDate"] = pd.to_datetime(df["OwnrDate"])

    # (column suffix, ``.dt`` attribute) pairs extracted from each timestamp.
    parts = (
        ("Day", "dayofweek"),
        ("Month", "month"),
        ("Year", "year"),
        ("Hour", "hour"),
        ("Min", "minute"),
    )

    cols = []
    for prefix in ("Post", "Ownr"):
        stamps = df[prefix + "Date"].dt
        for suffix, attr in parts:
            df[prefix + suffix] = getattr(stamps, attr)
            cols.append(prefix + suffix)

    # Age of the owner account at posting time, in seconds.
    df["AccAge"] = (df["PostDate"] - df["OwnrDate"]) / np.timedelta64(1, "s")

    # The raw timestamps are no longer needed once the features are derived.
    df = df.drop(columns=["PostDate", "OwnrDate"])

    encoder = ce.binary.BinaryEncoder(cols=cols)
    sub_cols = encoder.fit_transform(df[cols])

    # Replace the integer calendar columns with their binary-encoded bits.
    return pd.concat([df.drop(columns=cols), sub_cols], axis=1)


df = datetime_features(df)

Concatenating **Title** and **BodyMarkdown** to form a new column **QuestionText**.

In [None]:
# Join title and body with a space so the last title word and the first body
# word stay separate tokens. Missing parts are treated as empty strings so a
# question without a body keeps its title instead of becoming NaN.
df["BodyMarkdown"] = (
    df["Title"].fillna("") + " " + df["BodyMarkdown"].fillna("")
).str.strip()
df.drop(columns=["Title"], inplace=True)
df.rename(columns={"BodyMarkdown": "QuestionText"}, inplace=True)

Since **Tag1** has many classes, one-hot encoding would not be a good idea, so we binary encode it instead. We also label encode the target column, since its values are strings.

In [None]:
import category_encoders as ce


# Binary-encode Tag1 and swap the original column for its encoded bits.
encoder = ce.binary.BinaryEncoder()
tag1_bits = encoder.fit_transform(df["Tag1"])
df = pd.concat([df.drop(columns=["Tag1"]), tag1_bits], axis=1)

# Integer codes for the target labels, numbered in the order listed here.
target_map = {
    label: code
    for code, label in enumerate(
        [
            "not a real question",
            "not constructive",
            "off topic",
            "open",
            "too localized",
        ]
    )
}

df["OpenStatus"] = df["OpenStatus"].map(target_map)

In [None]:
# Preview the first rows of the frame after feature engineering.
df.head()

# Part 3: Model building

Splitting the dataset into training and validation sets and then scaling with sklearn RobustScaler.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler


def _split_inputs(frame):
    """Return the (text, meta, target) parts of ``frame``."""
    text = frame[["QuestionText"]]
    meta = frame.drop(["OpenStatus", "QuestionText"], axis=1)
    target = frame["OpenStatus"]
    return text, meta, target


# Stratified 80/20 split on the target label.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=df["OpenStatus"],
)
del df

train_text, train_meta, train_target = _split_inputs(train_df)
del train_df

val_text, val_meta, val_target = _split_inputs(val_df)
del val_df

# The scaler is fitted on the training meta features only and then reused
# for the validation meta features.
scaler = RobustScaler()
train_meta = scaler.fit_transform(train_meta)
val_meta = scaler.transform(val_meta)

Since the dataset contains both text and meta data, we need a multi-input neural network to process the different kinds of input. The text data passes through an embedding and an LSTM layer and is then joined with the meta data, after which both pass through dense layers.

In [None]:
from tensorflow.keras.layers import TextVectorization


# Fit a TextVectorization layer with default settings on the training text to
# inspect the data; `text2vec` is re-created with explicit limits in a later cell.
text2vec = TextVectorization()
text2vec.adapt(train_text)

# Vocabulary size learned by the layer, and the second dimension of the
# vectorized training text.
print("Total tokens in training data:", text2vec.vocabulary_size())
print("Largest length of any sequence:", text2vec(train_text).shape[1])

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model


# Hyper-parameters of the text branch and the size of the output layer.
MAX_TOKENS = 10000
MAX_LEN = 150
EMBED_DIM = 50
NUM_CLASSES = train_target.nunique()

# Vectorizer limited to MAX_TOKENS vocabulary entries and MAX_LEN positions.
text2vec = TextVectorization(
    max_tokens=MAX_TOKENS,
    output_sequence_length=MAX_LEN,
    name="text2vec",
)
text2vec.adapt(train_text)

# Text branch: raw string -> token ids -> embeddings -> LSTM.
text_input = layers.Input(shape=(1,), dtype=tf.string, name="text_input")
x = text2vec(text_input)
x = layers.Embedding(
    input_dim=MAX_TOKENS,
    output_dim=EMBED_DIM,
    input_length=MAX_LEN,
    name="embedding",
)(x)
x = layers.LSTM(units=128, name="hidden_lstm")(x)

# Meta branch: the scaled meta features are concatenated with the LSTM output
# and passed through the dense head.
meta_input = layers.Input(shape=train_meta.shape[1:], name="meta_input")
y = layers.Concatenate()([x, meta_input])
y = layers.Dense(
    units=256,
    activation="selu",
    kernel_initializer="lecun_normal",
    name="hidden_dense_1",
)(y)
y = layers.BatchNormalization()(y)
y = layers.Dense(
    units=64,
    activation="selu",
    kernel_initializer="lecun_normal",
    name="hidden_dense_2",
)(y)
y = layers.Dense(
    units=NUM_CLASSES,
    activation="softmax",
    name="softmax_output",
)(y)

model = Model(
    inputs=[text_input, meta_input],
    outputs=y,
    name="NLP_Model",
)

# The targets are integer class codes, hence the sparse cross-entropy loss.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

model.summary()

In [None]:
from tensorflow.keras.utils import plot_model


# Render the model graph, including tensor shapes, to an image file.
plot_model(model=model, show_shapes=True, to_file="model.jpeg")

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping


# Both callbacks watch the validation loss: the learning rate is multiplied by
# 0.5 after 5 epochs without improvement, and training stops after 20 such
# epochs with the best weights restored.
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, verbose=True)

early_stop = EarlyStopping(
    monitor="val_loss", patience=20, restore_best_weights=True, verbose=True
)

callbacks = [reduce_lr, early_stop]

In [None]:
# Train on the (text, meta) input pair, validating on the held-out split
# after every epoch; the callbacks decide when to lower the rate or stop.
history = model.fit(
    x=[train_text, train_meta],
    y=train_target,
    validation_data=([val_text, val_meta], val_target),
    epochs=100,
    batch_size=256,
    shuffle=True,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Evaluate the trained model on the validation inputs and targets.
model.evaluate([val_text, val_meta], val_target)