In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import OrdinalEncoder
import tensorflow as tf

# Import Data

In [None]:
# Both files are tab-separated; read them with the same settings.
df_train, df_test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        '/kaggle/input/stumbleupon/train.tsv',
        '/kaggle/input/stumbleupon/test.tsv',
    )
)

# Exploratory Data Analysis

In [None]:
# Lift the column-display limit so the preview shows every column.
pd.options.display.max_columns = None
df_train.head(n=5)

In [None]:
# Tabular features only: drop the two free-text columns and the target.
X = df_train.drop(columns=['url','boilerplate','label'])
Y = df_train['label']

# OrdinalEncoder replaces every column's distinct values with integer codes.
# It is applied to all remaining columns here, not just the categorical ones.
encoder = OrdinalEncoder()
# Keep the original row index so X stays aligned with Y.
X = pd.DataFrame(encoder.fit_transform(X),columns=X.columns,index=X.index)

# Only the last expression of a cell is displayed, so the row count is printed
# explicitly and the preview is left as the final expression.
print(X.shape[0])
X.head()

In [None]:
# mutual_info_classif adds random noise to continuous features, so its scores
# vary between runs unless random_state is fixed. Pin it so the ranking printed
# below is reproducible.
MI_RANDOM_STATE = 0

# k is kept at 24. TODO confirm it matches the number of columns in X
# (SelectKBest rejects k larger than the feature count).
sel_mutual = SelectKBest(
    lambda features, target: mutual_info_classif(
        features, target, random_state=MI_RANDOM_STATE
    ),
    k=24,
)
X_train_mutual = sel_mutual.fit_transform(X, Y)

# One score per feature, sorted ascending (strongest feature is printed last).
print(pd.DataFrame(sel_mutual.scores_,index=X.columns).sort_values(by=0))

In [None]:
# Count of rows per alchemy_category value, split by label.
plt.figure()
sns.countplot(data=df_train, x='alchemy_category', hue='label')
plt.xticks(rotation=90)

In [None]:
# Overall distribution of the target label.
plt.figure()
sns.countplot(data=df_train, x='label')
plt.xticks(rotation=90)

Based on the mutual-information scores above, `alchemy_category` shows the strongest association with the label.

# Boilerplate Column Analysis

In [None]:
!pip3 install bert-for-tf2

In [None]:
import tensorflow_hub as hub
from bert import bert_tokenization
# TF-Hub handle of the BERT encoder, wrapped as a Keras layer below.
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(
    handle=module_url,
    trainable=True,
)

## Preprocessing Text

In [None]:
def strip_boilerplate_markup(text_column):
    """Return a copy of `text_column` with JSON key labels and braces removed.

    The literal `"title":` and `"url":` labels are removed first, then every
    `{` and `}` character. Missing values are left untouched.

    NOTE(review): only the "title" and "url" labels are stripped. If the JSON
    carries other keys, their labels stay in the text. Confirm this is intended.

    Parameters
    ----------
    text_column : pandas.Series
        Column of raw boilerplate strings.

    Returns
    -------
    pandas.Series
        New series with the same index; the input is not modified.
    """
    cleaned = text_column
    for pattern in (r'"title":', r'"url":', r'{|}'):
        cleaned = cleaned.replace(to_replace=pattern, value="", regex=True)
    return cleaned


# Assign the result back instead of calling an in-place method on
# `df[col]`: that chained form operates on a temporary under pandas
# Copy-on-Write and would leave the frame unchanged.
df_train['boilerplate'] = strip_boilerplate_markup(df_train['boilerplate'])
df_test['boilerplate'] = strip_boilerplate_markup(df_test['boilerplate'])


## Model Building

In [None]:
# Build the tokenizer from the vocabulary file shipped inside the loaded hub module.
vocab_asset = bert_layer.resolved_object.vocab_file
vocab_file = vocab_asset.asset_path.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated to leave room for the two special
    tokens, wrapped in ``[CLS]`` ... ``[SEP]``, converted to ids and
    right-padded with zeros up to ``max_len``.

    Parameters
    ----------
    texts : iterable
        Texts to encode; each item is passed to ``tokenizer.tokenize``.
    tokenizer : object
        Must provide ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.
    max_len : int, default 512
        Length of every encoded row.

    Returns
    -------
    tuple of numpy.ndarray
        ``(token_ids, attention_masks, segment_ids)``. Masks are 1 on real
        tokens and 0 on padding; segment ids are all zero.
    """
    id_rows, mask_rows, segment_rows = [], [], []
    content_budget = max_len - 2  # two slots are reserved for [CLS] and [SEP]

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:content_budget]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        n_real = len(sequence)
        n_pad = max_len - n_real

        id_rows.append(tokenizer.convert_tokens_to_ids(sequence) + [0] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
def build_model(bert_layer, max_len=512):
    """Assemble the binary classifier on top of a BERT Keras layer.

    Three int32 inputs of shape ``(max_len,)`` (word ids, mask, segment ids)
    are passed to ``bert_layer`` as a list. The second output of the layer is
    average-pooled, then passed through batch normalization, dropout (0.1),
    two ReLU dense layers (32 and 10 units) and a single sigmoid unit.

    Parameters
    ----------
    bert_layer : callable Keras layer
        Called with ``[input_word_ids, input_mask, segment_ids]`` and expected
        to return two outputs.
    max_len : int, default 512
        Sequence length of each input.

    Returns
    -------
    tf.keras.Model
        Uncompiled model mapping the three inputs to one sigmoid output.
    """
    def _int_input(name):
        # All three inputs share the same shape and dtype.
        return tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=name)

    input_word_ids = _int_input("input_word_ids")
    input_mask = _int_input("input_mask")
    segment_ids = _int_input("segment_ids")
    model_inputs = [input_word_ids, input_mask, segment_ids]

    # Only the second output is used; the first one is ignored.
    _, sequence_output = bert_layer(model_inputs)

    # Classification head, applied in order.
    head = [
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(10, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
    out = sequence_output
    for layer in head:
        out = layer(out)

    return tf.keras.models.Model(inputs=model_inputs, outputs=out)

In [None]:
# Encode train and test boilerplate text with the same tokenizer and length.
max_len = 512
train_input, test_input = (
    bert_encode(frame.boilerplate.values, tokenizer, max_len=max_len)
    for frame in (df_train, df_test)
)
train_labels = df_train.label

In [None]:
model = build_model(bert_layer, max_len=max_len)

# Freeze the BERT encoder so only the classification head is trained.
# The layer is addressed through its own reference, not a positional index
# into model.layers, which would silently point at a different layer if the
# architecture in build_model changed.
bert_layer.trainable = False

model.summary()

In [None]:
from tensorflow.keras.utils import plot_model

# Render the layer graph with tensor shapes and layer names.
plot_model(
    model=model,
    show_shapes=True,
    show_layer_names=True,
)

In [None]:
# Set up epochs and steps
epochs = 15
batch_size = 32

# These step counts are derived from the full training frame and are kept for
# reference only; the learning-rate schedule below does not consume them.
train_data_size = len(df_train)
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
initial_learning_rate = 1e-5

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=10000,
    decay_rate=0.95)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# The metric is named explicitly so its validation log key is always
# 'val_auc', even if this cell is re-run in the same kernel.
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.AUC(name='auc')])

# Both callbacks must monitor a quantity that is actually logged. Accuracy is
# not among the compiled metrics, so monitoring 'val_accuracy' would leave the
# checkpoint unsaved and early stopping inactive. AUC is higher-is-better,
# hence mode='max'.
monitored_metric = 'val_auc'

checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5',
                                                monitor=monitored_metric,
                                                mode='max',
                                                save_best_only=True, verbose=1)
# restore_best_weights puts the best-epoch weights back on the model when
# training is stopped early.
earlystopping = tf.keras.callbacks.EarlyStopping(monitor=monitored_metric,
                                                 mode='max',
                                                 patience=5,
                                                 restore_best_weights=True,
                                                 verbose=1)

train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=epochs,
    callbacks=[checkpoint, earlystopping],
    batch_size=batch_size,
    verbose=1)

In [None]:
# Score the encoded test texts with the trained model.
predictions = model.predict(x=test_input)

In [None]:
# Attach the predicted scores to the test frame, then export id + score only.
df_test['label'] = predictions
submission_columns = ['urlid', 'label']
df_test[submission_columns].to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")