In [None]:
# Render the image stored at the given path as this cell's output.
from IPython.display import Image
Image("../input/huggingface-image/Capture.JPG")

# Problem Introduction
StumbleUpon is a user-curated web content discovery engine that recommends relevant, high quality pages and media to its users, based on their interests. While some pages we recommend, such as news articles or seasonal recipes, are only relevant for a short period of time, others maintain a timeless quality and can be recommended to users long after they are discovered. In other words, pages can either be classified as "ephemeral" or "evergreen". The ratings we get from our community give us strong signals that a page may no longer be relevant - but what if we could make this distinction ahead of time? A high quality prediction of "ephemeral" or "evergreen" would greatly improve a recommendation system like ours.

Many people know evergreen content when they see it, but can an algorithm make the same determination without human intuition? Your mission is to build a classifier which will evaluate a large set of URLs and label them as either evergreen or ephemeral. Can you out-class(ify) StumbleUpon? As an added incentive to the prize, a strong performance in this competition may lead to a career-launching internship at one of the best places to work in San Francisco.

# Data<br>

There are two components to the data provided for this challenge:

The first component is two files: train.tsv and test.tsv. Each is a tab-delimited text file containing the fields outlined below for 10,566 urls total. Fields for which no data is available are indicated with a question mark.

train.tsv  is the training set and contains 7,395 urls. Binary evergreen labels (either evergreen (1) or non-evergreen (0)) are provided for this set.
test.tsv is the test/evaluation set and contains 3,171 urls.

# Approach 

My aim in revisiting this past competition is to try out recent advances in NLP on this dataset. <br> This lets me gain experience with that technology. I am planning to tackle the problem using a Transformer architecture (BERT) from the Hugging Face library together with TensorFlow 2.0. Combining the two should allow me to build a powerful, state-of-the-art model for the classification task.

# Pre-processing 

I am going to focus only on the NLP part of the data and see if I can beat the seven-year-old competition winner (or at least come close), so I will ignore the rest of the feature columns. I have also kept pre-processing to a minimum (lower-casing and removing the "title" and "url" keys from the text), as I expect the Hugging Face tokenizer to take care of the rest. Also, since these articles are a kind of news feed, they should not contain much slang or non-standard English.

# Model 

- Load a pre-trained BERT model from the Hugging Face library.
- Take the embeddings out of the BERT model.
- Feed the embeddings into a feed-forward neural network.
- Evaluate using ROC-AUC.

In [None]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline 

import tensorflow as tf 

# Load the data 

In [None]:
# Load the text/summary table and split it by row position into a labelled
# training part and a held-out part.
TRAIN_ROWS = 1292  # rows [0, TRAIN_ROWS) are used for training, the rest for prediction

df = pd.read_csv('../input/rnntex/text_with_summary.csv')

# .copy() makes the training split an independent frame, so the later clean-up
# cells modify df_train itself rather than a slice that still refers back to df
# (which is what triggers SettingWithCopyWarning).
df_train = df.iloc[:TRAIN_ROWS][['summary', 'target']].copy()

# reset_index() without drop=True keeps each row's original label in an
# 'index' column, so test rows can still be traced back to df.
df_test = df.iloc[TRAIN_ROWS:][['summary', 'target']].reset_index()

In [None]:
# (rows, columns) of the two splits: test first, then train.
df_test.shape, df_train.shape

# Data Explorations 

The dataset contains 27 columns, and the end goal is to predict whether an article is evergreen or non-evergreen. <br>


In [None]:
# Preview the first rows of the training split.
df_train.head()


In [None]:
# List the column names present in the training split.
df_train.columns

In [None]:
# Count how many training rows fall into each alchemy_category value.
# NOTE(review): df_train is built in the load cell from only the 'summary' and
# 'target' columns, so this lookup looks like it will raise KeyError on a fresh
# run — confirm which frame is supposed to carry 'alchemy_category'.
df_train['alchemy_category'].value_counts()

In [None]:
# Bar chart of row counts per alchemy category, split by label, with the
# category names rotated so they stay readable.
# NOTE(review): df_train is built in the load cell from only the 'summary' and
# 'target' columns, so both 'alchemy_category' and 'label' look unavailable on a
# fresh run — confirm which frame/columns this plot is meant to use.
plt.figure(figsize=(15,10))
sns.countplot(x=df_train['alchemy_category'],hue=df_train['label']);
plt.xlabel('Category');
plt.xticks(rotation=90);

The Alchemy category does play a role in determining the label of an article.

We see that business, recreation and health are more likely to be evergreen. <br>

Whereas sports, computer_internet and arts and entertainment are more likely to be non-evergreen. <br>

In [None]:
# Class balance of the training split. The label column kept in df_train is
# 'target'; the load cell's column selection leaves no 'label' column, so
# indexing by 'label' would raise KeyError.
# This is a balanced dataset
sns.countplot(x=df_train['target']);

## Cleaning the boilerplate text

Let's remove the "title" and "url" keys from each description. We will also lower-case the text, as we are planning to use an uncased version of the Transformer model.

In [None]:
# Regex patterns stripped from every summary: the JSON-style "title": and
# "url": keys, and any curly braces.
BOILERPLATE_PATTERNS = (r'"title":', r'"url":', r'{|}')


def clean_summary(text: pd.Series) -> pd.Series:
    """Return a cleaned copy of a summary column.

    Each pattern in BOILERPLATE_PATTERNS is removed (regex substitution with
    the empty string, applied in order) and the result is lower-cased.

    Parameters
    ----------
    text : pd.Series
        Column of raw summary strings.

    Returns
    -------
    pd.Series
        New Series with the same index; the input is not modified.
    """
    cleaned = text
    for pattern in BOILERPLATE_PATTERNS:
        cleaned = cleaned.replace(to_replace=pattern, value="", regex=True)
    return cleaned.str.lower()


# Assign the result back explicitly. Calling replace(..., inplace=True) on
# df['summary'] acts on an intermediate Series, which pandas does not guarantee
# to write back into the frame (and never does under Copy-on-Write).
df_train['summary'] = clean_summary(df_train['summary'])

#Cleaning the test dataframe
df_test['summary'] = clean_summary(df_test['summary'])

# Model Download from Hugging Face 

In [None]:
from transformers import AutoTokenizer, TFAutoModel

# Download the tokenizer and the BERT encoder; both are taken from the same
# uncased checkpoint so the vocabulary matches the model weights.
MODEL_NAME = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert = TFAutoModel.from_pretrained(MODEL_NAME)

In [None]:
# Fixed token length: every text is padded or truncated to this many tokens
# by the tokenizer calls that fill the arrays below.
SEQ_length = 512

# Pre-allocated model inputs for the training split: one row per summary.
# int32 matches the dtype declared on the model's Input layers, instead of
# storing token ids and 0/1 mask values as float64.
Xids = np.zeros((df_train.shape[0], SEQ_length), dtype='int32')
Xmask = np.zeros((df_train.shape[0], SEQ_length), dtype='int32')
# Labels stay floating point, as a single column.
y = np.zeros((df_train.shape[0], 1))

# Same layout for the test split (no labels are allocated for it).
Xids_test = np.zeros((df_test.shape[0], SEQ_length), dtype='int32')
Xmask_test = np.zeros((df_test.shape[0], SEQ_length), dtype='int32')


In [None]:
def encode_texts(texts):
    """Tokenize a collection of strings in one batched tokenizer call.

    Every text is padded or truncated to SEQ_length tokens, with special
    tokens added and no token-type ids requested.

    Parameters
    ----------
    texts : iterable of str
        The raw texts, in row order.

    Returns
    -------
    (input_ids, attention_mask)
        Two NumPy arrays of shape (len(texts), SEQ_length).
    """
    encoded = tokenizer(
        list(texts),
        max_length=SEQ_length,
        padding='max_length',
        add_special_tokens=True,
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='np',
    )
    return encoded['input_ids'], encoded['attention_mask']


# Fill the pre-allocated arrays in place so later cells keep using the same
# objects. Labels are taken positionally, so the result does not depend on
# df_train's index being exactly 0..n-1 (as a .loc[i] lookup would).
Xids[:, :], Xmask[:, :] = encode_texts(df_train['summary'])
y[:, 0] = df_train['target'].to_numpy()

Xids_test[:, :], Xmask_test[:, :] = encode_texts(df_test['summary'])

In [None]:
# Check whether a GPU is available: list the devices visible to TensorFlow.
tf.config.get_visible_devices()

In [None]:
BATCH_SIZE = 32
TRAIN_FRACTION = 0.85   # share of batches used for training; the rest validate
SHUFFLE_SEED = 42


def map_func(input_ids,mask,labels):
    """Pack one labelled row into (inputs, label).

    The inputs are a dict keyed by the model's input-layer names
    ('input_ids' and 'attention_mask').
    """
    return {'input_ids':input_ids,'attention_mask':mask},labels


dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, y)).map(map_func)

# Shuffle ONCE with a fixed order. By default shuffle() reshuffles on every
# iteration, so take()/skip() below would carve out a different train/val
# partition each epoch and validation rows would leak into training.
dataset = (
    dataset
    .shuffle(100000, seed=SHUFFLE_SEED, reshuffle_each_iteration=False)
    .batch(BATCH_SIZE)
    .prefetch(1000)
)

# Number of batches, computed from the row count instead of materialising the
# whole dataset with len(list(dataset)).
DS_size = int(np.ceil(len(y) / BATCH_SIZE))
n_train_batches = round(DS_size * TRAIN_FRACTION)

# Training batches are re-ordered every epoch; this shuffle only sees the
# training part, so the split itself stays fixed.
train = dataset.take(n_train_batches).shuffle(max(n_train_batches, 1))
val = dataset.skip(n_train_batches)

In [None]:
# Display the batched training/validation dataset object.
dataset

In [None]:
#Preparing the test dataset

def test_map_func(input_ids, mask):
    """Pack one unlabelled row into the dict keyed by the model's input names.

    Named separately from the training-side map_func, which takes a third
    `labels` argument, so this definition does not shadow it.
    """
    return {'input_ids': input_ids, 'attention_mask': mask}


dataset_test = tf.data.Dataset.from_tensor_slices((Xids_test, Xmask_test))
dataset_test = dataset_test.map(test_map_func)
# No shuffling here: predictions must come back in the same row order as df_test.
dataset_test = dataset_test.batch(32).prefetch(1000)

In [None]:
# Display the batched test dataset object.
dataset_test

**TODO:** decode the test data and check that each urlid matches its text.

# Build the model 

In [None]:
# Two integer inputs of length SEQ_length; the names match the dict keys
# produced by the dataset map functions.
input_ids = tf.keras.layers.Input(shape=(SEQ_length,), name='input_ids', dtype='int32')
input_mask = tf.keras.layers.Input(shape=(SEQ_length,), name='attention_mask', dtype='int32')

# [0] selects the encoder's first output, which is pooled over the sequence
# axis below.
embedding = bert(input_ids, attention_mask=input_mask)[0]

# NOTE(review): no mask is passed to the pooling layer, so the mean runs over
# all SEQ_length positions, including those the tokenizer padded — confirm
# this is intended.
x = tf.keras.layers.GlobalAveragePooling1D()(embedding)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
# One sigmoid unit: a single score in [0, 1] per input row.
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)


model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=output)

# Freeze the pretrained encoder so only the layers added above are trained.
# Referencing `bert` directly avoids depending on its position in model.layers.
bert.trainable = False

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output. AUC is tracked
# next to accuracy because the notebook's stated evaluation measure is ROC-AUC.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

In [None]:
#?tf.keras.losses.*

In [None]:
#?tf.keras.metrics.*

In [None]:
# Fit for seven epochs on the training batches, scoring the validation
# batches after each epoch; the returned History object is kept in `history`.
history = model.fit(
    x=train,
    validation_data=val,
    epochs=7,
)

# Prediction 

In [None]:
# Score every batch of the test dataset with the trained model.
predictions = model.predict(x=dataset_test)

In [None]:
# model.predict returns one column per output unit; flatten it to one score per row.
df_test['label'] = np.ravel(predictions)

# df_test was reduced to the summary/target columns when it was created, so
# 'urlid' has to be recovered from the full frame. The original row labels
# survive in the 'index' column added by reset_index() (fall back to the
# frame's own index if that column is absent).
if 'urlid' not in df_test.columns:
    if 'urlid' not in df.columns:
        raise KeyError(
            "'urlid' is missing from both df_test and df; "
            "cannot write the submission file without it"
        )
    source_rows = df_test['index'] if 'index' in df_test.columns else df_test.index
    df_test['urlid'] = df.loc[source_rows, 'urlid'].to_numpy()

df_test.to_csv('submission_avgpool_dp.csv', columns=['urlid', 'label'], index=False)

# Useful Links 

https://towardsdatascience.com/working-with-hugging-face-transformers-and-tf-2-0-89bf35e3555a

http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

https://www.youtube.com/watch?v=GYDFBfx8Ts8&ab_channel=JamesBriggs

https://medium.com/tensorflow/using-tensorflow-2-for-state-of-the-art-natural-language-processing-102445cda54a