Please upvote the source notebook.
Source notebook: StumbleUpon Challenge (AUC, private LB 0.85) by Sumeet Sawant

In [None]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline 

import tensorflow as tf 

# Load the data 

In [None]:
# Training split: read every column, since the whole table is used for data exploration.
df_train = pd.read_csv(
    '/kaggle/input/stumbleupon/train.tsv',
    sep='\t',
)
# Test split: only the id and the boilerplate text are read, because the
# boilerplate column is the text input used for the NLP model.
df_test = pd.read_csv(
    '/kaggle/input/stumbleupon/test.tsv',
    sep='\t',
    usecols=['urlid', 'boilerplate'],
)


# Data Exploration

The dataset contains 27 columns, and the end goal is to predict whether an article is evergreen or non-evergreen. <br>


In [None]:
# Preview the first rows of the training table.
df_train.head()


In [None]:
# List the column names of the training table.
df_train.columns

In [None]:
# Count how many training rows carry each alchemy_category value.
df_train['alchemy_category'].value_counts()

In [None]:
# Count of articles per alchemy category, split by label.
plt.figure(figsize=(15, 10))
sns.countplot(data=df_train, x='alchemy_category', hue='label')
plt.xlabel('Category')
# Draw the category names vertically; the trailing semicolon keeps the
# notebook from echoing the return value of the last call.
plt.xticks(rotation=90);

The alchemy category does play a role in determining the label of an article.

We see that business, recreation and health are more likely to be evergreen. <br>

Whereas sports, computer_internet, and arts and entertainment are more likely to be non-evergreen. <br>

In [None]:
# Class balance check: plot how many rows carry each label.
# This is a balanced dataset.
sns.countplot(data=df_train, x='label')

## Cleaning the boilerplate text

Let's remove the "title" and "url" key words from each description. We will also lower-case the text, as we are planning to use an uncased version of a Transformer model.

In [None]:
def clean_boilerplate(text):
    """Return a cleaned, lower-cased copy of a boilerplate text column.

    The literal key markers ``"title":`` and ``"url":`` and every curly brace
    are removed (in that order, one pass each), then the remaining text is
    lower-cased. Missing values are left as they are.

    Parameters
    ----------
    text : pandas.Series
        Column of raw boilerplate strings.

    Returns
    -------
    pandas.Series
        A new Series; the input is not modified.
    """
    cleaned = text
    for pattern in (r'"title":', r'"url":', r'{|}'):
        cleaned = cleaned.str.replace(pattern, "", regex=True)
    return cleaned.str.lower()


# Assign the cleaned column back explicitly. Calling
# ``df[col].replace(..., inplace=True)`` mutates a temporary selection: it
# triggers a chained-assignment warning on recent pandas and leaves the
# DataFrame untouched once Copy-on-Write is enabled.
df_train['boilerplate'] = clean_boilerplate(df_train['boilerplate'])

#Cleaning the test dataframe
df_test['boilerplate'] = clean_boilerplate(df_test['boilerplate'])

# Model Download from Hugging Face 

In [None]:
from transformers import AutoTokenizer


# Download the pretrained tokenizer of the uncased BERT base checkpoint.
# NOTE(review): the model cell loads 'distilbert-base-uncased'; presumably both
# checkpoints share the same vocabulary -- confirm before swapping either one.
TOKENIZER_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [None]:
# Variables for the Transformer model.
# Every text is padded/truncated to this many tokens, because the BERT base
# uncased model can only handle up to 512 tokens at a time.
SEQ_length = 512

n_train = df_train.shape[0]
n_test = df_test.shape[0]

# Token ids and attention masks are integers and the model declares its inputs
# as int32, so allocate them as int32 rather than NumPy's float64 default.
Xids = np.zeros((n_train, SEQ_length), dtype='int32')
Xmask = np.zeros((n_train, SEQ_length), dtype='int32')
# One label per training row, kept as a float column vector.
y = np.zeros((n_train, 1))

# Matching input arrays for the test dataframe (no labels there).
Xids_test = np.zeros((n_test, SEQ_length), dtype='int32')
Xmask_test = np.zeros((n_test, SEQ_length), dtype='int32')
Xids

In [None]:
def _encode_texts(texts):
    """Tokenize ``texts`` in one batch.

    Every text is padded or truncated to ``SEQ_length`` tokens, with special
    tokens added and no token-type ids requested.

    Parameters
    ----------
    texts : iterable of str
        The documents to encode.

    Returns
    -------
    tuple
        ``(input_ids, attention_mask)`` as NumPy arrays with one row per text
        and ``SEQ_length`` columns.
    """
    texts = list(texts)
    if not texts:
        # Nothing to encode: return empty arrays of the right width.
        empty = np.zeros((0, SEQ_length), dtype='int64')
        return empty, empty
    encoded = tokenizer(
        texts,
        max_length=SEQ_length,
        padding='max_length',
        add_special_tokens=True,
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='np',
    )
    return encoded['input_ids'], encoded['attention_mask']


# Fill the preallocated training arrays in place with one batched tokenizer
# call instead of one call (and one tensor conversion) per row.
Xids[:], Xmask[:] = _encode_texts(df_train['boilerplate'])
# Copy the labels by position so the result does not depend on the DataFrame
# index being the default 0..n-1 range.
y[:, 0] = df_train['label'].to_numpy()

# Same encoding for the test texts.
Xids_test[:], Xmask_test[:] = _encode_texts(df_test['boilerplate'])

In [None]:
# Show the shape of the training input-id matrix: (number of training rows, SEQ_length).
Xids.shape

In [None]:
# Check whether a GPU is available by listing the devices visible to TensorFlow.
tf.config.get_visible_devices()

In [None]:
BATCH_SIZE = 32
TRAIN_FRACTION = 0.85


def map_func(input_ids, mask, labels):
    """Pack one example as ``(features, label)``.

    The feature dict is keyed by the names of the model's two input layers.
    """
    return {'input_ids': input_ids, 'attention_mask': mask}, labels


dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, y))
dataset = dataset.map(map_func)
# Shuffle exactly once. By default shuffle() draws a new order on every
# iteration, so take()/skip() below would select different examples each
# epoch and validation examples would leak into training. A fixed seed with
# reshuffle_each_iteration=False keeps the split stable.
dataset = dataset.shuffle(100000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(BATCH_SIZE).prefetch(1000)

# Number of batches (the last one may be partial), computed arithmetically
# instead of materialising the whole dataset just to count it.
DS_size = -(-Xids.shape[0] // BATCH_SIZE)

# The first 85% of the batches train the model; the rest are held out.
train_batches = round(DS_size * TRAIN_FRACTION)
train = dataset.take(train_batches)
val = dataset.skip(train_batches)

In [None]:
# Build the (unlabelled) test pipeline.
def map_func(input_ids, mask):
    """Pack one test example into the feature dict keyed by model input name."""
    features = {'input_ids': input_ids, 'attention_mask': mask}
    return features


test_slices = tf.data.Dataset.from_tensor_slices((Xids_test, Xmask_test))
# Order is preserved (no shuffle) and rows are grouped into batches of 32.
dataset_test = test_slices.map(map_func).batch(32).prefetch(1000)

Decode the test data and check that the urlid and the text match.

# Build the model 

In [None]:
from transformers import TFDistilBertModel, DistilBertConfig
distil_bert = 'distilbert-base-uncased'

# Pretrained DistilBERT encoder with dropout raised to 0.2 and without the
# per-layer hidden states in its output.
config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
config.output_hidden_states = False
transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config=config)

# The two inputs are named after the keys of the feature dicts in the datasets.
input_ids_in = tf.keras.layers.Input(shape=(SEQ_length,), name='input_ids', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(SEQ_length,), name='attention_mask', dtype='int32')

# Element [0] of the encoder output is used as the per-token embedding sequence.
embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
# Classification head: BiLSTM over the tokens, max-pool over the sequence,
# one hidden dense layer, then a sigmoid output.
# NOTE(review): a non-zero recurrent_dropout presumably prevents Keras from
# using the fused cuDNN LSTM kernel on GPU -- confirm if training is slow.
X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
)(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(1, activation='sigmoid')(X)
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

# Freeze the pretrained encoder so only the head is trained. Freezing it by
# reference does not depend on where the encoder sits in ``model.layers``,
# unlike slicing the first three layers.
transformer_model.trainable = False

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Binary cross-entropy loss with the Adam optimizer; AUC, precision and recall
# are tracked during training and validation.
loss_fn = tf.keras.losses.BinaryCrossentropy()
tracked_metrics = [
    tf.keras.metrics.AUC(),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
]
model.compile(optimizer='adam', loss=loss_fn, metrics=tracked_metrics)

# Below are the loss, AUC, precision and recall for each epoch

In [None]:
# Train for three epochs on the training batches, validating on the held-out batches.
history = model.fit(
    train,
    epochs=3,
    validation_data=val,
)

# Prediction 

In [None]:
# Score the test batches and store the model output as the 'label' column.
predictions = model.predict(dataset_test)
df_test['label'] = predictions

# Write only the id and the predicted label, without the index column.
df_test.to_csv(
    'submission.csv',
    columns=['urlid', 'label'],
    index=False,
)

# Precision and recall for each class

In [None]:
input_x = tf.data.Dataset.from_tensor_slices((Xids, Xmask, y))


def map_func(input_ids, mask, labels):
    """Keep only the model features of one example; the label is dropped."""
    return {'input_ids': input_ids, 'attention_mask': mask}


input_x = input_x.map(map_func)
# Deliberately NOT shuffled: predictions made from these batches must come
# back in the original row order so they line up with ``y_true`` below.
input_x = input_x.batch(32).prefetch(1000)

# Ground-truth labels, in the same row order as Xids / Xmask.
y_true = y

In [None]:
# Display the ground-truth label array.
y_true

In [None]:
# Predict straight from the encoded arrays, in their original row order.
# The batched ``dataset`` used for training is shuffled, so predictions made
# from it would not line up row-for-row with the labels in ``y``.
y_pred = model.predict(
    {
        'input_ids': Xids.astype('int32'),
        'attention_mask': Xmask.astype('int32'),
    },
    batch_size=32,
)
y_pred

In [None]:
# Turn the sigmoid outputs into hard 0/1 predictions by rounding to the nearest integer.
y_pred = np.rint(y_pred)
y_pred

In [None]:
from sklearn import metrics
# Per-class precision / recall / F1 of the rounded predictions against the true labels.
report = metrics.classification_report(y_true, y_pred)
print(report)