In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stumbleupon/test.tsv
/kaggle/input/stumbleupon/train.tsv
/kaggle/input/stumbleupon/sampleSubmission.csv
/kaggle/input/stumbleupon/raw_content.zip


In [2]:
import os
import pandas as pd
import random
import numpy as np

# pytorch: used to build and train the model
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset

# sklearn: helps generate the evaluation report and split the data
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

# huggingface: Stores pre-trained models
from transformers import BertTokenizer, BertForSequenceClassification, set_seed
from transformers import pipeline, AdamW, get_linear_schedule_with_warmup
import json 

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Training split of the StumbleUpon data (tab-separated file).
df = pd.read_csv("/kaggle/input/stumbleupon/train.tsv", sep='\t')

# Preview the first rows.
df.head()



Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,1,1,24,0,5424,170,8,0.152941,0.07913,0
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,...,1,1,40,0,4973,187,9,0.181818,0.125448,1
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,...,1,1,55,0,2240,258,11,0.166667,0.057613,1
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.4,0.1,0.016667,0.0,...,1,0,24,0,2737,120,5,0.041667,0.100858,1
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.5,0.222222,0.123457,0.04321,...,1,1,14,0,12032,162,10,0.098765,0.082569,0


In [4]:
def slicling_data(data, max_words=500):
    """Build the model input text from one JSON-encoded ``boilerplate`` value.

    The result is the tail of the page body followed by the page title,
    separated by a single space.

    Parameters
    ----------
    data : str
        JSON document, expected to be an object with optional ``"title"`` and
        ``"body"`` string fields.
    max_words : int, default 500
        Word budget for the body tail. As in the original heuristic, the
        budget is reduced by the title's length in characters.

    Returns
    -------
    str
        ``"<body tail> <title>"``; just one of the two parts when the other is
        missing, ``null`` or not a string; ``""`` when both are unusable.

    Raises
    ------
    json.JSONDecodeError, TypeError
        Propagated from ``json.loads`` when ``data`` is not valid JSON text.
    """
    record = json.loads(data)
    if not isinstance(record, dict):
        record = {}

    # Missing keys, JSON null and non-string values all count as "no text".
    # This replaces the bare `except:` blocks, which hid every kind of error.
    title = record.get('title')
    if not isinstance(title, str):
        title = ''
    body = record.get('body')
    if not isinstance(body, str):
        body = ''

    # Clamp the budget at zero: a negative budget would turn the slice start
    # non-negative and keep the wrong part of the body instead of its tail.
    budget = max(max_words - len(title), 0)
    body_words = body.split(' ')[-budget:] if budget else []
    body_text = ' '.join(body_words)

    # Join with a space so the last body word is not glued to the title.
    return ' '.join(part for part in (body_text, title) if part)

# Build the sliced text for every training row and keep it in its own column.
df['sliced_data'] = df['boilerplate'].map(slicling_data)

# Character length of each sliced text; display only the rows that came out empty.
temp = df['sliced_data'].map(len)
temp.loc[temp == 0]


4142    0
Name: sliced_data, dtype: int64

In [5]:
# Remove every row whose sliced text is empty (the check above reports row 4142).
# Filtering by condition instead of dropping a hard-coded label keeps this cell
# safe to re-run and correct if the set of empty rows ever changes.
df = df.loc[df['sliced_data'].map(len) > 0].copy()




In [6]:
import re
import string
import spacy
sp = spacy.load('en_core_web_sm')

# Patterns and the punctuation table are built once instead of on every call.
_EMOJI_RE = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'https?://\S+')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def cleaning_text(text,emojis=True,html_tag=True,http=True,lemmitize=True,punctuation=True):
    """Normalise a piece of raw text.

    The steps run in this order; each optional step is controlled by its flag
    (any truthy value enables it):

    1. ``emojis``      - strip characters in the emoji ranges listed above.
    2. (always)        - lower-case the text.
    3. ``html_tag``    - remove ``<...>`` tags.
    4. ``http``        - remove ``http://`` / ``https://`` URLs.
    5. ``lemmitize``   - replace each token by its lemma using the module-level
                         spaCy pipeline ``sp``.
    6. ``punctuation`` - delete every character in ``string.punctuation``.
    7. (always)        - collapse runs of whitespace into a single space.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text. Leading/trailing whitespace is collapsed to a single
        space but not stripped.
    """
    if emojis:
        text = _EMOJI_RE.sub('', text)

    text = text.lower()

    if html_tag:
        text = _HTML_TAG_RE.sub('', text)

    # Strips any http(s) URL up to the next whitespace; the earlier pattern
    # only matched `t.co` short links.
    if http:
        text = _URL_RE.sub('', text)

    if lemmitize:
        text = ' '.join(token.lemma_ for token in sp(text))

    if punctuation:
        text = text.translate(_PUNCTUATION_TABLE)

    # Raw-string pattern: "\s" in a normal string literal is an invalid escape.
    return _WHITESPACE_RE.sub(' ', text)

In [7]:
# Clean the sliced training text; lemmatisation and link removal are switched off.
df['sliced_data_cleaned'] = df['sliced_data'].apply(cleaning_text, lemmitize=False, http=False)

In [8]:
# Split the dataset into train and validation parts (5% held out, stratified on
# the label) to validate model performance.
# random_state is fixed so the same rows land in each part on every run; the
# global seeds are only set later, in the training cell.
x_train, x_val, y_train, y_val = train_test_split(
    df['sliced_data_cleaned'].values, df.label.values, test_size=0.05,
    stratify=df.label.values, random_state=42)

In [9]:
# Number of examples in the training and validation parts.
(x_train.shape, x_val.shape)

((7024,), (370,))

In [10]:
# init tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def encode_texts(texts, max_length=512):
    """Tokenize ``texts`` into fixed-length PyTorch tensors.

    Every text is truncated or padded to ``max_length`` tokens (special tokens
    included). The returned encoding has ``input_ids`` and ``attention_mask``;
    token type ids are not requested.

    Only ``padding='max_length'`` is passed; the deprecated
    ``pad_to_max_length=True`` flag asked for the same padding and is dropped.
    """
    return tokenizer.batch_encode_plus(
        list(texts),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
        truncation=True,
        return_token_type_ids=False
    )


# Token ids and attention masks (not embeddings) for both parts of the split.
encoded_data_train = encode_texts(x_train)
encoded_data_val = encode_texts(x_val)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:

# Training tensors: token ids, attention masks and integer class labels.
input_ids_train, attention_masks_train = (
    encoded_data_train['input_ids'],
    encoded_data_train['attention_mask'],
)
labels_train = torch.tensor(y_train.tolist())

# Validation tensors, built the same way.
input_ids_val, attention_masks_val = (
    encoded_data_val['input_ids'],
    encoded_data_val['attention_mask'],
)
labels_val = torch.tensor(y_val.tolist())

# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# Training batches are drawn in random order, validation batches in order.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=2,
    sampler=RandomSampler(dataset_train),
)
dataloader_val = DataLoader(
    dataset_val,
    batch_size=2,
    sampler=SequentialSampler(dataset_val),
)

In [12]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
# BertForSequenceClassification: the pretrained 12-layer uncased BERT model with
# a single linear classification layer on top.
# - num_labels=2 because this is a binary classification task.
# - attention weights and per-layer hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    output_hidden_states = False,
    output_attentions = False,
    num_labels = 2,
)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [13]:
# Move the model to the selected device (GPU when available, otherwise CPU).
# As the last expression of the cell it also displays the model architecture.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [14]:
# Use PyTorch's AdamW: the AdamW class exported by `transformers` is deprecated
# in favour of torch.optim.AdamW.
# weight_decay is set to 0.0 explicitly because that was the default of the
# transformers class, whereas torch.optim.AdamW defaults to 0.01.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                              eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
                              weight_decay = 0.0
                             )
from transformers import get_linear_schedule_with_warmup
# Number of training epochs (authors recommend between 2 and 4)
epochs = 3
# Total number of training steps is number of batches * number of epochs.
total_steps = len(dataloader_train) * epochs
# Learning rate scheduler: no warmup steps, then linear decay over all training steps.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

import numpy as np
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose arg-max equals the label.

    ``preds`` holds one row of class scores per example; ``labels`` is a NumPy
    array with the matching class ids.
    """
    hits = np.argmax(preds, axis=1).flatten() == labels.flatten()
    return hits.sum() / labels.size

import time
import datetime
def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The duration is rounded to the nearest whole second first.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)



In [15]:
import random
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Seed the Python, NumPy and PyTorch (CPU and all GPUs) random number generators.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Average training loss of each epoch, kept so the learning curve can be plotted.
loss_values = []
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # One full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Start time of the epoch, used for the elapsed-time reports.
    t0 = time.time()
    # Running sum of the batch losses for this epoch.
    total_loss = 0
    # Switch to training mode (this only changes the mode, e.g. enables dropout;
    # it does not perform any training by itself).
    model.train()
    for step, batch in enumerate(dataloader_train):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(dataloader_train), elapsed))
        # `batch` holds three tensors, copied to the target device here:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Clear gradients left over from the previous step; PyTorch accumulates
        # them otherwise.
        model.zero_grad()
        # Forward pass. Because `labels` is supplied, the first element of the
        # output is the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]
        # `.item()` extracts the Python number from the single-value loss tensor.
        total_loss += loss.item()
        # Backward pass to compute the gradients.
        loss.backward()
        # Clip the gradient norm to 1.0 to help prevent exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update the parameters, then the learning rate.
        optimizer.step()
        scheduler.step()
    # Average loss over all training batches of this epoch.
    avg_train_loss = total_loss / len(dataloader_train)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure performance on the validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Evaluation mode: dropout layers behave differently than during training.
    model.eval()
    # Count correct predictions and examples over the whole validation set.
    # Averaging per-batch accuracies instead would over-weight a smaller final
    # batch.
    eval_correct, nb_eval_examples = 0, 0
    for batch in dataloader_val:
        # Move the batch to the target device and unpack it.
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # No gradients are needed for validation; skipping them saves memory
        # and time.
        with torch.no_grad():
            # No labels are passed, so the first output element is the logits
            # (the scores before any softmax).
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Move logits and labels to CPU as NumPy arrays.
        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predicted = np.argmax(logits, axis=1).flatten()
        eval_correct += int(np.sum(predicted == label_ids.flatten()))
        nb_eval_examples += len(label_ids)
    # Report the accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_correct / nb_eval_examples))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
print("")
print("Training complete!")


Training...
  Batch    40  of  3,512.    Elapsed: 0:00:07.
  Batch    80  of  3,512.    Elapsed: 0:00:14.
  Batch   120  of  3,512.    Elapsed: 0:00:20.
  Batch   160  of  3,512.    Elapsed: 0:00:27.
  Batch   200  of  3,512.    Elapsed: 0:00:33.
  Batch   240  of  3,512.    Elapsed: 0:00:40.
  Batch   280  of  3,512.    Elapsed: 0:00:46.
  Batch   320  of  3,512.    Elapsed: 0:00:53.
  Batch   360  of  3,512.    Elapsed: 0:00:59.
  Batch   400  of  3,512.    Elapsed: 0:01:05.
  Batch   440  of  3,512.    Elapsed: 0:01:12.
  Batch   480  of  3,512.    Elapsed: 0:01:18.
  Batch   520  of  3,512.    Elapsed: 0:01:24.
  Batch   560  of  3,512.    Elapsed: 0:01:31.
  Batch   600  of  3,512.    Elapsed: 0:01:37.
  Batch   640  of  3,512.    Elapsed: 0:01:44.
  Batch   680  of  3,512.    Elapsed: 0:01:50.
  Batch   720  of  3,512.    Elapsed: 0:01:56.
  Batch   760  of  3,512.    Elapsed: 0:02:03.
  Batch   800  of  3,512.    Elapsed: 0:02:09.
  Batch   840  of  3,512.    Elapsed: 0:02:16.


In [16]:
# Test split of the StumbleUpon data (tab-separated file).
df1 = pd.read_csv("/kaggle/input/stumbleupon/test.tsv", sep='\t')

# Preview the first rows.
df1.head()

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio
0,http://www.lynnskitchenadventures.com/2009/04/...,5865,"{""title"":""Homemade Enchilada Sauce Lynn s Kitc...",recreation,0.443906,2.55814,0.389706,0.257353,0.044118,0.022059,...,0.199438,1,1,15,0,5643,136,3,0.242647,0.080597
1,http://lolpics.se/18552-stun-grenade-ar,782,"{""title"":""lolpics Stun grenade ar "",""body"":"" f...",culture_politics,0.135844,3.771429,0.461538,0.205128,0.051282,0.0,...,0.08,?,1,62,0,382,39,2,0.128205,0.176471
2,http://www.xcelerationfitness.com/treadmills.html,6962,"{""title"":""Treadmills "",""body"":"" treadmills, st...",?,?,2.269565,0.495726,0.384615,0.17094,0.17094,...,10.0,?,1,42,0,2420,117,1,0.581197,0.125
3,http://www.bloomberg.com/news/2012-02-06/syria...,7640,"{""title"":""Father s Tactics Used by Assad to Cr...",culture_politics,0.90259,2.52349,0.705502,0.346278,0.122977,0.090615,...,0.005964,1,1,41,0,5559,309,10,0.038835,0.063126
4,http://www.wired.com/gadgetlab/2011/12/stem-tu...,3589,"{""title"":""Stem Turns Lemons and Limes Into Jui...",science_technology,0.486363,1.848,0.470968,0.16129,0.032258,0.0,...,0.035714,1,0,34,0,2209,155,10,0.096774,0.065341


In [17]:
# `slicling_data` is already defined in the training-data section of this
# notebook. It is reused here instead of being redefined, so that the train and
# test text always go through exactly the same slicing logic.
df1['sliced_data'] = df1['boilerplate'].map(slicling_data)


In [18]:

# Character length of each sliced test text; display only the rows that came out empty.
temp = df1['sliced_data'].map(len)
temp.loc[temp == 0]


Series([], Name: sliced_data, dtype: int64)

In [19]:
# Clean the sliced test text with the same switches as the training text
# (lemmatisation and link removal off).
df1['sliced_data_clean'] = df1['sliced_data'].apply(cleaning_text, lemmitize=False, http=False)

In [20]:
# Tokenize the cleaned test text to fixed-length (512 token) tensors.
# The deprecated `pad_to_max_length=True` flag is dropped because
# `padding='max_length'` already requests the same padding, and the texts are
# passed as a plain list.
encoded_data_test = tokenizer.batch_encode_plus(
    list(df1['sliced_data_clean'].values),
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=512,
    return_tensors='pt',
    truncation=True,
    return_token_type_ids=False
)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

# The test set has no labels, so each dataset item is (input_ids, attention_mask).
dataset_test = TensorDataset(input_ids_test, attention_masks_test)

# batch_size stays at 1: the submission-building cell reads exactly one
# prediction per batch (`pred_flat[i][0]`).
dataloader_test = DataLoader(dataset_test,sampler=SequentialSampler(dataset_test), batch_size=1)

print(len(dataloader_test))

3171


In [21]:

# Prediction on test set.
# The message reports the number of examples (len of the dataset), not the
# number of batches, so it stays correct for any batch size.
print('Predicting labels for {:,} test sentences...'.format(len(dataset_test)))
# Put model in evaluation mode
model.eval()
# `predictions` collects one logits array per batch; the test set has no labels,
# so `true_labels` stays empty.
predictions , true_labels = [], []
for batch in dataloader_test:
    # Move the batch to the target device and unpack it.
    b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)

    # No gradients are needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        # Forward pass; without labels the first output element is the logits.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # Move the logits to CPU as a NumPy array and store them.
    predictions.append(outputs[0].detach().cpu().numpy())
print('DONE.')

Predicting labels for 3,171 test sentences...
DONE.


In [22]:
# Predicted class ids for each batch of logits, each batch as a plain Python list.
pred_flat = [np.argmax(batch_logits, axis=1).tolist() for batch_logits in predictions]

In [23]:
# One [urlid, predicted label] pair per test row; the label is the first (only)
# prediction of the batch at the same position.
to_sumbit = [[df1['urlid'][row], pred_flat[row][0]] for row in range(len(df1))]

In [24]:
# Submission table with one row per test URL.
df_sumbit = pd.DataFrame(data=to_sumbit, columns=['urlid', 'label'])

In [25]:
# Write the submission file to the working directory, without the index column.
df_sumbit.to_csv("submission.csv", index=False)