In [1]:
import numpy as np 
import pandas as pd 
import random
import json
import torch
import transformers
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics, model_selection
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS,TfidfVectorizer
from sklearn.model_selection import GridSearchCV,cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/stumbleupon-bert-model/model.bin
/kaggle/input/stumbleupon/test.tsv
/kaggle/input/stumbleupon/train.tsv
/kaggle/input/stumbleupon/sampleSubmission.csv
/kaggle/input/stumbleupon/raw_content.zip


In [2]:
SEED = 42
DEVICE = "cuda"
MAX_LEN = 256 #
TRAIN_BATCH_SIZE = 32 #
VALID_BATCH_SIZE = 16 #
EPOCHS = 2 #
LR = 2e-5 #
BERT_PATH = "bert-base-uncased"
MODEL_PATH = "./model.bin"
TOKENIZER = BertTokenizer.from_pretrained(BERT_PATH)

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [3]:
def prepare_dataset(df_train):
    df_train['boilerplate'] = df_train['boilerplate'].apply(lambda string: json.loads(string))
    df_train['title'] = df_train['boilerplate'].apply(lambda boilerplate: str(boilerplate['title'])+"-" if 'title' in boilerplate.keys() else "")
    df_train['title'] = df_train['title'].astype(str)
    df_train['url'] = df_train['boilerplate'].apply(lambda boilerplate: str(boilerplate['url'])+"-" if 'url' in boilerplate.keys() else "")
    df_train['title'] = df_train['title'].astype(str)
    df_train['content'] = df_train['boilerplate'].apply(lambda boilerplate: boilerplate['body'])
    df_train['content'] = df_train['content'].astype(str)
    df_train['body'] = df_train['title'].values+" "+df_train['url'].values+" "+df_train['content'].values
    

In [4]:
class ContentDataset(Dataset):
    
    def __init__(self, df, max_length):
        self.contents = df['body'].values
        self.targets = df['label'].values
        self.urlids = df['urlid'].values
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.max_length = max_length
        
    def __len__(self):
        return len(self.contents)
    
    def __getitem__(self, item):
        content = str(self.contents[item])
        content = " ".join(content.split())
        inputs = self.tokenizer.encode_plus(
            content,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )
        input_ids = inputs["input_ids"].flatten()
        attention_mask = inputs["attention_mask"].flatten()
        token_type_ids = inputs["token_type_ids"].flatten()
        
        return {
            "urlid": self.urlids[item],
            "content": content,
            "ids": torch.tensor(input_ids, dtype=torch.long),
            "mask": torch.tensor(attention_mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.targets[item], dtype=torch.float),
        }

In [5]:
def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))


def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()

    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets = d["targets"]

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids,
                        mask=mask,
                        token_type_ids=token_type_ids
                       )

        loss = loss_fn(outputs, targets)
        if bi % 10 == 0:
            print(f'batch={bi}, BCEloss={loss}')
        loss.backward()
        optimizer.step()
        scheduler.step()


def eval_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(ids=ids,
                            mask=mask,
                            token_type_ids=token_type_ids
                           )
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [6]:
class BertBaseUncased(nn.Module):
    def __init__(self):
        super(BertBaseUncased, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        o = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids).pooler_output
        bo = self.bert_drop(o)
        output = self.out(bo)
        return output

In [7]:
df = pd.read_csv('/kaggle/input/stumbleupon/train.tsv', sep='\t', usecols=['urlid', 'boilerplate', 'label'])
prepare_dataset(df)
df_train, df_valid = model_selection.train_test_split(
    df, test_size=0.3, random_state=SEED, stratify=df.label.values
)

print('Training Samples-', len(df_train), ', Validation Samples-', len(df_valid))

train_dataset = ContentDataset(df_train, max_length=MAX_LEN)

train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=4
)

valid_dataset = ContentDataset(df_valid, max_length=MAX_LEN)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
)

device = torch.device(DEVICE)
model = BertBaseUncased()
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

optimizer = AdamW(model.parameters(), lr=LR)


Training Samples- 5176 , Validation Samples- 2219


Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [8]:
num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps,
)

best_accuracy = 0
for epoch in range(EPOCHS):
    train_fn(train_data_loader, model, optimizer, device, scheduler)
    outputs, targets = eval_fn(valid_data_loader, model, device)
    accuracy = metrics.roc_auc_score(targets, outputs)
    precision_score = metrics.precision_score(targets, np.round(outputs).astype(int))
    recall_score = metrics.recall_score(targets, np.round(outputs).astype(int))
    print(f"roc_auc_score = {accuracy}, precision_score = {precision_score}, recall_score = {recall_score}")
    if accuracy > best_accuracy:
        torch.save(model.state_dict(), MODEL_PATH)
        best_accuracy = accuracy



batch=0, BCEloss=0.6955546140670776


  6%|▌         | 10/162 [00:10<02:11,  1.16it/s]

batch=10, BCEloss=0.5805596113204956


 12%|█▏        | 20/162 [00:18<01:52,  1.26it/s]

batch=20, BCEloss=0.5015707015991211


 19%|█▊        | 30/162 [00:26<01:44,  1.26it/s]

batch=30, BCEloss=0.4701194763183594


 25%|██▍       | 40/162 [00:34<01:36,  1.26it/s]

batch=40, BCEloss=0.4664444923400879


 31%|███       | 50/162 [00:42<01:28,  1.27it/s]

batch=50, BCEloss=0.5389180183410645


 37%|███▋      | 60/162 [00:50<01:20,  1.27it/s]

batch=60, BCEloss=0.46625661849975586


 43%|████▎     | 70/162 [00:58<01:11,  1.29it/s]

batch=70, BCEloss=0.44414061307907104


 49%|████▉     | 80/162 [01:06<01:06,  1.24it/s]

batch=80, BCEloss=0.4010791480541229


 56%|█████▌    | 90/162 [01:14<00:56,  1.27it/s]

batch=90, BCEloss=0.6186447143554688


 62%|██████▏   | 100/162 [01:21<00:48,  1.28it/s]

batch=100, BCEloss=0.3433424234390259


 68%|██████▊   | 110/162 [01:29<00:40,  1.27it/s]

batch=110, BCEloss=0.3558143377304077


 74%|███████▍  | 120/162 [01:38<00:34,  1.21it/s]

batch=120, BCEloss=0.5051407814025879


 80%|████████  | 130/162 [01:46<00:25,  1.26it/s]

batch=130, BCEloss=0.4741303026676178


 86%|████████▋ | 140/162 [01:54<00:17,  1.27it/s]

batch=140, BCEloss=0.5083321928977966


 93%|█████████▎| 150/162 [02:02<00:09,  1.21it/s]

batch=150, BCEloss=0.4852878153324127


 99%|█████████▉| 160/162 [02:10<00:01,  1.28it/s]

batch=160, BCEloss=0.4946613013744354


100%|██████████| 162/162 [02:11<00:00,  1.23it/s]
100%|██████████| 139/139 [00:47<00:00,  2.95it/s]


roc_auc_score = 0.881695281761129, precision_score = 0.8214285714285714, recall_score = 0.8279192273924495




batch=0, BCEloss=0.4273185133934021


  6%|▌         | 10/162 [00:11<02:13,  1.14it/s]

batch=10, BCEloss=0.4866641163825989


 12%|█▏        | 20/162 [00:18<01:51,  1.27it/s]

batch=20, BCEloss=0.40975672006607056


 19%|█▊        | 30/162 [00:26<01:46,  1.24it/s]

batch=30, BCEloss=0.38401421904563904


 25%|██▍       | 40/162 [00:34<01:36,  1.26it/s]

batch=40, BCEloss=0.3464055061340332


 31%|███       | 50/162 [00:42<01:28,  1.26it/s]

batch=50, BCEloss=0.18961775302886963


 37%|███▋      | 60/162 [00:50<01:20,  1.27it/s]

batch=60, BCEloss=0.2194509357213974


 43%|████▎     | 70/162 [00:58<01:14,  1.23it/s]

batch=70, BCEloss=0.38312840461730957


 49%|████▉     | 80/162 [01:06<01:04,  1.27it/s]

batch=80, BCEloss=0.4083515405654907


 56%|█████▌    | 90/162 [01:14<00:56,  1.27it/s]

batch=90, BCEloss=0.5153985619544983


 62%|██████▏   | 100/162 [01:22<00:49,  1.24it/s]

batch=100, BCEloss=0.7271988987922668


 68%|██████▊   | 110/162 [01:30<00:41,  1.25it/s]

batch=110, BCEloss=0.25498199462890625


 74%|███████▍  | 120/162 [01:38<00:32,  1.28it/s]

batch=120, BCEloss=0.3656969666481018


 80%|████████  | 130/162 [01:46<00:25,  1.27it/s]

batch=130, BCEloss=0.31999605894088745


 86%|████████▋ | 140/162 [01:54<00:17,  1.25it/s]

batch=140, BCEloss=0.24237903952598572


 93%|█████████▎| 150/162 [02:03<00:09,  1.25it/s]

batch=150, BCEloss=0.44746875762939453


 99%|█████████▉| 160/162 [02:10<00:01,  1.30it/s]

batch=160, BCEloss=0.2732906937599182


100%|██████████| 162/162 [02:12<00:00,  1.23it/s]
100%|██████████| 139/139 [00:48<00:00,  2.88it/s]


roc_auc_score = 0.8856928592332455, precision_score = 0.8299559471365638, recall_score = 0.8270412642669008


In [9]:
model.load_state_dict(torch.load("./model.bin"))
model.eval()

BertBaseUncased(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [10]:
df_test = pd.read_csv('/kaggle/input/stumbleupon/test.tsv', sep='\t', usecols=['urlid', 'boilerplate'])
prepare_dataset(df_test)

In [11]:
class ContentDatasetTest(Dataset):
    
    def __init__(self, df, max_length):
        self.contents = df['body'].values
        self.urlids = df['urlid'].values
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.max_length = max_length
        
    def __len__(self):
        return len(self.contents)
    
    def __getitem__(self, item):
        content = str(self.contents[item])
        content = " ".join(content.split())
        inputs = self.tokenizer.encode_plus(
            content,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )
        input_ids = inputs["input_ids"].flatten()
        attention_mask = inputs["attention_mask"].flatten()
        token_type_ids = inputs["token_type_ids"].flatten()
        
        return {
            "urlid": self.urlids[item],
            "content": content,
            "ids": torch.tensor(input_ids, dtype=torch.long),
            "mask": torch.tensor(attention_mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        }

In [12]:
test_dataset = ContentDatasetTest(df_test, MAX_LEN)

In [13]:
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=16,
    drop_last=False,
    num_workers=4,
    shuffle=False
)

In [14]:
with torch.no_grad():
    fin_outputs = []
    for bi, d in tqdm(enumerate(test_data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)

        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_np = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
        fin_outputs.extend(outputs_np)

199it [00:47,  4.17it/s]


In [15]:
predictions = pd.DataFrame({'urlid': df_test['urlid'].values, 'label': np.array(fin_outputs).reshape(1, -1).tolist()[0]})
predictions.to_csv('predictions.csv', index=False)

In [16]:
predictions

Unnamed: 0,urlid,label
0,5865,0.951297
1,782,0.264772
2,6962,0.690004
3,7640,0.085582
4,3589,0.756774
...,...,...
3166,7264,0.911576
3167,9714,0.944673
3168,5903,0.686067
3169,3176,0.829150


In [17]:
# Loading the training file
df_train=pd.read_csv('../input/stumbleupon/train.tsv',delimiter='\t')
print('Training File Loaded')

# Loading test file
df_test=pd.read_csv('../input/stumbleupon/test.tsv',delimiter='\t')
print('Test File Loaded')

# Separating labels
y_train = df_train.label.values
print('Labels Seperated')

# Extracting urlid for creating submission file
urlid_test = df_test.urlid
print('Extracted urlid from test to make submission file')

# Getting length of training data
lentraindata = df_train.shape[0]

# Seperating usable data from all the rest
text_train = df_train.boilerplate 
text_test = df_test.boilerplate

# Cleaining data
text_train_clean = [i[9:-3] for i in text_train]
text_test_clean = [i[9:-3] for i in text_test]

# Combining text_train and text_test_clean
text = text_train_clean + text_test_clean
print('Both are combined')


# Building count vectorizer 
vect = TfidfVectorizer(stop_words='english',min_df=3,strip_accents='unicode',token_pattern=r'\w{1,}',ngram_range=(1,2),use_idf=True, smooth_idf=True, sublinear_tf=True)
vect.fit(text)
X= vect.transform(text)

# Seperating test and train file
X_train= X[:lentraindata]
X_test = X[lentraindata:]

# Building the classifier

lr = LogisticRegression(penalty='l2',C=1,dual=False)

cv_score = cross_validate(lr,X_train,y_train,cv=5,scoring=['roc_auc', 'precision', 'recall'])
print(f"roc auc - {cv_score['test_roc_auc'].mean()}, precision - {cv_score['test_precision'].mean()}, recall - {cv_score['test_recall'].mean()}")
                   
lr.fit(X_train,y_train)
# Predicting 
y_pred = lr.predict(X_test)
print('Y predicted')
data = {'urlid':urlid_test,'label':y_pred}


submission_df =pd.DataFrame(data)
print('Submission DataFrame build')
submission_df.to_csv('Submission.csv',index=False)
print('File Submitted')

Training File Loaded
Test File Loaded
Labels Seperated
Extracted urlid from test to make submission file
Both are combined
roc auc - 0.8763356845417368, precision - 0.8689056665045541, recall - 0.7413109354413703
Y predicted
Submission DataFrame build
File Submitted


In [18]:
ensemble_pred=[]
bert_pred = np.array(fin_outputs).reshape(1, -1).tolist()[0]
for i in range(len(bert_pred)):
    avg_pred = (y_pred[i] + bert_pred[i])/2
    ensemble_pred.append(float(avg_pred))

In [19]:
ensemble_predictions = pd.DataFrame({'urlid': df_test['urlid'].values, 'label': ensemble_pred})
ensemble_predictions.to_csv('ensemble_predictions.csv', index=False)

In [20]:
ensemble_predictions

Unnamed: 0,urlid,label
0,5865,0.975649
1,782,0.132386
2,6962,0.345002
3,7640,0.042791
4,3589,0.378387
...,...,...
3166,7264,0.955788
3167,9714,0.972336
3168,5903,0.843034
3169,3176,0.914575
