In [None]:
# Install pinned pre-release (nightly, dated 2020-07-01) builds of torch 1.7.0 and
# torchvision 0.8.0 for CUDA 10.1 from the PyTorch nightly wheel index.
!pip install --pre torch==1.7.0.dev20200701+cu101 torchvision==0.8.0.dev20200701+cu101 -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html

In [None]:
# Download the PyTorch/XLA environment setup script and run it, asking it to
# install the libomp5 and libopenblas-dev apt packages.
# NOTE(review): the script is fetched from the `master` branch, so its contents
# can change between runs -- consider pinning to a specific commit.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
# Install PyTorch Lightning (imported further down as `plit`).
# NOTE(review): no version is pinned here, unlike the torch install; the
# dict-style step returns used in this notebook presumably need an older
# Lightning release -- confirm and pin.
!pip install pytorch-lightning

In [None]:
import pytorch_lightning as plit
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp
import transformers
import torch.nn as nn
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from scipy.special import softmax
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
%matplotlib inline
sns.set()
import os
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def _read_by_sequence_id(csv_path):
    """Read one challenge CSV with `sequence_id` as the index."""
    return pd.read_csv(csv_path, index_col='sequence_id')


# All four challenge tables share the same `sequence_id` index column.
train_values = _read_by_sequence_id('/kaggle/input/genetic-engineering-attribution-challenge/train_values.csv')
train_labels = _read_by_sequence_id('/kaggle/input/genetic-engineering-attribution-challenge/train_labels.csv')
test_values = _read_by_sequence_id('/kaggle/input/genetic-engineering-attribution-challenge/test_values.csv')
submission_format = _read_by_sequence_id('/kaggle/input/genetic-engineering-attribution-challenge/submission_format_3TFRxH6.csv')

In [None]:
from tqdm.notebook import tqdm
def get_kmers(df, size=5, stride=3, max_len=512):
    """Turn every sequence in ``df`` into one space-separated string of k-mers.

    Only the first ``max_len`` characters of each sequence are considered.
    Windows never run past the end of the sequence, so sequences shorter than
    ``max_len`` no longer produce truncated or empty k-mers.

    Args:
        df: DataFrame with a ``sequence`` column holding strings.
        size: Length of each k-mer.
        stride: Step between the start positions of consecutive k-mers.
        max_len: Number of leading characters of each sequence to use
            (this was previously hard-coded to 512).

    Returns:
        A list with one string per row of ``df``: the lower-cased k-mers joined
        by single spaces. A sequence shorter than ``size`` yields ``''``.
    """
    kmers = []
    for sequence in tqdm(df.sequence.values):
        # Clamp the scanned region to the real sequence length; slicing past
        # the end would silently return short or empty strings.
        limit = min(len(sequence), max_len)
        kmers.append(' '.join(
            sequence[start:start + size].lower()
            for start in range(0, limit - size + 1, stride)
        ))
    # Sanity check: one k-mer string per input row.
    print(len(kmers), df.shape[0])

    return kmers

In [None]:
# For every sequence, take the label column holding the row maximum
# (presumably the one-hot lab indicator -- confirm) and expose it as `lab_id`
# next to `sequence_id`.
lab_ids = train_labels.idxmax(axis=1).to_frame('lab_id').reset_index()

kmers_train = get_kmers(train_values, 5, 3)
kmers_test = get_kmers(test_values, 5, 3)

# Turn `sequence_id` from the index back into a regular column.
train_values.reset_index(inplace=True)
test_values.reset_index(inplace=True)

# `.assign` builds new frames; assigning a column directly onto the
# `[['sequence_id']]` slice triggers pandas' SettingWithCopyWarning.
train = train_values[['sequence_id']].assign(kmers=kmers_train)
test = test_values[['sequence_id']].assign(kmers=kmers_test)

train = pd.merge(train, lab_ids, on='sequence_id', how='left')
train.head(5)

In [None]:
# Longest k-mer string in the training set, measured in characters.
train['kmers'].map(len).max()

In [None]:
# Size of the training frame as reported by `memory_usage()`, in MiB.
train.memory_usage().sum() / 2 ** 20

In [None]:
# Up-sample two specific labs: for each, draw (with replacement) twice as many
# rows as the lab currently has. A fixed `random_state` makes the draw
# reproducible across kernel restarts.
# NOTE(review): these duplicated rows are added before the train/validation
# split, so copies of one sequence can land on both sides -- confirm this is
# intended.
train_rare0 = train[train['lab_id'] == 'ON9AXMKF'].sample(frac=2, replace=True, random_state=42)
train_rare1 = train[train['lab_id'] == '0L3Y6ZB2'].sample(frac=2, replace=True, random_state=42)

In [None]:
# Append the up-sampled rows to the training frame and show the new (rows, columns).
# NOTE: re-running this cell appends the sampled rows again.
train = pd.concat(objs=[train, train_rare0, train_rare1], axis=0)
train.shape

In [None]:
# Replace the string lab ids with integer class indices.
le = LabelEncoder().fit(train['lab_id'])
train['lab_id'] = le.transform(train['lab_id'])
train.head()

In [None]:
# Keep only the model inputs and give them generic names. The non-inplace
# rename returns a new frame; renaming a column slice in place triggers
# pandas' SettingWithCopyWarning.
data = train[['kmers', 'lab_id']].rename(columns={'kmers': 'text', 'lab_id': 'label'})
data.head()

In [None]:
# Number of distinct classes present in the training labels.
num_labels = data.loc[:, 'label'].nunique()
num_labels

In [None]:
import transformers
import tokenizers

# Tokenisation and training settings.
# NOTE(review): the data module and trainer below use their own literal batch
# sizes / epoch count rather than these constants -- keep them in sync.
MAX_LEN = 128
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 8, 4
EPOCHS = 3

# Lower-casing WordPiece tokenizer for the pretrained `bert-base-uncased` checkpoint.
TOKENIZER_BERT = transformers.BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
import torch.nn.functional as F

class BERTDataset:
    """Map-style dataset that tokenizes one text per item for BERT.

    Args:
        text: Indexable collection of texts (each item is passed through ``str``).
        label: Indexable collection of integer class labels aligned with ``text``.
        tokenizer: Object exposing ``encode_plus``; defaults to the module-level
            ``TOKENIZER_BERT``.
        max_len: Length passed to the tokenizer as ``max_length``; defaults to
            the module-level ``MAX_LEN``.

    Raises:
        ValueError: If ``text`` and ``label`` have different lengths.
    """

    def __init__(self, text, label, tokenizer=None, max_len=None):
        if len(text) != len(label):
            raise ValueError(
                "text and label must have the same length, got %d and %d"
                % (len(text), len(label))
            )
        self.text = text
        self.label = label
        # The globals are only used as fallbacks, so the dataset can be built
        # with a different tokenizer or length without editing them.
        self.tokenizer = TOKENIZER_BERT if tokenizer is None else tokenizer
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize example ``item`` and return it as a dict of ``torch.long`` tensors.

        The dict has the keys ``ids``, ``mask``, ``token_type_ids`` and ``label``.
        """
        text = str(self.text[item])

        # NOTE(review): `pad_to_max_length` is presumably deprecated in newer
        # transformers releases in favour of `padding=`/`truncation=` -- confirm
        # against the installed version before changing.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs["input_ids"], dtype=torch.long),
            'mask': torch.tensor(inputs["attention_mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'label': torch.tensor(self.label[item], dtype=torch.long),
        }

In [None]:
class BertDataModule(plit.LightningDataModule):
    """Lightning data module that splits one labelled frame into train/validation loaders.

    Args:
        hparams: Stored as ``self.hparams``; not otherwise read by this class.
        data: DataFrame with ``text`` and ``label`` columns.
        train_batch_size: Batch size of the training loader.
        valid_batch_size: Batch size of the validation loader.
        num_workers: Worker processes used by each loader.
        val_fraction: Fraction of ``data`` held out for validation.
        seed: ``random_state`` of the stratified split.

    The keyword defaults reproduce the values that used to be hard-coded.
    """

    def __init__(self, hparams, data, train_batch_size=8, valid_batch_size=4,
                 num_workers=2, val_fraction=0.20, seed=42):
        super().__init__()
        self.hparams = hparams
        self.data = data
        self.train_batch_size = train_batch_size
        self.valid_batch_size = valid_batch_size
        self.num_workers = num_workers
        self.val_fraction = val_fraction
        self.seed = seed

    def prepare_data(self):
        """Nothing to do: the data is passed in as an in-memory frame."""
        pass

    def setup(self, stage=None):
        """Create the train and validation datasets from a label-stratified split."""
        train_df, val_df = train_test_split(
            self.data,
            test_size=self.val_fraction,
            random_state=self.seed,
            stratify=self.data['label'],
        )

        # `.values` hands plain arrays to the dataset, so it can index by
        # position regardless of the frame's index labels.
        self.train_dataset = BERTDataset(text=train_df.text.values, label=train_df.label.values)
        self.valid_dataset = BERTDataset(text=val_df.text.values, label=val_df.label.values)

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.train_batch_size,
            num_workers=self.num_workers,
            shuffle=True,
        )

    def val_dataloader(self):
        """Return the validation loader (fixed order)."""
        return torch.utils.data.DataLoader(
            self.valid_dataset,
            batch_size=self.valid_batch_size,
            num_workers=self.num_workers,
            shuffle=False,
        )

    def test_dataloader(self):
        """No test loader is provided."""
        return None


In [None]:
class BERTBaseUncased(nn.Module):
    """Pretrained ``bert-base-uncased`` encoder followed by dropout and a linear classifier.

    Args:
        num_classes: Number of output logits (defaults to the previously
            hard-coded 1314).
        dropout: Dropout probability applied to the encoder's second output
            before the classifier.
    """

    def __init__(self, num_classes=1314, dropout=0.3):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.bert_drop = nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding 768.
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, ids, mask, token_type_ids):
        """Return classification logits for a batch.

        Args:
            ids: Token id tensor passed to the encoder.
            mask: Attention mask passed as ``attention_mask``.
            token_type_ids: Segment ids passed as ``token_type_ids``.

        Returns:
            Output of the linear classifier applied to the encoder's second
            output (the pooled output for ``BertModel``).
        """
        outputs = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # Index instead of tuple-unpacking: when the encoder returns a
        # dict-like output object, unpacking would yield its keys, not tensors.
        pooled = outputs[1]
        return self.out(self.bert_drop(pooled))

In [None]:
class BertLit(plit.LightningModule):
    """Lightning wrapper that trains a classifier with cross-entropy loss.

    Args:
        hparams: Stored as ``self.hparams``; not otherwise read by this class.
        model: Module called as ``model(ids, mask, token_type_ids)`` that
            returns class logits.
    """

    def __init__(self, hparams, model):
        super(BertLit, self).__init__()
        self.hparams = hparams
        self.model = model
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, ids, mask, token_type_ids):
        """Delegate to the wrapped model and return its output."""
        return self.model(ids, mask, token_type_ids)

    def configure_optimizers(self):
        """Return an Adam optimizer plus a ReduceLROnPlateau scheduler monitoring ``valid_loss``."""
        # Use this module's own parameters. The previous code read the
        # notebook-level global `model`, which only worked because that global
        # happened to be bound to this instance when training started.
        # NOTE(review): lr=1e-3 looks high for fine-tuning a pretrained BERT -- confirm.
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               patience=3, threshold=0.00001, mode="min", verbose=True)
        return ([optimizer], [{'scheduler': scheduler, 'interval': 'epoch', 'monitor': 'valid_loss'}])

    def _batch_loss(self, batch):
        """Run the model on one batch dict and return its cross-entropy loss."""
        logits = self(batch['ids'], batch['mask'], batch['token_type_ids'])
        return self.loss_fn(logits, batch['label'])

    @staticmethod
    def _mean_loss(outputs):
        """Average the ``'loss'`` entries of a list of step outputs."""
        return torch.stack([step['loss'] for step in outputs]).mean()

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch and log it as ``train_loss``."""
        loss = self._batch_loss(batch)
        logs = {'train_loss': loss}
        return {'loss': loss, 'log': logs, 'progress_bar': logs}

    def training_epoch_end(self, outputs):
        """Log the mean training loss of the epoch as ``train_loss``."""
        logs = {'train_loss': self._mean_loss(outputs)}
        return {'log': logs, 'progress_bar': logs}

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one batch and log it as ``valid_loss``."""
        loss = self._batch_loss(batch)
        logs = {'valid_loss': loss}
        return {'loss': loss, 'log': logs, 'progress_bar': logs}

    def validation_epoch_end(self, outputs):
        """Log the mean validation loss of the epoch as ``valid_loss`` (the scheduler's monitor key)."""
        logs = {'valid_loss': self._mean_loss(outputs)}
        return {'log': logs, 'progress_bar': logs}
                
   

In [None]:
# Lightning trainer: 8 TPU cores, 16-bit precision, 3 epochs, full per-layer weights summary.
trainer = plit.Trainer(
    tpu_cores=8,
    precision=16,
    max_epochs=3,
    weights_summary='full',
)

In [None]:
# Wrap the BERT classifier in its Lightning module and build the data module.
# The notebook-level name `model` ends up bound to the Lightning wrapper.
model = BertLit(hparams={}, model=BERTBaseUncased())
dm = BertDataModule(hparams={}, data=data)

In [None]:
# Train the model, taking the train/validation loaders from the data module.
trainer.fit(model, datamodule=dm)