In [None]:
############################### deps
!pip install scikit-learn
!pip install seaborn
!pip install matplotlib
!pip install numpy
!pip install pandas
!pip install transformers
!pip install torch
!pip install gdown
!gdown --id 1--p20cXTZvk57GvPTxPmoylPtImvh0Vf # best_model.pt from google drive
!gdown --id 1Udrd9a944rJH0GxDhR6052gGNksb7rXO # df_eda.pkl from google drive


Downloading...
From: https://drive.google.com/uc?id=1--p20cXTZvk57GvPTxPmoylPtImvh0Vf
To: /content/best_model.pt
100% 1.31G/1.31G [00:05<00:00, 252MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Udrd9a944rJH0GxDhR6052gGNksb7rXO
To: /content/df_eda.pkl
100% 60.9M/60.9M [00:00<00:00, 153MB/s]


In [None]:
############################### imports
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn import metrics
from sklearn.svm import LinearSVC
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
import seaborn as sns
import shutil, sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import torch
import os

In [None]:
############################### CONFIG
# Tunable constants for tokenisation, batching and optimisation.
MAX_LEN = 225
TRAIN_BATCH_SIZE = 36
VALID_BATCH_SIZE = 36
EPOCHS = 5
LEARNING_RATE = 1e-05

# Pretrained WordPiece tokenizer matching the 'bert-base-uncased' encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Keyword arguments unpacked into the DataLoader constructors:
# only the training loader shuffles.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
################################ INIT
# One blanket "ignore" filter is enough: warnings.simplefilter("ignore") and
# warnings.filterwarnings('ignore') register the same catch-all filter, so the
# duplicate call was dropped.
# NOTE(review): this silences every warning (including deprecations) for the
# whole session; narrow the filter if specific warnings should stay visible.
warnings.simplefilter("ignore")
sns.set_style("darkgrid")

In [None]:
################################ READS
# Load the dataframe stored in df_eda.pkl (the file fetched with gdown in the deps cell).
# NOTE(review): read_pickle unpickles the file, and unpickling can execute arbitrary
# code; only load pickles that come from a source you trust.
df = pd.read_pickle("df_eda.pkl")

In [None]:
############################### CUDA
# Pick the compute device once: the GPU when torch can see one, the CPU otherwise,
# and report the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
############################### JOIN TITLE + BODY
# Build the model input text: title, then ". ", then body, concatenated row by row.
df['Combo'] = df['Title'].add(". ").add(df['Body'])

In [None]:
############################### BINARIZATION
# Turn each row's tag collection into a 0/1 indicator row, one column per distinct tag,
# aligned on the index of df.
mlb = MultiLabelBinarizer()
tag_df = pd.DataFrame(
    data=mlb.fit_transform(df['Tags']),
    index=df.index,
    columns=mlb.classes_,
)
# Tag names in the column order produced by the binarizer.
class_names = mlb.classes_

In [None]:
############################### DATAFRAME HOUSEKEEPING
# Attach the tag indicator columns, collapse them into a single list-valued
# 'target_list' column, and keep only the two columns the torch Dataset reads.
# Columns are selected by name (tag_df.columns, 'Combo') instead of the former
# hard-coded positions 3:103 and 0:2, so the cell no longer depends on the
# number of tags or on the column order of the loaded dataframe.
df = df.join(tag_df)
df = df.drop(columns='Tags')
df['target_list'] = df[tag_df.columns].values.tolist()
df = df[['Combo', 'target_list']]

# DEBUG
# print(df.head(2))
# print(df.shape)

In [None]:
############################### SPLIT
# Cross-checking that this train/test split is exactly the same as
# the train/test split used for model A.
# The check is needed because of the difference in data structures (df vs array)
# (for example pd.sample(random_state=0) returns a different split than sklearn for the same state).

# The split is 80/20 for train-val/test
# and another 80/20 for train/val,
# so train: 64%, val: 16%, and test: 20% of the full set.

# Splitting the dataframe
train_dataset, test_dataset = train_test_split(df, test_size=0.2, random_state=0)
train_dataset, val_dataset = train_test_split(train_dataset, test_size=0.2, random_state=0)

# Renumber each split 0..n-1 so integer lookups by the torch Dataset line up with row positions.
train_dataset = train_dataset.reset_index(drop=True)
val_dataset = val_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)

# DEBUG
# print(train_dataset.head(1))
# print(test_dataset.head(1))

print("[PROGRAM]: full-set shape: {}".format(df.shape))
print("[PROGRAM]: train-set shape: {}".format(train_dataset.shape))
print("[PROGRAM]: val-set shape: {}".format(val_dataset.shape))
print("[PROGRAM]: test-set shape: {}".format(test_dataset.shape))

[PROGRAM]: full-set shape: (80393, 2)
[PROGRAM]: train-set shape: (51451, 2)
[PROGRAM]: val-set shape: (12863, 2)
[PROGRAM]: test-set shape: (16079, 2)


In [None]:
############################### TORCH DATASET
class CustomDataset(Dataset):
    """Torch dataset that tokenizes the 'Combo' text of a dataframe row on access.

    Args:
        dataframe: pandas DataFrame with a 'Combo' text column and a
            'target_list' column holding one label vector per row.
        tokenizer: object exposing ``encode_plus`` (a BertTokenizer in this notebook).
        max_len: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.combo = dataframe['Combo']
        self.targets = dataframe['target_list']
        self.max_len = max_len

    def __len__(self):
        """Number of rows (samples) in the wrapped dataframe."""
        return len(self.combo)

    def __getitem__(self, index):
        """Return the encoded text and target vector for the row at position ``index``.

        Returns:
            dict with long tensors 'ids', 'mask', 'token_type_ids' and a float
            tensor 'targets'.
        """
        # Positional (.iloc) lookup: a DataLoader hands out positions 0..len-1,
        # which only coincide with index labels when the frame was reset.
        # Label-based self.combo[index] raised KeyError (or fetched the wrong
        # row) for any dataframe whose index is not 0..n-1.
        combo = str(self.combo.iloc[index])
        # Collapse every run of whitespace (newlines, tabs, ...) into one space.
        combo = " ".join(combo.split())

        inputs = self.tokenizer.encode_plus(
            combo,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

# Wrap each split in a CustomDataset sharing the same tokenizer and sequence length.
train_set, val_set, test_set = (
    CustomDataset(split_frame, tokenizer, MAX_LEN)
    for split_frame in (train_dataset, val_dataset, test_dataset)
)

# DEBUG
# print(train_set[0])

In [None]:
############################### TORCH DATALOADER
# The training loader takes train_params; validation and test both take test_params.
training_loader = DataLoader(dataset=train_set, **train_params)
validation_loader, test_loader = (
    DataLoader(dataset=held_out_set, **test_params)
    for held_out_set in (val_set, test_set)
)

In [None]:
############################### TRAIN FUNCS
# checkpoint and save funcs from here (joe)
# https://towardsdatascience.com/how-to-save-and-load-a-model-in-pytorch-with-a-complete-example-c2920e617dee

def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits ``outputs`` and ``targets``.

    The sigmoid is applied inside the loss, so ``outputs`` must be un-activated logits.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

def load_ckp(checkpoint_fpath, model, optimizer, map_location="cpu"):
    """Restore model and optimizer state from a saved checkpoint.

    Args:
        checkpoint_fpath: path (or file-like object) of the checkpoint, a dict
            with the keys 'state_dict', 'optimizer', 'epoch' and 'valid_loss_min'.
        model: module whose parameters are overwritten from 'state_dict'.
        optimizer: optimizer whose state is overwritten from 'optimizer'.
        map_location: forwarded to ``torch.load``. The default "cpu" lets a
            checkpoint saved from a GPU be loaded on a machine without CUDA.

    Returns:
        tuple: (model, optimizer, epoch, valid_loss_min) where the last two are
        read from the checkpoint.
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints that
    # come from a trusted source.
    # Without map_location, tensors saved from CUDA are restored onto CUDA and
    # loading fails when no GPU is available.
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min

In [None]:
############################### MODEL
# base: BERT
# plus an extra dropout + linear layer
# ending in 100 neurons, one per class
# at inference the 100 outputs are passed through a sigmoid to get per-tag probabilities,
# and every probability > 0.5 is mapped to 1 (otherwise 0)

class BERTClass(torch.nn.Module):
    """'bert-base-uncased' encoder followed by dropout and a linear multi-label head.

    Args:
        num_labels: number of output logits; defaults to 100.
        dropout: dropout probability applied before the linear head; defaults to 0.3.

    The defaults reproduce the original fixed architecture (768 -> 100 with
    dropout 0.3); the head's input width is read from the encoder config
    instead of being hard-coded.
    """

    def __init__(self, num_labels=100, dropout=0.3):
        super().__init__()
        # The attribute names l1/l2/l3 are kept as-is: they form the state_dict
        # keys that load_state_dict matches a saved checkpoint against.
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(dropout)
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw logits (no sigmoid applied) for a batch of encoded inputs."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element is fed to the classification head.
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        return self.l3(self.l2(pooled))

# Build the network and restore the fine-tuned weights before moving it to the device.
model = BERTClass()
# load_ckp restores into an optimizer as well, so one is created for it here.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
# Only the restored model is kept; optimizer, epoch and min validation loss are discarded.
model,_,_,_ = load_ckp("best_model.pt", model, optimizer)
model.to(device)
print("[INFO]: BERT finetuned model loaded from best checkpoint")
print("[INFO]: model loaded to device")

[INFO]: BERT finetuned model loaded from best checkpoint
[INFO]: model loaded to device


In [None]:
################################ METRICS
def score_avg(y_pred, y_test):
    """Print and return the micro-averaged metrics over all labels.

    Args:
        y_pred: predicted label indicators.
        y_test: ground-truth label indicators.

    Returns:
        list: [precision, recall, f1, hamming loss, jaccard score], in that order.
    """
    micro = {'average': 'micro'}
    # (report line template, metric value) pairs, in reporting order.
    summary = [
        ("[PROGRAM]: avg precision: {}", precision_score(y_test, y_pred, **micro)),
        ("[PROGRAM]: avg recall: {}", recall_score(y_test, y_pred, **micro)),
        ("[PROGRAM]: avg f1-score: {}", f1_score(y_test, y_pred, **micro)),
        ("[PROGRAM]: avg hamming loss: {}", hamming_loss(y_test, y_pred)),
        ("[PROGRAM]: avg jacard score: {}", jaccard_score(y_test, y_pred, **micro)),
    ]

    print("[PROGRAM]: classifier -> BERT finetuned")
    for template, value in summary:
        print(template.format(value))

    return [value for _, value in summary]

def score_per_tag(y_pred, y_test):
    """Compute one column of metrics per tag.

    Args:
        y_pred: 2-D array of predicted label indicators (rows = samples, columns = tags).
        y_test: 2-D array of ground-truth label indicators with the same layout.

    Returns:
        pandas.DataFrame indexed by metric name ("Precision", "Recall",
        "F-1 score", "True count", "Hamming loss", "Jaccard score") with one
        column per entry of ``mlb.classes_``.
    """
    precision, recall, fscore, support = score(y_test, y_pred)

    # Transposing yields one (truth, prediction) vector pair per tag column.
    per_tag_pairs = list(zip(y_test.T, y_pred.T))
    hamming = [hamming_loss(truth, guess) for truth, guess in per_tag_pairs]
    jaccard = [jaccard_score(truth, guess) for truth, guess in per_tag_pairs]

    rows = {
        "Precision": precision,
        "Recall": recall,
        "F-1 score": fscore,
        "True count": support,
        "Hamming loss": hamming,
        "Jaccard score": jaccard,
    }
    return pd.DataFrame(data=list(rows.values()),
                        index=list(rows.keys()),
                        columns=mlb.classes_)

In [None]:
################################ INFERENCE TEST-SET
# Run the fine-tuned model over the test loader with gradients disabled,
# collecting ground-truth vectors and sigmoid probabilities batch by batch.
model.eval()
y_test = []
y_pred = []
# Index of the final batch, derived from the loader instead of the former hard-coded 446.
last_batch_idx = len(test_loader) - 1
with torch.no_grad():
    for batch_idx, data in enumerate(test_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        outputs = model(ids, mask, token_type_ids)
        # Targets are only collected, never fed to the model, so they are not
        # copied to the device and back.
        targets = data['targets'].to(dtype = torch.float)
        y_test.extend(targets.numpy().tolist())
        y_pred.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
        print("[PROGRAM]: INFERENCE BATCH ", batch_idx, " /{}".format(last_batch_idx))

# Hard-threshold the probabilities at 0.5 to obtain 0/1 predictions.
y_pred = (np.array(y_pred) > 0.5).astype(int)
y_test = np.array(y_test)

[PROGRAM]: INFERENCE BATCH  0  /446
[PROGRAM]: INFERENCE BATCH  1  /446
[PROGRAM]: INFERENCE BATCH  2  /446
[PROGRAM]: INFERENCE BATCH  3  /446
[PROGRAM]: INFERENCE BATCH  4  /446
[PROGRAM]: INFERENCE BATCH  5  /446
[PROGRAM]: INFERENCE BATCH  6  /446
[PROGRAM]: INFERENCE BATCH  7  /446
[PROGRAM]: INFERENCE BATCH  8  /446
[PROGRAM]: INFERENCE BATCH  9  /446
[PROGRAM]: INFERENCE BATCH  10  /446
[PROGRAM]: INFERENCE BATCH  11  /446
[PROGRAM]: INFERENCE BATCH  12  /446
[PROGRAM]: INFERENCE BATCH  13  /446
[PROGRAM]: INFERENCE BATCH  14  /446
[PROGRAM]: INFERENCE BATCH  15  /446
[PROGRAM]: INFERENCE BATCH  16  /446
[PROGRAM]: INFERENCE BATCH  17  /446
[PROGRAM]: INFERENCE BATCH  18  /446
[PROGRAM]: INFERENCE BATCH  19  /446
[PROGRAM]: INFERENCE BATCH  20  /446
[PROGRAM]: INFERENCE BATCH  21  /446
[PROGRAM]: INFERENCE BATCH  22  /446
[PROGRAM]: INFERENCE BATCH  23  /446
[PROGRAM]: INFERENCE BATCH  24  /446
[PROGRAM]: INFERENCE BATCH  25  /446
[PROGRAM]: INFERENCE BATCH  26  /446
[PROGRAM]: 

In [None]:
################################# METRICS (micro-average)
# Score the thresholded test-set predictions: micro-averaged over all tags first,
# then one column of metrics per tag.
print("[INFO]: computing micro-average metrics for all tags")
metrics_avg = score_avg(y_pred=y_pred, y_test=y_test)
metrics_per_tag = score_per_tag(y_pred=y_pred, y_test=y_test)

[INFO]: computing micro-average metrics for all tags
[PROGRAM]: classifier -> BERT finetuned
[PROGRAM]: avg precision: 0.8367903538362161
[PROGRAM]: avg recall: 0.4596310021288339
[PROGRAM]: avg f1-score: 0.5933484312577928
[PROGRAM]: avg hamming loss: 0.009939050936003483
[PROGRAM]: avg jacard score: 0.42181620839363243


In [None]:
################################ METRICS ON TOP TEN TAGS
# Show the per-tag metrics for ten selected tags, then each metric's mean across them.
top_ten_tags = [
    "javascript", "java", "c#", "php", "android",
    "jquery", "python", "html", "c++", "ios",
]
print("[INFO]: computing top-ten tag metrics")
print(metrics_per_tag.loc[:, top_ten_tags])
print("[INFO]: computing top-ten tag metrics averaged")
# axis=1 averages along each metric row, i.e. across the ten tag columns.
print(metrics_per_tag.loc[:, top_ten_tags].apply(np.mean, axis=1))

[INFO]: computing top-ten tag metrics
                javascript         java           c#         php      android  \
Precision         0.792972     0.878280     0.753503    0.890080     0.947538   
Recall            0.598566     0.693725     0.662745    0.794258     0.905166   
F-1 score         0.682190     0.775169     0.705216    0.839444     0.925868   
True count     1395.000000  1737.000000  1785.000000  836.000000  1297.000000   
Hamming loss      0.048386     0.043473     0.061509    0.015797     0.011692   
Jaccard score     0.517669     0.632878     0.544659    0.723312     0.861968   

                   jquery       python        html          c++         ios  
Precision        0.849408     0.924547    0.723214     0.840095    0.623580  
Recall           0.699164     0.838504    0.155172     0.636528    0.664145  
F-1 score        0.766998     0.879426    0.255521     0.724280    0.643223  
True count     718.000000  1096.000000  522.000000  1106.000000  661.000000  
Hamm