In [1]:
from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
import apex
from sklearn.model_selection import train_test_split

import datetime

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc

In [2]:
# Ask PyTorch to release cached CUDA memory before the run starts
# (useful when the kernel is reused between experiments).
torch.cuda.empty_cache()

In [3]:
# Show full cell contents when displaying DataFrames. `None` is the supported
# spelling for "no limit"; the former `-1` is deprecated (it triggers a
# FutureWarning) and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# Timestamp identifying this run; used later to build the log file name.
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

  pd.set_option('display.max_colwidth', -1)


In [4]:
# --- Input locations (relative to the notebook's working directory) ---
DATA_PATH = Path('../data/')
AUG_DATA_PATH = Path('../data/data_augmentation/')
LABEL_PATH = Path('../labels/')

# --- Output locations ---
MODEL_PATH = Path('../models/')
OUTPUT_PATH = MODEL_PATH / 'output'
LOG_PATH = Path('../logs/')

# --- Pretrained / fine-tuned weights ---
# BERT_PRETRAINED_PATH = Path('../../bert_models/pretrained-weights/cased_L-12_H-768_A-12/')
BERT_PRETRAINED_PATH = Path('../../bert_models/uncased_L-12_H-768_A-12/')
# BERT_PRETRAINED_PATH = Path('../../bert_fastai/pretrained-weights/uncased_L-24_H-1024_A-16/')

# No fine-tuned checkpoint is loaded in this run; both stay None.
# FINETUNED_PATH = Path('../models/finetuned_model.bin')
FINETUNED_PATH = None
# model_state_dict = torch.load(FINETUNED_PATH)
model_state_dict = None

# Create the output directories. Order matters: MODEL_PATH must exist before
# OUTPUT_PATH, because mkdir is called without parents=True.
for _directory in (MODEL_PATH, LOG_PATH, OUTPUT_PATH):
    _directory.mkdir(exist_ok=True)

In [5]:
# Run configuration.
#
# Every key now appears exactly once. The previous literal repeated "no_cuda",
# "seed" and "task_name"; Python keeps the *last* value for a repeated key, so
# "task_name" was silently 'intent' rather than the first value written.
# Effective values and key order are unchanged by the clean-up.
#
# NOTE(review): "bert_model" points at a BERT checkpoint directory while
# "model_name"/"model_type" select XLNet; the data bunch and learner cells
# below pass `args.model_name` — confirm which model is intended.
args = Box({
    "run_text": "multilabel toxic comments with freezable layers",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": 'intent',
    "no_cuda": False,
    "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 256,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    "learning_rate": 5e-5,
    "num_train_epochs": 6,
    "warmup_proportion": 0.0,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": True,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "warmup_steps": 500,
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "loss_scale": 128,
    "model_name": 'xlnet-base-cased',
    "model_type": 'xlnet'
})

In [6]:
import logging

# One log file per run, named after the start timestamp and the run description.
logfile = str(LOG_PATH / f'log-{run_start_time}-{args["run_text"]}.txt')

# Send every record both to the run's log file and to the notebook output.
_log_handlers = [
    logging.FileHandler(logfile),
    logging.StreamHandler(sys.stdout),
]

logging.basicConfig(
    handlers=_log_handlers,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    level=logging.INFO,
)

# Root logger; handed to the learner further down.
logger = logging.getLogger()

In [7]:
# Record the full run configuration in the log so the run can be traced later.
logger.info(args)

01/18/2021 17:50:33 - INFO - root -   {'run_text': 'multilabel toxic comments with freezable layers', 'train_size': -1, 'val_size': -1, 'log_path': WindowsPath('../logs'), 'full_data_dir': WindowsPath('../data'), 'data_dir': WindowsPath('../data'), 'task_name': 'intent', 'no_cuda': False, 'bert_model': WindowsPath('../../bert_models/uncased_L-12_H-768_A-12'), 'output_dir': WindowsPath('../models/output'), 'max_seq_length': 256, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 8, 'eval_batch_size': 16, 'learning_rate': 5e-05, 'num_train_epochs': 6, 'warmup_proportion': 0.0, 'local_rank': -1, 'seed': 42, 'gradient_accumulation_steps': 1, 'optimize_on_cpu': False, 'fp16': True, 'fp16_opt_level': 'O1', 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'max_steps': -1, 'warmup_steps': 500, 'logging_steps': 50, 'eval_all_checkpoints': True, 'overwrite_output_dir': True, 'overwrite_cache': False, 'loss_scale': 128, 'model_name': 'xlnet-base-cased',

In [8]:
# Load a BERT tokenizer from the local pretrained directory.
# NOTE(review): the data bunch cell below is given `args.model_name`
# ('xlnet-base-cased') rather than this object, so `tokenizer` appears to be
# unused in the visible cells — confirm whether it is still needed.
tokenizer = BertTokenizer.from_pretrained(str(BERT_PRETRAINED_PATH), do_lower_case=args['do_lower_case'])

01/18/2021 17:50:33 - INFO - transformers.tokenization_utils_base -   Model name '..\..\bert_models\uncased_L-12_H-768_A-12' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, TurkuNLP/bert-base-finnish-cased-v1, TurkuNLP/bert-base-finnish-uncased-v1, wietsedv/bert-base-dutch-cased). Assuming '..\..\bert_models\uncased_L-12_H-768_A-12' is a path, a model identifier, or url to a directory containing tokenizer files.
01/18/2021 17:50:33 - INFO - transformers.tokenization_utils_base -   Didn't find file ..\..\bert_models\uncased_L-12_H-768_A-12\added_t

In [9]:
# Pick the compute device. Honour the `no_cuda` flag from the run configuration
# and fall back to CPU when no CUDA device is available, instead of
# unconditionally requesting 'cuda'.
# NOTE(review): with `fp16` enabled the later training step presumably still
# requires a GPU — confirm before running on CPU.
use_cuda = torch.cuda.is_available() and not args.no_cuda
device = torch.device('cuda' if use_cuda else 'cpu')

# Multi-GPU mode only makes sense when CUDA is in use and more than one GPU is visible.
args.multi_gpu = use_cuda and torch.cuda.device_count() > 1

In [10]:
# Names of the label columns (one binary column per tag category) used for
# multi-label classification, in the order they are passed to the data bunch.
label_cols = [
    'Company Attributes',
    'Covered Products',
    'Customer Contacted Tags',
    'Customer Rescue Tags',
    'Facilities',
    'Facility Attributes',
    'Food Items',
    'Marketing',
    'Personnel',
    'Personnel Attributes',
    'Processes',
    'Product Attributes',
    'Product and Service Attributes',
    'Products',
    'Products / Services',
    'Risk',
    'Risk Management',
    'Risks',
]

In [11]:
# Build the data bunch from the train/val/test CSVs in the data directory.
# The third positional argument is the model name from the run configuration,
# and `label_cols` selects the multi-label target columns.
databunch = BertDataBunch(
    args.data_dir,
    LABEL_PATH,
    args.model_name,
    train_file='train.csv',
    val_file='val.csv',
    test_data='test.csv',
    text_col="All_comments",
    label_col=label_cols,
    batch_size_per_gpu=args.train_batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args.multi_gpu,
    multi_label=True,
    model_type=args.model_type,
)

01/18/2021 17:50:34 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json from cache at C:\Users\hqli0/.cache\torch\transformers\c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.69e5e35e0b798cab5e473f253752f8bf4d280ee37682281a23eed80f6e2d09c6
01/18/2021 17:50:34 - INFO - transformers.configuration_utils -   Model config XLNetConfig {
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,

In [12]:
# Sanity check: inspect element 3 of the first training example
# (presumably the multi-hot label vector — confirm against the dataset layout).
databunch.train_dl.dataset[0][3]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0.])

In [13]:
# train_df.head(20)
# databunch = BertDataBunch.load(args['data_dir'])

In [14]:
# Number of labels known to the data bunch; displayed as the cell output.
num_labels = len(databunch.labels)
num_labels

18

In [15]:
# databunch.train_dl.dataset[10]

In [16]:
# torch.distributed.init_process_group(backend="nccl", 
#                                      init_method = "tcp://localhost:23459", 
#                                      rank=0, world_size=1)

In [17]:
# Evaluation metrics handed to the learner: each entry pairs a display name
# with the metric function imported from fast_bert.metrics.
metrics = [
    {'name': 'accuracy_thresh', 'function': accuracy_thresh},
    {'name': 'roc_auc', 'function': roc_auc},
    {'name': 'fbeta', 'function': fbeta},
]

In [18]:
# Create the multi-label learner from the pretrained model named in the run
# configuration. FINETUNED_PATH is None here, so no fine-tuned weights are supplied.
# NOTE(review): `logging_steps` is hard-coded to 0 rather than taken from
# `args.logging_steps` — confirm this is intentional.
learner = BertLearner.from_pretrained_model(
    databunch,
    args.model_name,
    metrics=metrics,
    device=device,
    logger=logger,
    output_dir=args.output_dir,
    finetuned_wgts_path=FINETUNED_PATH,
    warmup_steps=args.warmup_steps,
    multi_gpu=args.multi_gpu,
    is_fp16=args.fp16,
    multi_label=True,
    logging_steps=0,
)



01/18/2021 17:50:41 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json from cache at C:\Users\hqli0/.cache\torch\transformers\c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.69e5e35e0b798cab5e473f253752f8bf4d280ee37682281a23eed80f6e2d09c6
01/18/2021 17:50:41 - INFO - transformers.configuration_utils -   Model config XLNetConfig {
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",

In [19]:
# Train for the configured number of epochs at the configured learning rate,
# with validation enabled. The call's return value is shown as the cell output.
learner.fit(args.num_train_epochs, args.learning_rate, validate=True)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
01/18/2021 17:50:45 - INFO - root -   ***** Running training *****
01/18/2021 17:50:45 - INFO - root -     Num examples = 67127
01/18/2021 17:50:45 - INFO - root -     Num Epochs = 6
01/18/2021 17:50:45 - INFO - root -     Total train batch size (w. parallel, distributed & accumulation) = 8
01/18/2021 17:50:45 -

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:882.)
  return orig_fn(arg0, *args, **kwargs)


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
01/18/2021 18:59:07 - INFO - root -   Running evaluation
01/18/2021 18:59:07 - INFO - root -     Num examples = 28769
01/18/2021 18:59:07 - INFO - root -     Batch size = 16


01/18/2021 19:09:50 - INFO - root -   eval_loss after epoch 1: 0.15494368677071527: 
01/18/2021 19:09:50 - INFO - root -   eval_accuracy_thresh after epoch 1: 0.9442610144615173: 
01/18/2021 19:09:50 - INFO - root -   eval_roc_auc after epoch 1: 0.9670205994270473: 
01/18/2021 19:09:50 - INFO - root -   eval_fbeta after epoch 1: 0.8375136256217957: 
01/18/2021 19:09:50 - INFO - root -   lr after epoch 1: 4.6971392599443394e-05
01/18/2021 19:09:50 - INFO - root -   train_loss after epoch 1: 0.2284901311252359
01/18/2021 19:09:50 - INFO - root -   





Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
01/18/2021 20:20:40 - INFO - root -   Running evaluation
01/18/2021 20:20:40 - INFO - root -     Num examples = 28769
01/18/2021 20:20:40 - INFO - root -     Batch size = 16


01/18/2021 20:30:59 - INFO - root -   eval_loss after epoch 2: 0.1303368349103521: 
01/18/2021 20:30:59 - INFO - root -   eval_accuracy_thresh after epoch 2: 0.9545518159866333: 
01/18/2021 20:30:59 - INFO - root -   eval_roc_auc after epoch 2: 0.9757168310161599: 
01/18/2021 20:30:59 - INFO - root -   eval_fbeta after epoch 2: 0.8671594858169556: 
01/18/2021 20:30:59 - INFO - root -   lr after epoch 2: 3.7952058901445867e-05
01/18/2021 20:30:59 - INFO - root -   train_loss after epoch 2: 0.14733783668434325
01/18/2021 20:30:59 - INFO - root -   





Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
01/18/2021 21:43:49 - INFO - root -   Running evaluation
01/18/2021 21:43:49 - INFO - root -     Num examples = 28769
01/18/2021 21:43:49 - INFO - root -     Batch size = 16


01/18/2021 21:53:44 - INFO - root -   eval_loss after epoch 3: 0.1201424604295243: 
01/18/2021 21:53:44 - INFO - root -   eval_accuracy_thresh after epoch 3: 0.9582710862159729: 
01/18/2021 21:53:44 - INFO - root -   eval_roc_auc after epoch 3: 0.9797760986381807: 
01/18/2021 21:53:44 - INFO - root -   eval_fbeta after epoch 3: 0.8819658756256104: 
01/18/2021 21:53:44 - INFO - root -   lr after epoch 3: 2.5393896032638558e-05
01/18/2021 21:53:44 - INFO - root -   train_loss after epoch 3: 0.12893888318185093
01/18/2021 21:53:44 - INFO - root -   





Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
01/18/2021 23:11:04 - INFO - root -   Running evaluation
01/18/2021 23:11:04 - INFO - root -     Num examples = 28769
01/18/2021 23:11:04 - INFO - root -     Batch size = 16


01/18/2021 23:37:54 - INFO - root -   eval_loss after epoch 4: 0.11493587224361689: 
01/18/2021 23:37:54 - INFO - root -   eval_accuracy_thresh after epoch 4: 0.959908664226532: 
01/18/2021 23:37:54 - INFO - root -   eval_roc_auc after epoch 4: 0.981926743636248: 
01/18/2021 23:37:54 - INFO - root -   eval_fbeta after epoch 4: 0.8910912871360779: 
01/18/2021 23:37:54 - INFO - root -   lr after epoch 4: 1.2728110832804649e-05
01/18/2021 23:37:54 - INFO - root -   train_loss after epoch 4: 0.1193719093597153
01/18/2021 23:37:54 - INFO - root -   





Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 65536.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
01/19/2021 00:47:40 - INFO - root -   Running evaluation
01/19/2021 00:47:40 - INFO - root -     Num examples = 28769
01/19/2021 00:47:40 - INFO - root -     Batch size = 16


01/19/2021 00:57:35 - INFO - root -   eval_loss after epoch 5: 0.11269423962203685: 
01/19/2021 00:57:35 - INFO - root -   eval_accuracy_thresh after epoch 5: 0.9609282612800598: 
01/19/2021 00:57:35 - INFO - root -   eval_roc_auc after epoch 5: 0.9824327224336975: 
01/19/2021 00:57:35 - INFO - root -   eval_fbeta after epoch 5: 0.8930819034576416: 
01/19/2021 00:57:35 - INFO - root -   lr after epoch 5: 3.4153152770316832e-06
01/19/2021 00:57:35 - INFO - root -   train_loss after epoch 5: 0.11426767817306734
01/19/2021 00:57:35 - INFO - root -   





Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 65536.0
01/19/2021 02:08:13 - INFO - root -   Running evaluation
01/19/2021 02:08:13 - INFO - root -     Num examples = 28769
01/19/2021 02:08:13 - INFO - root -     Batch size = 16


01/19/2021 02:18:11 - INFO - root -   eval_loss after epoch 6: 0.11235934502862936: 
01/19/2021 02:18:11 - INFO - root -   eval_accuracy_thresh after epoch 6: 0.9611329436302185: 
01/19/2021 02:18:11 - INFO - root -   eval_roc_auc after epoch 6: 0.9825482387100534: 
01/19/2021 02:18:11 - INFO - root -   eval_fbeta after epoch 6: 0.8932914733886719: 
01/19/2021 02:18:11 - INFO - root -   lr after epoch 6: 0.0
01/19/2021 02:18:11 - INFO - root -   train_loss after epoch 6: 0.11231594373822876
01/19/2021 02:18:11 - INFO - root -   





(50346, 0.14178706371040692)

In [20]:
# Run evaluation once more after training; the returned metrics dict is the cell output.
learner.validate()

01/19/2021 02:18:11 - INFO - root -   Running evaluation
01/19/2021 02:18:11 - INFO - root -     Num examples = 28769
01/19/2021 02:18:11 - INFO - root -     Batch size = 16


{'loss': 0.11235934502862936,
 'accuracy_thresh': 0.9611329436302185,
 'roc_auc': 0.9825482387100534,
 'fbeta': 0.8932914733886719}

In [21]:
# Persist the trained model (the log output below shows config and weights
# written under the learner's output directory).
learner.save_model()

01/19/2021 02:28:06 - INFO - transformers.configuration_utils -   Configuration saved in ..\models\output\model_out\config.json
01/19/2021 02:28:08 - INFO - transformers.modeling_utils -   Model weights saved in ..\models\output\model_out\pytorch_model.bin


In [39]:
# Load the held-out test set from the same data directory the data bunch uses,
# instead of repeating the '../data/' path as a string literal.
test = pd.read_csv(DATA_PATH / 'test.csv')

In [40]:
# Display the loaded test frame for a visual check.
# NOTE(review): this renders the whole frame; consider `test.head()` for larger files.
test

Unnamed: 0,All_comments,Tag Category,Company Attributes,Covered Products,Customer Contacted Tags,Customer Rescue Tags,Facilities,Facility Attributes,Food Items,Marketing,Personnel,Personnel Attributes,Processes,Product Attributes,Product and Service Attributes,Products,Products / Services,Risk,Risk Management,Risks
0,i love the personnel touch and the coffee the tellers are always great to chat with and they remember details about me. i think matt (?) is always enjoyable to talk with.,"['Facility Attributes', 'Personnel', 'Personnel Attributes']",0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0
1,"no complaints. all of the staff are engaged, friendly, and are custoomer focused management also engaged. customer service and happy people willing to help. the bank and staff are a stark constrast to customer service at other banks and retail establishments. nothing. do not close.","['Facilities', 'Personnel', 'Personnel Attributes']",0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0
2,friendly fast service never had a problem with all the years we have been banking with you.,"['Marketing', 'Personnel Attributes', 'Processes']",0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0
3,the employees at this bank greet me with friendly face and knowledgeable. location and appearance outstanding. nothing that i can think of - employees are great and hours perfect.,"['Personnel', 'Personnel Attributes', 'Processes']",0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0
4,"this is one of my favorite locations. it is close to my home and very easy to access. the staff is great especially ingrid! she is always very helpful and so personable. she us truly an asset to your staff. as i mentioned before, everyone is super helpful but ingrid is the best! can't think of anything","['Facility Attributes', 'Personnel', 'Personnel Attributes']",0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0
5,"unfortunately, we were victims of a fraud. the banks reaction to our dilemma was with empathy and did not demean us in any way for our lack of good judgement. rachel shroeder was extremely understanding and went above and beyond to help us thru this very stressful and embarrassing episode. she is to be commended for her professionalism, tempered with her understanding of our state of mind because of the situation we got ourselves into. please, just keep working on someway to get quicker feedback on counterfeit and fraudulent checks. as stated previously, the understanding and empathy that was displayed by rachel. her willingness to do everything she could do, whatever it took, to avert anymore unpleasantness occurring due to the situation. we learned a lot from her. quicker feedback on the validity of deposited checks. check fraud reconciliation","['Customer Rescue Tags', 'Personnel', 'Personnel Attributes', 'Processes', 'Product Attributes', 'Products', 'Risk', 'Risk Management', 'Risks']",0,0,0,1,0,0,0,0,1,1,1,1,0,1,0,1,1,1
6,the 2nd branch closing in beatrice made the drive to the bank more of an inconvenience.,"['Customer Contacted Tags', 'Customer Rescue Tags', 'Facility Attributes', 'Personnel', 'Personnel Attributes', 'Processes']",0,0,1,1,0,1,0,0,1,1,1,0,0,0,0,0,0,0
7,"every time i visit this branch the staff is quick and friendly. i think it is quite clever that they put the checking deposit slip and pen in the tube. that to me is very convenient. however, i never tried to make a deposit online or at before.","['Marketing', 'Personnel', 'Personnel Attributes', 'Processes', 'Product Attributes', 'Products']",0,0,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,0
8,it would be easier to do business with more atm locations in lincoln.,"['Personnel', 'Products']",0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
9,i had multiple deposits to multiple accounts as well as detailed cash back requests.,"['Product Attributes', 'Products']",0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0


In [41]:
# Predict on every test comment. The next cell unpacks each element of `result`
# as a sequence of (label, score) pairs — one sequence per input text.
result=learner.predict_batch(test['All_comments'].tolist())

01/19/2021 10:31:22 - INFO - root -   Writing example 0 of 19




In [42]:
# Minimum score (exclusive) a label needs in order to be kept as a predicted tag.
SCORE_THRESHOLD = 0.5

# For each input text keep only the labels whose score exceeds the threshold,
# preserving the order in which the learner returned them.
prediction = [
    [label for label, score in scored_labels if score > SCORE_THRESHOLD]
    for scored_labels in result
]

In [44]:
# Attach the predicted tag lists as a new column next to the ground-truth 'Tag Category'.
# NOTE(review): this mutates `test` in place, so the frame displayed earlier is now stale.
test['Predict Tag Category']=prediction

In [45]:
# Display the test frame again, now including the 'Predict Tag Category' column.
test

Unnamed: 0,All_comments,Tag Category,Company Attributes,Covered Products,Customer Contacted Tags,Customer Rescue Tags,Facilities,Facility Attributes,Food Items,Marketing,...,Personnel Attributes,Processes,Product Attributes,Product and Service Attributes,Products,Products / Services,Risk,Risk Management,Risks,Predict Tag Category
0,i love the personnel touch and the coffee the tellers are always great to chat with and they remember details about me. i think matt (?) is always enjoyable to talk with.,"['Facility Attributes', 'Personnel', 'Personnel Attributes']",0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,"[Personnel, Facility Attributes, Personnel Attributes]"
1,"no complaints. all of the staff are engaged, friendly, and are custoomer focused management also engaged. customer service and happy people willing to help. the bank and staff are a stark constrast to customer service at other banks and retail establishments. nothing. do not close.","['Facilities', 'Personnel', 'Personnel Attributes']",0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,"[Personnel, Personnel Attributes, Facility Attributes]"
2,friendly fast service never had a problem with all the years we have been banking with you.,"['Marketing', 'Personnel Attributes', 'Processes']",0,0,0,0,0,0,0,1,...,1,1,0,0,0,0,0,0,0,"[Marketing, Personnel Attributes, Processes]"
3,the employees at this bank greet me with friendly face and knowledgeable. location and appearance outstanding. nothing that i can think of - employees are great and hours perfect.,"['Personnel', 'Personnel Attributes', 'Processes']",0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,"[Personnel Attributes, Personnel, Processes, Facility Attributes]"
4,"this is one of my favorite locations. it is close to my home and very easy to access. the staff is great especially ingrid! she is always very helpful and so personable. she us truly an asset to your staff. as i mentioned before, everyone is super helpful but ingrid is the best! can't think of anything","['Facility Attributes', 'Personnel', 'Personnel Attributes']",0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,"[Personnel, Personnel Attributes, Facility Attributes]"
5,"unfortunately, we were victims of a fraud. the banks reaction to our dilemma was with empathy and did not demean us in any way for our lack of good judgement. rachel shroeder was extremely understanding and went above and beyond to help us thru this very stressful and embarrassing episode. she is to be commended for her professionalism, tempered with her understanding of our state of mind because of the situation we got ourselves into. please, just keep working on someway to get quicker feedback on counterfeit and fraudulent checks. as stated previously, the understanding and empathy that was displayed by rachel. her willingness to do everything she could do, whatever it took, to avert anymore unpleasantness occurring due to the situation. we learned a lot from her. quicker feedback on the validity of deposited checks. check fraud reconciliation","['Customer Rescue Tags', 'Personnel', 'Personnel Attributes', 'Processes', 'Product Attributes', 'Products', 'Risk', 'Risk Management', 'Risks']",0,0,0,1,0,0,0,0,...,1,1,1,0,1,0,1,1,1,"[Personnel Attributes, Risks, Risk Management, Risk, Personnel, Processes, Product Attributes]"
6,the 2nd branch closing in beatrice made the drive to the bank more of an inconvenience.,"['Customer Contacted Tags', 'Customer Rescue Tags', 'Facility Attributes', 'Personnel', 'Personnel Attributes', 'Processes']",0,0,1,1,0,1,0,0,...,1,1,0,0,0,0,0,0,0,"[Personnel Attributes, Personnel, Processes, Facility Attributes]"
7,"every time i visit this branch the staff is quick and friendly. i think it is quite clever that they put the checking deposit slip and pen in the tube. that to me is very convenient. however, i never tried to make a deposit online or at before.","['Marketing', 'Personnel', 'Personnel Attributes', 'Processes', 'Product Attributes', 'Products']",0,0,0,0,0,0,0,1,...,1,1,1,0,1,0,0,0,0,"[Products, Personnel, Marketing, Processes, Product Attributes, Personnel Attributes]"
8,it would be easier to do business with more atm locations in lincoln.,"['Personnel', 'Products']",0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,"[Products, Personnel]"
9,i had multiple deposits to multiple accounts as well as detailed cash back requests.,"['Product Attributes', 'Products']",0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,"[Product Attributes, Products]"
