In [167]:
# !pip install transformers 
# !pip install pytorch-lightning==1.3.8
# !pip install lightning-bolts
# !pip install -v python-mecab-ko

from pl_bolts.callbacks import PrintTableMetricsCallback

import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from torchmetrics.functional import accuracy, f1, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
# from pytorch_lightning.accelerators import CPUAccelerator
# from pytorch_lightning.plugins import NativeMixedPrecisionPlugin, DDPPlugin

import seaborn as sns
import matplotlib.pyplot as plt
import re
import os 

import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AdamW, BertConfig
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
from transformers import RobertaConfig, RobertaModel
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

import pandas as pd
import numpy as np
import random
import time
import datetime
import pickle 
from tqdm import tqdm
tqdm.pandas()

# from konlpy.tag import Mecab
from tqdm import tqdm
from collections import Counter
# import mecab
# mecab = mecab.MeCab()

from os import listdir
from os.path import isfile, join

class VOC_Dataset2(Dataset):

  def __init__(self, data, tokenizer, max_token_len=512):
    self.tokenizer = tokenizer
    self.data = data
    self.max_token_len = max_token_len
    
  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    data_row = self.data.iloc[index]

    voc_text   = data_row.VOC
    voc_labels = data_row.label

    encoding = self.tokenizer.encode_plus(voc_text,
                                          add_special_tokens=True,
                                          max_length=self.max_token_len,
                                          return_token_type_ids=False,
                                          padding="max_length",
                                          truncation=True,
                                          return_attention_mask=True,
                                          return_tensors='pt')

    return dict(voc_text=voc_text,
                input_ids=encoding["input_ids"].flatten(),
                attention_mask=encoding["attention_mask"].flatten(),
                labels=torch.FloatTensor(voc_labels))

class VOC_DataModule(pl.LightningDataModule):
  def __init__(self, train_df, test_df, tokenizer, batch_size=4, max_token_len=200):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len
    self.test_dataset  = VOC_Dataset2(self.test_df, self.tokenizer, self.max_token_len)

  def setup(self, stage=None):
    self.train_dataset = VOC_Dataset2(self.train_df, self.tokenizer, self.max_token_len)         
    self.test_dataset  = VOC_Dataset2(self.test_df, self.tokenizer, self.max_token_len)
    
  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
    
  def val_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=self.batch_size)
    
  def test_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=self.batch_size)
  
  def predict_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=self.batch_size)

class VOC_TopicLabeler(pl.LightningModule):
  def __init__(self, n_classes, n_training_steps=None, n_warmup_steps=None):
    super().__init__()
    self.config = config
    # self.config = AutoConfig.from_pretrained("klue/roberta-base", output_hidden_states=True)
    # self.config.max_position_embeddings = 512
    self.model = AutoModelForMaskedLM.from_pretrained("klue/roberta-base", config=self.config)
    self.classifier = nn.Linear(self.model.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.BCEWithLogitsLoss() #nn.BCELoss() with sigmoid layer 
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob) 
    self.dense = nn.Linear(self.model.config.hidden_size, self.model.config.hidden_size)
    self.activation = nn.Tanh()

  def forward(self, input_ids, attention_mask, labels=None):
    output = self.model(input_ids, attention_mask=attention_mask)
    last_hidden_state = output.hidden_states[-1]
    pooled_output = self.classifier(self.dropout(self.activation(self.dense(last_hidden_state[:,0]))))
    loss = 0
    if labels is not None:
        loss = self.criterion(pooled_output, labels)
    return loss, torch.sigmoid(pooled_output)

  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return {"loss": loss, "predictions": outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def predict_step(self, batch, batch_idx, dataset_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("predic_loss", loss, prog_bar=True, logger=True)
    return loss, outputs

  def training_epoch_end(self, outputs):
    labels = []
    predictions = []
    for output in outputs:
      for out_labels in output["labels"].detach().cpu():
        labels.append(out_labels)
      for out_predictions in output["predictions"].detach().cpu():
        predictions.append(out_predictions)
    labels = torch.stack(labels).int()
    predictions = torch.stack(predictions)
    for i, name in enumerate(LABEL_COLUMNS):
      class_roc_auc = auroc(predictions[:, i], labels[:, i])
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)

  def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=2e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.n_warmup_steps, num_training_steps=self.n_training_steps)
    return dict(optimizer=optimizer, lr_scheduler=dict(scheduler=scheduler, interval='step'))

def findall_vec(key,voc):
  try:
    return re.findall(key, voc)[0]
  except:
    return ''

def findall_vec2(df):
  return findall_vec(df['keyword'],df['VOC'])

def filter_etc(df):
  voc_col = df['VOC'].apply(lambda x: re.sub('[^A-Za-z0-9가-힣 ]', '', x))
  filt0 = (voc_col.str.len() < 2).astype(int)
  filt1 = voc_col.apply(lambda x : bool(re.match(r'^[_\W]+$', str(x).replace(' ','')))).astype(int)
  filt2 = voc_col.apply(lambda x : bool(re.match(r'[\d/-]+$', str(x).replace(' ','')))).astype(int)
  filt3 = voc_col.str.replace(' ','').str.split('').fillna('').apply(set).str.len() == 2
  filt4 = voc_col.progress_apply(lambda x : tuple(Counter(mecab.morphs(x)).keys())).isin(voc_etc.apply(lambda x : tuple(x.keys())))
  return filt0+filt1+filt2+filt3+filt4

def filter_etc2(df):
  voc_col = df['VOC'].apply(lambda x: re.sub('[^A-Za-z0-9가-힣 ]', '', x))
  filt0 = (voc_col.str.len() < 2).astype(int)
  filt1 = voc_col.apply(lambda x : bool(re.match(r'^[_\W]+$', str(x).replace(' ','')))).astype(int)
  filt2 = voc_col.apply(lambda x : bool(re.match(r'[\d/-]+$', str(x).replace(' ','')))).astype(int)
  filt3 = voc_col.str.replace(' ','').str.split('').fillna('').apply(set).str.len() == 2
#   filt4 = voc_col.progress_apply(lambda x : tuple(Counter(mecab.morphs(x)).keys())).isin(voc_etc2.apply(lambda x : tuple(x.keys())))
  voc_col_enc = voc_col.apply(lambda x : Counter(tokenizer.encode_plus(x,
                      add_special_tokens=True,
                      max_length=200,
                      return_token_type_ids=False,
                      truncation=True,
                      return_attention_mask=True,
                      return_tensors='pt')['input_ids'].numpy()[0][1:-1]).keys())
  filt4 = voc_col_enc.apply(lambda x : ','.join([str(y) for y in x])).isin(voc_etc2.apply(lambda x : ','.join([str(y) for y in x.keys()])))
  return filt0+filt1+filt2+filt3+filt4

pd.options.mode.chained_assignment = None  # default='warn'
  
# tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
# model = AutoModelForMaskedLM.from_pretrained("klue/roberta-base", return_dict=True)
#parameters
N_EPOCHS = 10
BATCH_SIZE = 12
MAX_LEN = 256
LR = 2e-05
opt_thresh = 0.4


# config_path=os.path.abspath(os.getcwd())+'/config.json'
# directory = '/content/drive/MyDrive/sema/'
directory = os.path.abspath(os.getcwd())

input_files = pd.Series([f[:-5] for f in listdir(directory+'/voc_data') if isfile(join(directory+'/voc_data', f))])
output_files = [f[:-5] for f in listdir(directory+'/output') if isfile(join(directory+'/output', f))]
running_files = input_files[~(input_files+'_output').isin(output_files)]
print(running_files)

#Load Model
tokenizer = pickle.load(open(directory+"/tokenizer.pkl", "rb"))
config = pickle.load(open(directory+"/config.pkl", "rb"))
# model = pickle.load(open(directory+"model.pkl", "rb"))
with open(directory+'/data.pkl', 'rb') as f:
    mlb = pickle.load(f)
LABEL_COLUMNS = mlb.classes_[:]
voc_etc2 = pd.read_pickle(directory+'/voc_etc2.pkl')
# voc_etc = pd.read_pickle(directory+'/voc_etc.pkl')
# voc_etc = pd.concat([voc_etc,pd.Series([Counter(mecab.morphs('모름'))])])[:]
# voc_etc = voc_etc.apply(lambda x : np.array(sorted(x.keys())))
# keyword = pd.read_pickle('/content/drive/MyDrive/sema/keyword.pkl')
keyword = pd.read_pickle(directory+'/keyword.pkl')


new_model = VOC_TopicLabeler.load_from_checkpoint(checkpoint_path=directory+"/model_weights/hosrevroberta_210825_5.ckpt", n_classes=len(LABEL_COLUMNS))
new_model.eval()

for file in running_files[~running_files.str.startswith('.')]:
  #input files 
  print('Reading : ' + file)
  voc_testset = pd.read_excel(directory+'/voc_data/' + file +'.xlsx',dtype=str)
  voc_testset['VOC1'] = voc_testset.VOC1.str.replace('\n',' ')
  voc_testset['VOC2'] = voc_testset.VOC2.str.replace('\n',' ')

  voc = pd.concat([voc_testset.VOC1,voc_testset.VOC2]).sort_index().values
  voc_testset = pd.concat([voc_testset]*2).sort_index().iloc[:,:-2]
  voc_testset['VOC'] = voc
  voc_testset['VOC'].fillna('',inplace=True)
  voc_testset['VOC'] = voc_testset['VOC'].apply(str)
  voc_testset.reset_index(inplace=True)
  voc_testset['label'] = pd.DataFrame(np.zeros((len(mlb.classes_),voc_testset.shape[0])).T).astype(int).apply(list, axis=1)

  #Setup
  print('Setting : ' + file)
  data_module = VOC_DataModule(voc_testset, voc_testset, tokenizer, batch_size=BATCH_SIZE, max_token_len=MAX_LEN)
  data_module.setup()
#   accelerator = CPUAccelerator(training_type_plugin=DDPPlugin(),precision_plugin=NativeMixedPrecisionPlugin())
#   trainer = pl.Trainer(accelerator=accelerator,max_epochs=N_EPOCHS, progress_bar_refresh_rate=3)
  trainer = pl.Trainer(max_epochs=N_EPOCHS, progress_bar_refresh_rate=3)

  #Inference
  print('Inferencing : ' + file)
  testing_predict = trainer.predict(new_model, datamodule=data_module)
  sema_df_final = np.vstack(pd.Series(np.vstack(testing_predict)[:,1]).apply(lambda x : np.vstack(x.detach().cpu().clone().numpy())))
  pred_label = (sema_df_final>opt_thresh).astype(int)
  voc_testset['pred'] = pd.Series(mlb.inverse_transform(pred_label)).apply(list)
  del voc_testset['label']

  #기타
  print('Filtering ETC data : ' + file)
  
  testing = filter_etc2(voc_testset)

  voc_testset.pred.loc[testing>0] = [[] for _ in range((testing>0).sum())]
  voc_testset = voc_testset.explode('pred',ignore_index=True)

  #키워드
  print('Extracting Keywords : ' + file)
  voc_testset['topic'] = voc_testset.pred.str.split('_').str[0]
  voc_testset['sentiment'] = voc_testset.pred.str.split('_').str[1]
  voc_testset.topic.fillna('기타',inplace=True)
  voc_testset['keyword'] = keyword.loc[voc_testset.topic].values
  voc_testset['keyword'] = voc_testset.apply(findall_vec2, axis=1)

  #save
  print('Saving output File : ' + file)
  try:
    voc_testset.fillna('').astype(str).to_excel(directory+'/output/' + file +'_output.xlsx',encoding='utf-8-sig',engine='openpyxl')
  except: 
    voc_testset.fillna('').astype(str).to_excel(directory+'/output/' + file +'_output.xlsx',encoding='utf-8-sig',engine='xlsxwriter')


0                                                 .DS_
1    PEI솔루션_VOC응답(2018_2020년)_20210713_s...
dtype: object


GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Reading : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500
Setting : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500
Inferencing : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500


  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…


Filtering ETC data : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500
Extracting Keywords : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500
Saving output File : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500


  ary = asanyarray(ary)


In [90]:
for file in running_files[~running_files.str.startswith('.')]:
  #input files 
  print('Reading : ' + file)
  voc_testset = pd.read_excel(directory+'/voc_data/' + file +'.xlsx',dtype=str)
  voc_testset['VOC1'] = voc_testset.VOC1.str.replace('\n',' ')
  voc_testset['VOC2'] = voc_testset.VOC2.str.replace('\n',' ')

  voc = pd.concat([voc_testset.VOC1,voc_testset.VOC2]).sort_index().values
  voc_testset = pd.concat([voc_testset]*2).sort_index().iloc[:,:-2]
  voc_testset['VOC'] = voc
  voc_testset['VOC'].fillna('',inplace=True)
  voc_testset['VOC'] = voc_testset['VOC'].apply(str)
  voc_testset.reset_index(inplace=True)
  voc_testset['label'] = pd.DataFrame(np.zeros((len(mlb.classes_),voc_testset.shape[0])).T).astype(int).apply(list, axis=1)

  #Setup
  print('Setting : ' + file)
  data_module = VOC_DataModule(voc_testset, voc_testset, tokenizer, batch_size=BATCH_SIZE, max_token_len=MAX_LEN)
  data_module.setup()
#   accelerator = CPUAccelerator(training_type_plugin=DDPPlugin(),precision_plugin=NativeMixedPrecisionPlugin())
#   trainer = pl.Trainer(accelerator=accelerator,max_epochs=N_EPOCHS, progress_bar_refresh_rate=3)
  trainer = pl.Trainer(max_epochs=N_EPOCHS, progress_bar_refresh_rate=3)

  #Inference
  print('Inferencing : ' + file)
  testing_predict = trainer.predict(new_model, datamodule=data_module)
  sema_df_final = np.vstack(pd.Series(np.vstack(testing_predict)[:,1]).apply(lambda x : np.vstack(x.detach().cpu().clone().numpy())))
  pred_label = (sema_df_final>opt_thresh).astype(int)
  voc_testset['pred'] = pd.Series(mlb.inverse_transform(pred_label)).apply(list)
  del voc_testset['label']

  #기타
  print('Filtering ETC data : ' + file)
  
  testing = filter_etc2(voc_testset)

  voc_testset.pred.loc[testing>0] = [[] for _ in range((testing>0).sum())]
  voc_testset = voc_testset.explode('pred',ignore_index=True)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Reading : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500
Setting : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500
Inferencing : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500


  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…

  ary = asanyarray(ary)
  0%| | 1/1000 [00:00<00:00, 4739


Filtering ETC data : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500





NameError: name 'mecab' is not defined

In [None]:

for file in running_files:
  #input files 
  print('Reading : ' + file)
  voc_testset = pd.read_excel(directory+'/voc_data/' + file +'.xlsx',dtype=str)
  voc_testset['VOC1'] = voc_testset.VOC1.str.replace('\n',' ')
  voc_testset['VOC2'] = voc_testset.VOC2.str.replace('\n',' ')

  voc = pd.concat([voc_testset.VOC1,voc_testset.VOC2]).sort_index().values
  voc_testset = pd.concat([voc_testset]*2).sort_index().iloc[:,:-2]
  voc_testset['VOC'] = voc
  voc_testset['VOC'].fillna('',inplace=True)
  voc_testset['VOC'] = voc_testset['VOC'].apply(str)
  voc_testset.reset_index(inplace=True)
  voc_testset['label'] = pd.DataFrame(np.zeros((len(mlb.classes_),voc_testset.shape[0])).T).astype(int).apply(list, axis=1)

  #Setup
  print('Setting : ' + file)
  data_module = VOC_DataModule(voc_testset, voc_testset, tokenizer, batch_size=BATCH_SIZE, max_token_len=MAX_LEN)
  data_module.setup()
  trainer = pl.Trainer(max_epochs=N_EPOCHS, progress_bar_refresh_rate=3)

  #Inference
  print('Inferencing : ' + file)
  testing_predict = trainer.predict(new_model, datamodule=data_module)
  sema_df_final = np.vstack(pd.Series(np.vstack(testing_predict)[:,1]).apply(lambda x : np.vstack(x.detach().cpu().clone().numpy())))
  pred_label = (sema_df_final>opt_thresh).astype(int)
  voc_testset['pred'] = pd.Series(mlb.inverse_transform(pred_label)).apply(list)
  del voc_testset['label']

  #기타
  print('Filtering ETC data : ' + file)
  
  testing = filter_etc(voc_testset)

  voc_testset.pred.loc[testing>0] = [[] for _ in range((testing>0).sum())]
  voc_testset = voc_testset.explode('pred',ignore_index=True)

  #키워드
  print('Extracting Keywords : ' + file)
  voc_testset['topic'] = voc_testset.pred.str.split('_').str[0]
  voc_testset['sentiment'] = voc_testset.pred.str.split('_').str[1]
  voc_testset.topic.fillna('기타',inplace=True)
  voc_testset['keyword'] = keyword.loc[voc_testset.topic].values
  voc_testset['keyword'] = voc_testset.apply(findall_vec2, axis=1)

  #save
  print('Saving output File : ' + file)
  voc_testset.fillna('').astype(str).to_excel(directory+'/output/' + file +'_output.xlsx',encoding='utf-8-sig',engine='openpyxl')


Reading : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500_2 (1)


GPU available: True, used: False
TPU available: False, using: 0 TPU cores


Setting : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500_2 (1)
Inferencing : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500_2 (1)


  "GPU available but not used. Set the gpus flag in your trainer"


Predicting: 0it [00:00, ?it/s]

In [166]:
# testing = filter_etc2(voc_testset)

# voc_testset.pred.loc[testing>0] = [[] for _ in range((testing>0).sum())]
# voc_testset = voc_testset.explode('pred',ignore_index=True)
# voc_etc2.apply(lambda x : tuple(x.keys()))
def filter_etc2(df):
  voc_col = df['VOC'].apply(lambda x: re.sub('[^A-Za-z0-9가-힣 ]', '', x))
  filt0 = (voc_col.str.len() < 2).astype(int)
  filt1 = voc_col.apply(lambda x : bool(re.match(r'^[_\W]+$', str(x).replace(' ','')))).astype(int)
  filt2 = voc_col.apply(lambda x : bool(re.match(r'[\d/-]+$', str(x).replace(' ','')))).astype(int)
  filt3 = voc_col.str.replace(' ','').str.split('').fillna('').apply(set).str.len() == 2
#   filt4 = voc_col.progress_apply(lambda x : tuple(Counter(mecab.morphs(x)).keys())).isin(voc_etc2.apply(lambda x : tuple(x.keys())))
  voc_col_enc = voc_col.apply(lambda x : Counter(tokenizer.encode_plus(x,
                      add_special_tokens=True,
                      max_length=200,
                      return_token_type_ids=False,
                      truncation=True,
                      return_attention_mask=True,
                      return_tensors='pt')['input_ids'].numpy()[0][1:-1]).keys())
  filt4 = voc_col_enc.apply(lambda x : ','.join([str(y) for y in x])).isin(voc_etc2.apply(lambda x : ','.join([str(y) for y in x.keys()])))
  return filt0+filt1+filt2+filt3+filt4
voc_testset[filter_etc2(voc_testset)>0]
# voc_enccc = voc_testset['VOC'].apply(lambda x: re.sub('[^A-Za-z0-9가-힣 ]', '', x)).apply(lambda x : list(Counter(tokenizer.encode_plus(x,
#                       add_special_tokens=True,
#                       max_length=200,
#                       return_token_type_ids=False,
#                       truncation=True,
#                       return_attention_mask=True,
#                       return_tensors='pt')['input_ids'].numpy()[0][1:-1]).keys()))#.isin(voc_etc2.apply(lambda x : tuple(x.keys())))#
# voc_enccc.apply(lambda x : ','.join([str(y) for y in x])).isin(voc_etc2.apply(lambda x : ','.join([str(y) for y in x.keys()])))


Unnamed: 0.1,index,Unnamed: 0,설문번호,아이디,조사시작시간,조사종료시간,병원코드,진료과 코드,의사코드,병동코드,VOC,pred
1,0,0,2018102201,R508198794131698367,2018-10-22 16:23,2018-10-22 16:25,37524895,4900,3752490000000000,,해당사항없음,[진료경험(진료/투약/검사/회진)_부정]
4,2,2,2018102201,R8646383274773023108,2018-10-22 19:38,2018-10-22 21:11,37524895,4900,3752490000000000,,"대기시간이짫다,","[대기시간_부정, 주차_부정]"
6,3,3,2018102201,R2293067998190858561,2018-10-22 21:28,2018-10-22 21:34,37524895,4900,3752490000000000,,만족함,[진료경험(진료/투약/검사/회진)_긍정]
7,3,3,2018102201,R2293067998190858561,2018-10-22 21:28,2018-10-22 21:34,37524895,4900,3752490000000000,,잘모러곘음,[주차_부정]
13,6,6,2018102201,R9693965189165712177,2018-10-23 15:13,2018-10-23 15:16,37524895,4900,3752490000000000,,없습니다,[병원시스템_부정]
...,...,...,...,...,...,...,...,...,...,...,...,...
994,497,497,2018110501,R1376130336939876174,2018-11-07 18:54,2018-11-07 19:08,37100017,600,3710000000000000,,없음,[병원시스템_부정]
996,498,498,2018110501,R2525668391215629643,2018-11-07 18:55,2018-11-07 19:00,37100017,101,3710000000000000,,..,[주차_부정]
997,498,498,2018110501,R2525668391215629643,2018-11-07 18:55,2018-11-07 19:00,37100017,101,3710000000000000,,..,[주차_부정]
998,499,499,2018110501,R9377744594635125576,2018-11-07 19:09,2018-11-07 19:12,37100017,500,3710000000000000,,..,[주차_부정]


In [6]:
!pip freeze > requirements.txt

In [93]:
voc_etc2 = pd.read_pickle(directory+'/voc_etc2.pkl')

In [42]:
voc1_sema = pd.read_excel('~/Downloads/voc1_sema.xlsx')
voc2_sema = pd.read_excel('~/Downloads/voc2_sema.xlsx')
voc_etc_list = list(set(voc1_sema[voc1_sema.토픽=='기타'].VOC12.tolist())) + list(set(voc2_sema[voc2_sema.토픽=='기타']['VOC2.1'].tolist()))
voc_etc_list2 = pd.Series((voc_etc_list+['전혀 없음','업ㅇㄷㅁ','그저 그렇다','그저 그럼','잘 모름', '대체로 만족합니다.', '대체로 만족합니다','별로 없음','잘해주시고계심','잘하세요','없네여','모름','없더군요'])).str.replace(r'\W','',regex=True)
voc_etc_list2

0                          괜찬았어요
1                               
2                        아픈것이해결됨
3      모두모두가서로를배려하다면점더좋은진료환경이되겠죠
4                 체계적인연계진료에감동받았다
                 ...            
333                      잘해주시고계심
334                         잘하세요
335                          없네여
336                           모름
337                         없더군요
Length: 338, dtype: object

In [56]:

voc_etc_list2.apply(lambda x : Counter(tokenizer.encode_plus(x,
                      add_special_tokens=True,
                      max_length=200,
                      return_token_type_ids=False,
                      truncation=True,
                      return_attention_mask=True,
                      return_tensors='pt')['input_ids'].numpy()[0][1:-1])).drop_duplicates().to_pickle('voc_etc2.pkl')


In [77]:
voc_etc2 = pd.read_pickle(directory+'/voc_etc2.pkl')

for file in running_files[~running_files.str.startswith('.')]:
  #input files 
  print('Reading : ' + file)
  voc_testset = pd.read_excel(directory+'/voc_data/' + file +'.xlsx',dtype=str)
  voc_testset['VOC1'] = voc_testset.VOC1.str.replace('\n',' ')
  voc_testset['VOC2'] = voc_testset.VOC2.str.replace('\n',' ')

  voc = pd.concat([voc_testset.VOC1,voc_testset.VOC2]).sort_index().values
  voc_testset = pd.concat([voc_testset]*2).sort_index().iloc[:,:-2]
  voc_testset['VOC'] = voc
  voc_testset['VOC'].fillna('',inplace=True)
  voc_testset['VOC'] = voc_testset['VOC'].apply(str)
  voc_testset.reset_index(inplace=True)
  voc_testset['label'] = pd.DataFrame(np.zeros((len(mlb.classes_),voc_testset.shape[0])).T).astype(int).apply(list, axis=1)

  #Setup
  print('Setting : ' + file)
  data_module = VOC_DataModule(voc_testset, voc_testset, tokenizer, batch_size=BATCH_SIZE, max_token_len=MAX_LEN)
  data_module.setup()
  trainer = pl.Trainer(max_epochs=N_EPOCHS, progress_bar_refresh_rate=3, )

  #Inference
  print('Inferencing : ' + file)
  testing_predict = trainer.predict(new_model, datamodule=data_module)
  sema_df_final = np.vstack(pd.Series(np.vstack(testing_predict)[:,1]).apply(lambda x : np.vstack(x.detach().cpu().clone().numpy())))
  pred_label = (sema_df_final>opt_thresh).astype(int)
  voc_testset['pred'] = pd.Series(mlb.inverse_transform(pred_label)).apply(list)
  del voc_testset['label']

  #기타
  print('Filtering ETC data : ' + file)
  
  testing = filter_etc(voc_testset)

  voc_testset.pred.loc[testing>0] = [[] for _ in range((testing>0).sum())]
  voc_testset = voc_testset.explode('pred',ignore_index=True)

  #키워드
  print('Extracting Keywords : ' + file)
  voc_testset['topic'] = voc_testset.pred.str.split('_').str[0]
  voc_testset['sentiment'] = voc_testset.pred.str.split('_').str[1]
  voc_testset.topic.fillna('기타',inplace=True)
  voc_testset['keyword'] = keyword.loc[voc_testset.topic].values
  voc_testset['keyword'] = voc_testset.apply(findall_vec2, axis=1)

  #save
  print('Saving output File : ' + file)
  try:
    voc_testset.fillna('').astype(str).to_excel(directory+'/output/' + file +'_output.xlsx',encoding='utf-8-sig',engine='openpyxl')
  except: 
    voc_testset.fillna('').astype(str).to_excel(directory+'/output/' + file +'_output.xlsx',encoding='utf-8-sig',engine='xlsxwriter')


Reading : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500
Setting : PEI솔루션_VOC응답(2018_2020년)_20210713_sample500


  rank_zero_warn(


MisconfigurationException: Selected distributed backend ddp is not compatible with an interactive environment. Run your code as a script, or choose one of the compatible backends: dp, ddp_spawn, ddp_sharded_spawn, tpu_spawn

In [88]:
from pytorch_lightning.accelerators import CPUAccelerator
from pytorch_lightning.plugins import NativeMixedPrecisionPlugin, DDPPlugin

accelerator = CPUAccelerator(training_type_plugin=DDPPlugin(),precision_plugin=NativeMixedPrecisionPlugin())
trainer = pl.Trainer(accelerator=accelerator)
