### TPU 사용하는 법
1) https://pytorch.org/xla/release/1.7/index.html<br>
2) https://pytorch.org/xla/release/1.7/index.html#xla-tensor-deep-dive<br><br><br>
### torchtext 사용하는 법
1) https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/A%20-%20Using%20TorchText%20with%20Your%20Own%20Datasets.ipynb<br><br>
#### 1) 이 코드를 전부 돌려보고 정상적으로 돌아가는지 확인한다 (stopwords 처리 안함)<br>
#### 2) stopwords를 삭제하고 나서, 1)번과 성능 차이를 본다<br>
#### 3) torchtext 버전으로 코드를 수정해보고, 또 stopwords 삭제도 해보면서 성능 차이를 확인한다.<br><br><br>
### References<br>
1) https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb

In [37]:
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

In [38]:
# import torch_xla
# import torch_xla.core.xla_model as xm

# # https://www.kaggle.com/tanlikesmath/the-ultimate-pytorch-tpu-tutorial-jigsaw-xlm-r

In [39]:
import numpy as np
import pandas as pd
import json
import pickle
import os, re, gensim
os.environ['XLA_USE_BF16'] = '1'
os.environ['XLA_TENSOR_ALLOCATOR_MAXSIZE'] = '100000000'
from nltk.corpus import stopwords
from collections import defaultdict
stop = stopwords.words('english')

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# device = xm.xla_device()

import warnings
warnings.filterwarnings('ignore')

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/posneg-for-cnn4sc/rt-polarity.pos
/kaggle/input/posneg-for-cnn4sc/rt-polarity.neg
/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin
/kaggle/input/cnn-word-vector-json/data.json
/kaggle/input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl


In [40]:
### word_vecs 만들어주기
## dict 자료형에, word와 wordvector가 함께 있어야 함, 코드 에러가 안나면 limit=500000으로 변경하기 
word_vecs = gensim.models.KeyedVectors.load_word2vec_format("/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin", binary=True, limit=700000)

In [41]:
## modified codes in https://github.com/yoonkim/CNN_sentence 
class preprocess:
    def __init__(self, word_vecs, save=False):
        self.k = 300     # the embedding dimension of pretrained vector noted at the paper
        self.revs = []
        self.vocab_size = 0
        self.max_len = 56 # the value what yoon used at his codes
        self.word_idx_map = dict()
        self.word_vecs = word_vecs
        self.stop = set(stopwords.words('english'))
        self.save = save

        
    def clean_str(self, string):
        ## string이 sentence로 들어와서, self.stop으로 못 거름...=_=;
        ## 우선 stopwords 안 거르는걸로 해서 함 돌려보자.
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
        string = re.sub(r"\'s", " \'s", string) 
        string = re.sub(r"\'ve", " \'ve", string) 
        string = re.sub(r"n\'t", " n\'t", string) 
        string = re.sub(r"\'re", " \'re", string) 
        string = re.sub(r"\'d", " \'d", string) 
        string = re.sub(r"\'ll", " \'ll", string) 
        string = re.sub(r",", " , ", string) 
        string = re.sub(r"!", " ! ", string) 
        string = re.sub(r"\(", " \( ", string) 
        string = re.sub(r"\)", " \) ", string) 
        string = re.sub(r"\?", " \? ", string) 
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower()
    
    
    def build_data_cv(self, cv=10):
        pos_file = "/kaggle/input/posneg-for-cnn4sc/rt-polarity.pos"
        neg_file = "/kaggle/input/posneg-for-cnn4sc/rt-polarity.neg"

        file_list = [pos_file, neg_file]    
        self.vocab = defaultdict(float)

        for file in file_list:
            with open(file, "rb") as f:
                for line in f: 
                    try:
                        line = line.decode("utf-8")
                    except UnicodeDecodeError:
                        line = line.decode('latin-1')

                    rev = []
                    rev.append(line.strip())
                    orig_rev = self.clean_str(" ".join(rev))
        
                    words = set(orig_rev.split())
                    
                    if len(words) > self.max_len:
                        self.max_len = len(words)
                    
                    for word in words:
                        try:
                            self.vocab[word] += 1
                            self.vocab_size += 1
                        except:
                            self.vocab[word] = 1
                            self.vocab_size += 1
                            
                    if file[-3:] == "pos":        
                        datum = {"y": 1,
                                "text": orig_rev,
                                "split": np.random.randint(0,cv)}
                        ## np.random.randint는 discrete uniform distribution
                        
                    elif file[-3:] == "neg":
                        datum = {"y": 0,
                                "text": orig_rev,
                                "split": np.random.randint(0,cv)}
                    self.revs.append(datum)
        
        return self.revs, self.vocab, self.max_len
    
    
    def add_unknown_words(self, min_df=1):
        cnt = 0
        for word in self.vocab:
            if word not in self.word_vecs and self.vocab[word] >= min_df:
                self.word_vecs[word] = np.random.uniform(-0.25,0.25,self.k)  
                self.vocab_size += 1
                cnt += 1
        print(cnt, ' of unknown words were here')          

    def get_W(self):
        self.revs, self.vocab, self.max_len = self.build_data_cv()
        self.add_unknown_words()
        self.W = np.zeros(shape=(self.vocab_size+1, self.k), dtype='float32')            
        self.W[0] = np.zeros(self.k, dtype='float32')
        i = 1
        for word in self.vocab:
            self.W[i] = self.word_vecs[word]
            self.word_idx_map[word] = i
            i += 1
            
        if self.save:
            self.save_file()
#         print('W shape:', self.W.shape)
        # word_idx_map은 dictionary 타입이다.
        return self.W, self.word_idx_map, self.revs, self.max_len
    
    def save_file(self):
        data = {'revs':self.revs,
               'w':self.W,
               'word_idx_map':self.word_idx_map,
               'vocab':self.vocab}
        
        pd.Series(data).to_json('data.json')
        print('making a json file completed!')

In [42]:
## when data.json doesn't exist 
# pre = preprocess(word_vecs, save=True)
# w, word_idx_map, revs, max_len = pre.get_W()

## if data.json exist
with open("/kaggle/input/cnn-word-vector-json/data.json") as json_file:
    files = json.load(json_file)
    
revs, w, word_idx_map, vocab = files['revs'], files['w'], files['word_idx_map'], files['vocab']

In [43]:
print('w type:',type(w))
print('revs type:',type(revs))
print('revs shape:',len(revs))

## word_vecs : dictionary형, 벡터를 보고프면 word_vecs['word']

w type: <class 'list'>
revs type: <class 'list'>
revs shape: 10662


### Build each Dataset

In [44]:
class Make_Dataset(TensorDataset):
    def __init__(self, word_idx_map, xy):
        '''
        self.xy : a sentence, string type
        '''
        super().__init__()
        self.xy = xy
        self.max_len = 56
        self.word_idx_map = word_idx_map
        
    def __len__(self):
        return len(self.xy)
        
    def __getitem__(self, idx):
        splitted_sentence = self.xy[idx]['text'].split()
        tmp = []
        for word in splitted_sentence:
            tmp.append(self.word_idx_map[word])
            
        if len(tmp) < self.max_len:
            for _ in range(len(tmp), self.max_len):
                tmp.append(0)
                
        if self.xy[idx]['y'] in [0,1]: 
            return {'input_ids':torch.LongTensor(tmp).flatten(),
                    'target':torch.tensor(self.xy[idx]['y'])}
    
        else:                                               # mr-dataset에서는 여기에 걸리는 케이스가 없음. 전부 label이 존재함.. 
#             print('wwhhhhhhhhhhaaaaaaaaaattt:',self.xy[idx]['y'])
            return {'input_ids':torch.LongTensor(tmp).flatten()}

### build a model

In [45]:
def Kfold_Split(revs, word_idx_map, test_fold_id):
    train = []
    test = []
    for datum in revs:
        if datum['split'] == test_fold_id:
            test.append(datum)
        else:
            train.append(datum)   

    train_dataset = Make_Dataset(word_idx_map=word_idx_map, xy=train)
    test_dataset = Make_Dataset(word_idx_map=word_idx_map, xy=test)
    
    print('train length:', len(train_dataset))
    print('test length:', len(test_dataset))
## 우선 코드 전체적으로 쭈욱 한 번 돌리고 나서, 여길 수정하자.
#     proper_batch_size = int(len(vocab)/10)
#     print('proper:',proper_batch_size)

    train_loader = DataLoader(train_dataset, batch_size=50, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_dataset, batch_size=50, shuffle=True, drop_last=True)
    return train_loader, test_loader

### pre-trained vectors를 google negative300.bin으로 넣어주는 방법
1) https://discuss.pytorch.org/t/expected-input-to-torch-embedding-layer-with-pre-trained-vectors-from-gensim/37029<br>
2) static과 non-static의 구분을 nn.Embedding.from_pretrained(freeze=True/False)로 해줌 : https://github.com/aisolab/nlp_classification/blob/master/Convolutional_Neural_Networks_for_Sentence_Classification/model/ops.py

In [46]:
class Cnn_Model(nn.Module):
    def __init__(self, W, word_vecs):
        super(Cnn_Model, self).__init__()
        self.n_filters = 100
        self.input_dim = 1   # 그냥 text니깐 1임. vision일 경우 3 
        self.word_vecs = torch.FloatTensor(word_vecs.vectors)
        
        self.W = torch.tensor(W)
        ## nn.embedding은 2-dim float tensor로 만들어지고,
        ## from_pretrained에서의 freeze는 기본적으로 True이다.
        self.embedding = nn.Embedding.from_pretrained(self.word_vecs, freeze=False)
        self.conv3_layer = nn.Conv2d(self.input_dim, self.n_filters, kernel_size=(3,300))
        self.conv4_layer = nn.Conv2d(self.input_dim, self.n_filters, kernel_size=(4,300))
        self.conv5_layer = nn.Conv2d(self.input_dim, self.n_filters, kernel_size=(5,300))
        
        ## 우선 filter_size를 3으로 줬을 때
        self.fc = nn.Linear(3*self.n_filters, 1) 
        self.dropout = nn.Dropout()
        
    def forward(self, x):
#         print('first x:', x.size())                  # 50, 56                
        x = self.embedding(self.W)                       # 50, 56, 300    
#         print('second x:', x.size())        
        x = x.unsqueeze(1)                          # 50, 1, 56,, 300                   
#         print('third x:', x.size())
        ## make a feature map for each filter
        f3 = F.relu(self.conv3_layer(x).squeeze(3))  # 50, 10, 54
        f4 = F.relu(self.conv4_layer(x).squeeze(3))  # 50, 10, 53
        f5 = F.relu(self.conv5_layer(x).squeeze(3))  # 50, 10, 52
#         print('fourth f3:', f3.size())
#         print('fourth f4:', f4.size())
#         print('fourth f5:', f5.size())

        x3 = F.max_pool1d(f3, f3.shape[2]).squeeze(2)  # 50, 10
        x4 = F.max_pool1d(f4, f4.shape[2]).squeeze(2)  # 50, 10
        x5 = F.max_pool1d(f5, f5.shape[2]).squeeze(2)  # 50, 10
#         print('fifth x3:', x3.size())
#         print('fifth x4:', x4.size())
#         print('fifth x5:', x5.size())
        
        ### a penultimate layer
        x = self.dropout(torch.cat((x3,x4,x5), dim=1))  # 50, 30
#         print('sixth x:', x.size())
#         x.size()  batch_size, n_filters*len(filter_sizes)

        output = self.fc(x)                          # 50, 30
#         print('seventh x:', x.size())   
        
        return output.squeeze()

In [47]:
## https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb
epochs = 25  # the value exists in original code
NON_STATIC = True
model = Cnn_Model(W=w, word_vecs=word_vecs)
model.to(device)

Cnn_Model(
  (embedding): Embedding(700000, 300)
  (conv3_layer): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
  (conv4_layer): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
  (conv5_layer): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1))
  (fc): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [48]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [49]:
# criterion = nn.BCELoss().to(device) 
# criterion = nn.NLLLoss().to(device) # NLL 썼음.. 
criterion = nn.BCEWithLogitsLoss().to(device) 
optimizer = optim.Adadelta(model.parameters(), rho=0.95, eps=1e-6, weight_decay=0.95)
# optimizer = optim.Adam(model.parameters())

for i in range(0,10):
    note = {}
    print('iiiiiiii:', i)
    train_loader, test_loader = Kfold_Split(revs, word_idx_map, test_fold_id=i)
      
    ## Training
    for epoch in range(epochs):
        check_loss = 0
        
        model.train()

        for idx, data in enumerate(train_loader):
            optimizer.zero_grad()

            outputs = model(data['input_ids'].to(device))
            loss = criterion(outputs, data['target'].type_as(outputs))
            
            acc = binary_accuracy(outputs, data['target'].type_as(outputs))
            
            loss.backward()   # 이걸로 W가 업뎃되나? 안될거 같은데. static/non-static설정 어케하나..
            optimizer.step()
#             xm.optimizer_step(self.optim)
#             check_loss += loss.item()
            if idx % 50 == 0:
                print('epoch:', epoch,' current acc:', acc)
#                 check_loss = 0

#         note['loss'] = check_loss
        print('training is done!')


    ## test 
    total = 1
    correct = 0
    model.eval()

    with torch.no_grad():
        for data in test_loader:
#             print('THIS IS THE DATA!:', data.size())
#             print()
            datas = data['input_ids'].to(device)
            print('datas size:', datas.size())  # 50.56
            outputs = model(datas)
            print('outputs size:', outputs.size()) # the value should be [50,]
            print('outputs unsq size:', outputs.unsqueeze(1))
            
            _, predicted = torch.max(outputs.data, 
                                     0)
            labels = data['target'].to(device)
            total += labels.size(0)
            print('label size0:', labels.size(0))
            correct += (predicted == labels).sum().item()

    note['accuray'] = 100 * (correct/total)
    print('test accuracy:', note['accuray'])

    ### 이렇게 구찮은거 말고, stratifiedKFold 써서
    ### 글구 필터도 한 방에 해서 갈 수 있는거 알아보자..
    # https://github.com/aisolab/nlp_classification/blob/master/Convolutional_Neural_Networks_for_Sentence_Classification/model/ops.py'
    
    
    ## w는 coefficient다

iiiiiiii: 0
train length: 9576
test length: 1086


TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list