In [0]:
# Extract my.zip (stored on Google Drive) into the '01 text_classification' Drive folder.
!unzip  /content/drive/'My Drive'/'01 text_classification'/'01 kaggle_google_textclassification'/data/my.zip -d /content/drive/'My Drive'/'01 text_classification'/


Archive:  /content/drive/My Drive/01 text_classification/01 kaggle_google_textclassification/data/my.zip
  inflating: /content/drive/My Drive/01 text_classification/glove.twitter.27B.25d.txt  
  inflating: /content/drive/My Drive/01 text_classification/glove.twitter.27B.50d.txt  
  inflating: /content/drive/My Drive/01 text_classification/glove.twitter.27B.100d.txt  
  inflating: /content/drive/My Drive/01 text_classification/glove.twitter.27B.200d.txt  


In [13]:
!pip install tensorboardX

Collecting tensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/35/f1/5843425495765c8c2dd0784a851a93ef204d314fc87bcc2bbb9f662a3ad1/tensorboardX-2.0-py2.py3-none-any.whl (195kB)
[K     |█▊                              | 10kB 23.8MB/s eta 0:00:01[K     |███▍                            | 20kB 31.8MB/s eta 0:00:01[K     |█████                           | 30kB 38.8MB/s eta 0:00:01[K     |██████▊                         | 40kB 44.1MB/s eta 0:00:01[K     |████████▍                       | 51kB 38.8MB/s eta 0:00:01[K     |██████████                      | 61kB 41.6MB/s eta 0:00:01[K     |███████████▊                    | 71kB 34.4MB/s eta 0:00:01[K     |█████████████▍                  | 81kB 35.0MB/s eta 0:00:01[K     |███████████████                 | 92kB 37.0MB/s eta 0:00:01[K     |████████████████▊               | 102kB 35.6MB/s eta 0:00:01[K     |██████████████████▍             | 112kB 35.6MB/s eta 0:00:01[K     |████████████████████            

In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

print(torch.cuda.is_available())   # 需要设置笔记本从而打开GPU

True


### 模型定义(TextCNN)

#### 定义需要的layers

**线性层定义与封装**
$y=x A^{T}+b$

其中$A$是参数矩阵(weight parameters),$b$是bias parameters

pytorch提供了这个类torch.nn.Linear,该类有2个成员变量weight和bias

总结：

**基本网络模块的必要步骤：**
* 初始化，主要是所依托的其他模块的定义，同时调用参数初始化函数
* 参数初始化成员函数定义
* 前向传播过程定义

In [0]:
class Linear(nn.Module):
  """Fully connected layer ``y = x A^T + b`` with explicit parameter initialisation.

  Args:
    in_dim: size of each input feature vector.
    out_dim: size of each output feature vector.
  """

  def __init__(self, in_dim, out_dim):
    super().__init__()
    self.linear = nn.Linear(in_features=in_dim, out_features=out_dim, bias=True)
    # Overwrite the default nn.Linear initialisation with our own scheme.
    self.init_params()

  def init_params(self):
    """Initialise the weight with Kaiming-normal values and the bias with zeros."""
    layer = self.linear
    nn.init.kaiming_normal_(layer.weight)
    nn.init.zeros_(layer.bias)

  def forward(self, x):
    """Apply the affine transformation to ``x`` and return the result."""
    return self.linear(x)

#### 封装一维卷积层（用于文本）
参数数量：
* weights: out_channels,in_channels,kernel_size
* bias: out_channels
* * *
* convolution operation:
filter: $\mathbf{w} \in \mathbb{R}^{h k}$
该卷积核应用于h个words去产生新的feature。

* * *
* **特征计算的数学表示：**
$c_{i}=f\left(\mathbf{w} \cdot \mathbf{x}_{i: i+h-1}+b\right)$
这里句子的窗口大小是h：窗口内的词向量与卷积核相乘，再加上偏置b，最后通过f进行激活；f是一个非线性函数(non-linear function)，例如双曲正切(hyperbolic tangent, tanh)。
* 从参数的角度考虑，这里默认输出通道为1
* * *

* **窗口进行滑动**
取遍所有窗口：对k维词向量构成的所有窗口$\left\{\mathbf{x}_{1: h}, \mathbf{x}_{2: h+1}, \dots, \mathbf{x}_{n-h+1: n}\right\}$进行卷积操作，得到$\mathbf{c}=\left[c_{1}, c_{2}, \ldots, c_{n-h+1}\right]$，这里称为feature map。
$\mathbf{c} \in \mathbb{R}^{n-h+1}$： 卷积后的特征维度

* * *

* 进行max-over-time pooling operation操作，得到最大值$\hat{c}=\max \{\mathbf{c}\}$
**背后的思想：捕获feature map中最重要的特征，这种策略自然解决了句子的长度变化问题**

In [0]:
# NOTE: always check what a built-in module actually does before wrapping it.
# nn.Conv1d only performs the convolution (no activation), so forward() below
# applies ReLU to the convolved features itself.
class Conv1d(nn.Module):
  """Bank of parallel 1-D convolutions, one per kernel size, each followed by ReLU.

  Args:
    in_channels: number of input channels of every convolution.
    out_channels: number of output channels of every convolution.
    kernel_size_list: iterable of kernel sizes; one nn.Conv1d is created per entry.
  """
  def __init__(self,in_channels,out_channels,kernel_size_list):
    super(Conv1d,self).__init__()
    self.convs = torch.nn.ModuleList()
    for size in kernel_size_list: # one convolution module per kernel size
      self.convs.append(nn.Conv1d(in_channels,out_channels,size))
    # Bug fix: this used to be `self.init_params` (no parentheses), which only
    # referenced the method and never ran the custom initialisation.
    self.init_params()

  def init_params(self):
    """Xavier-uniform initialise every conv weight and set every bias to 0.1."""
    for m in self.convs:
      nn.init.xavier_uniform_(m.weight)
      nn.init.constant_(m.bias,0.1)

  def forward(self,x):
    """Run every convolution on ``x`` and return the ReLU-activated results.

    Returns:
      A list with len(kernel_size_list) tensors, one feature map per convolution.
    """
    return [F.relu(conv(x)) for conv in self.convs]
  

In [0]:
# Smoke-test the Conv1d wrapper.
# input : (batch_num, in_channels, length)
# output: one tensor per kernel size, each (batch_num, out_channels, computed length)
myConv = Conv1d(100,1,[2,5])
input = torch.randn(20,100,10)  # batch_num, in_channels, length
output = myConv(input)
print(len(output), type(output), sep="\n")
for i in range(2):
  print(type(output[i]), output[i].shape, sep="\n")

2
<class 'list'>
<class 'torch.Tensor'>
torch.Size([20, 1, 9])
<class 'torch.Tensor'>
torch.Size([20, 1, 6])


In [0]:
# F.max_pool1d(input, kernel_size): the first argument is the data to pool, the
# second is the pooling window size. A batch dimension is added first so the
# input is 3-D (batch, channels, length).
# torch.cat() can be seen as an inverse operation for torch.split() and torch.chunk()
test_tensor = torch.tensor([[4., -5., 100], [1., -1., 1000]])
print(test_tensor.shape)
test_tensor = test_tensor.unsqueeze(0)  # prepend the batch dimension
print(test_tensor.shape)
# Pool over the whole last dimension, leaving a single value per channel.
output = F.max_pool1d(test_tensor, kernel_size=test_tensor.size(2))
print(output)
output = output.squeeze(2)  # drop the pooled dimension, which now has size 1
print(output.shape)
print(output)

torch.Size([2, 3])
torch.Size([1, 2, 3])
tensor([[[ 100.],
         [1000.]]])
torch.Size([1, 2])
tensor([[ 100., 1000.]])


In [0]:
# Concatenation demo: dim selects the axis along which tensors are joined, and
# it is the only axis whose size changes in the result.
test_tensor1 = torch.tensor([[4., -5., 100], [1., -1., 1000]])
test_tensor2 = torch.tensor([[1., 2., 3.], [4., 5., 6.]])
test_list = [test_tensor1, test_tensor2]
res1 = torch.cat(test_list, 0)
print(res1)
print(res1.shape)  # dimension 0 grew from 2 to 4

tensor([[   4.,   -5.,  100.],
        [   1.,   -1., 1000.],
        [   1.,    2.,    3.],
        [   4.,    5.,    6.]])
torch.Size([4, 3])


### 定义最终的TextCNN模型

**通道数的个人理解**

--可以理解为特征的维度，比如对文本进行卷积，初始输入通道数就是embedding_dim,输出通道数则可以用户指定，输出通道数越大代表保存的抽取的特征越丰富。

In [0]:
# This TextCNN model uses a single convolutional layer to extract features.
class TextCNN(nn.Module):
  """Single-layer TextCNN: embedding -> parallel 1-D convs -> max-over-time pool -> linear.

  Args:
    embedding_dim: width of the word vectors; used as the conv input channels.
    out_channels: number of feature maps produced per kernel size.
    kernel_size_list: kernel sizes (window widths in tokens) of the parallel convs.
    output_dim: size of the final linear layer's output (number of classes).
    dropout: dropout probability applied to the pooled features.
    pretrained_embeddings: 2-D float tensor used to initialise the embedding
      table; it stays trainable because freeze=False.
  """
  def __init__(self,embedding_dim,out_channels,kernel_size_list,
               output_dim,dropout,pretrained_embeddings):
    super(TextCNN,self).__init__()
    # freeze=False: the embedding weights keep being updated during training.
    self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings,freeze=False)
    self.conv = Conv1d(embedding_dim,out_channels,kernel_size_list) # extract text features
    # A convolution needs at least kernel_size positions; remember the largest
    # kernel so forward() can pad sentences that are shorter than that.
    self.min_sent_len = max(kernel_size_list)
    input_dim = len(kernel_size_list)*out_channels  # input dim of linear layer
    self.linear = Linear(in_dim=input_dim,out_dim=output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self,x):
    """Compute class scores for a batch.

    Args:
      x: a pair whose first element is a LongTensor of token ids shaped
        (sent_len, batch_size); the second element is ignored.

    Returns:
      Tensor of shape (batch_size, output_dim) holding the raw output of the
      linear layer (no activation is applied here).
    """
    text,_ = x
    text = text.permute(1,0) # text:(batch_size,sent_len)
    embeded = self.embedding(text) # (batch_size,sent_len,embedding_dim)
    embeded = embeded.permute(0,2,1) # (batch_size,embedding_dim,sent_len)
    # Robustness fix: right-pad with zeros when the batch is shorter than the
    # largest kernel, otherwise nn.Conv1d raises on such inputs.
    shortfall = self.min_sent_len - embeded.shape[2]
    if shortfall > 0:
      embeded = F.pad(embeded,(0,shortfall))
    conv1ded = self.conv(embeded)  # list with one feature map per kernel size
    # Max-over-time pooling: pool across the whole length dimension (dim 2) of
    # every feature map, then drop that dimension.
    pooled = [F.max_pool1d(conv,conv.shape[2]).squeeze(dim=2) for conv in conv1ded]
    final = torch.cat(pooled,dim=1) # (batch_size,out_channels*len(kernel_size_list))
    droped = self.dropout(final)
    z = self.linear(droped)  # linear projection only, no activation
    return z          # [batch_size,output_dim]

In [0]:
def info(var):
  """Print the ``shape`` attribute and the Python type of ``var``, one per line."""
  for label, value in (("shape", var.shape), ("type", type(var))):
    print("{}:{}".format(label, value))

### Test TextCNN model

In [0]:
torch.manual_seed(5)   # fix the random number generator
# Fake tokenised text: integer token ids for 2 sentences of 4 tokens each,
# transposed to (sent_len, batch_size) = (4, 2), the layout TextCNN.forward expects.
text = torch.LongTensor([[1,2,4,5],[4,3,2,9]]).permute(1,0)
label = torch.LongTensor([1,0])
x = (text,label)
# Fake pretrained embedding table of shape [num, embedding_dim] = [10, 3].
# The largest token id above is 9, so at least 10 rows are needed.
pretrained_embedding = torch.FloatTensor(
    [[1, 2.3, 3]] + [[row_id, 5.1, 6.3] for row_id in range(2, 11)])
textcnn = TextCNN(
    embedding_dim=3,
    out_channels=2,
    kernel_size_list=[2,3],
    output_dim=1,
    dropout=0.5,
    pretrained_embeddings=pretrained_embedding)
output = textcnn(x)
info(output)

shape:torch.Size([2, 1])
type:<class 'torch.Tensor'>


In [5]:
def get_device():
    """Pick the compute device and report how many CUDA devices are visible.

    Returns:
        (device, n_gpu): ``torch.device("cuda:0")`` when CUDA is available,
        otherwise ``torch.device("cpu")``, plus ``torch.cuda.device_count()``.
    """
    n_gpu = torch.cuda.device_count()
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print("device is cuda, # cuda is: ", n_gpu)
    else:
        device = torch.device("cpu")
        print("device is cpu, not recommend")
    return device, n_gpu
device, n_gpu = get_device()
print(device)
print(n_gpu)

device is cuda, # cuda is:  1
cuda:0
1


In [6]:
# Load the Kaggle training CSV from Google Drive into a DataFrame.
train_data = pd.read_csv('/content/drive/My Drive/01 text_classification/01 kaggle_google_textclassification/data/train.csv') 
print(train_data.shape)   # the training data has only 7613 rows



(7613, 5)


### 参考资料
[01 TorchText用法示例及完整代码](https://blog.csdn.net/nlpuser/article/details/88067167)

[02 A Comprehensive Introduction to Torchtext (Practical Torchtext part 1)](
http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/)

[03 Language modeling tutorial in torchtext (Practical Torchtext part 2)](http://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/)

[04 官方文档](https://pytorch.org/text/)

[05 torchtext读取json文件（引起重视）](https://blog.csdn.net/weixin_43896398/article/details/85559172)

[06 json格式转tsv与csv格式](https://blog.csdn.net/qq_41868948/article/details/81008520)

### 使用torchtext构建数据集

In [0]:
from torchtext import data
from torchtext.vocab import Vectors
from tqdm import tqdm

**torchtext.vocab:**Defines a vocabulary object that will be used to numericalize a field

**torchtext.data.Field:**Defines a datatype together with instructions for converting to Tensor.(定义数据类型以及转换为张量的指令)
* It holds a Vocab object that defines the set of possible values for elements of the field and their corresponding numerical representations（Field中包含一个Vocab对象，该对象定义了元素可能值的集合以及对应的数值表示）
* 该对象同时包含tokenization method以及转换后tensor的数据类型



In [8]:
# NOTE(review): fix_length is assigned here but the Field below is created with
# fix_length=None, so this value is not used in this cell.
fix_length = 100
# Paths of the training and test data
root_dir = '/content/drive/My Drive/01 text_classification/01 kaggle_google_textclassification/data/'
path1 = root_dir+'train.csv'
path2 = root_dir+'test.csv'
# TEXT handles the tweet text column, LABEL handles the target column
TEXT = data.Field(tokenize='spacy', lower=True, include_lengths=True, fix_length=None) 
# Debug note 1: the sentence length is not fixed
LABEL = data.LabelField(dtype=torch.long)
# A field mapped to None is skipped when the CSV is read.
fields = [("id",None),("keyword",None),("location",None),("text",TEXT),("target",LABEL)]
raw_dataset = data.TabularDataset(path=path1, format="csv", fields=fields, skip_header=True)
train_dataset,dev_dataset = raw_dataset.split(split_ratio=[0.9,0.1])   # split into training and validation sets
# The test CSV has no target column, so only the text field is kept.
fields = [("id",None),("keyword",None),("location",None),("text",TEXT)]
test_dataset = data.TabularDataset(path=path2, format="csv", fields=fields, skip_header=True)
print("the size of train: {}, dev:{}, test:{}".format(len(train_dataset.examples), len(dev_dataset.examples),len(test_dataset.examples)))

the size of train: 6852, dev:761, test:3263


### 使用建立好数据集建设vocab,用于将文本转换成数字序列

In [0]:
embedding_file = '/content/drive/My Drive/01 text_classification/glove.twitter.27B.200d.txt'
cache_dir = '/content/drive/My Drive/01 text_classification/cache'
vectors = Vectors(embedding_file, cache_dir)  # load the pretrained word vectors
# Build the text vocabulary: every example of the given datasets is scanned and
# the TEXT tokens are registered, giving word->index and index->word mappings.
TEXT.build_vocab(train_dataset,dev_dataset,test_dataset,vectors=vectors,unk_init=torch.Tensor.normal_)
# Optional max_size: dictates how many words are in the vocabulary (caps its size)
# Optional min_freq: minimum frequency a word needs to enter the vocabulary
# Build the label vocabulary (the author suspects this step may be unnecessary)
LABEL.build_vocab(train_dataset,dev_dataset)  

### Creating an Iterator to pass the data to our model(建立一个迭代器用于模型训练，batch的划分就在这步)

In [0]:
# Random seed
# NOTE(review): no visible cell passes this value to a seeding function — confirm.
seed = 3456
# Sentence length, not used here
# sequence_length= 100
# Path where the trained model is saved
model_file='/content/sample_data/model_saved/'+'model1.pt'
# Root directory of the log files that hold the plotting data
log_dir='/content/sample_data/log'
# Whether to run training.
do_train = True
# Number of training steps between two evaluation / logging / checkpoint rounds
print_step=10
# Optimisation parameters
dropout=0.4
epoch_num=2
batch_size = 64
# Model parameters
output_dim=2
# TextCNN parameters
out_channels = 200       
kernel_size_list=[2,3]
# word Embedding
embedding_dim = 200 
glove_word_size = 1.2e6   # number of words in the pretrained vocabulary

In [11]:

device, n_gpu = get_device()
# Train/dev batches: examples are sorted by text length within each batch and
# the batch order is shuffled.
train_iter, dev_iter = data.BucketIterator.splits(
        (train_dataset,dev_dataset), batch_sizes=(batch_size,batch_size), 
        sort_key=lambda x: len(x.text), 
        sort_within_batch=True, 
        repeat=False, shuffle=True, 
        device=device)
# Test batches must keep the row order of test.csv, because predictions are
# later paired with the ids read from that file. data.Iterator defaults to
# train=True, which makes shuffle default to True, so both are switched off
# explicitly here.
test_iter = data.Iterator(test_dataset,batch_size=64,device=device,
                          train=False,shuffle=False,
                          sort=False,sort_within_batch=False,repeat=False)

device is cuda, # cuda is:  1


In [0]:
# Peek at one training batch to see what the iterator yields.
batch = next(iter(train_iter))
print(batch)
text,label=batch.text,batch.target
print(len(text))
print(text[1])
print(text[0])   
# Open question from the author: what is the second element of the `text` tuple?
# NOTE(review): TEXT was created with include_lengths=True, so text[1] is
# presumably the per-example sequence length — confirm against the torchtext docs.


[torchtext.data.batch.Batch of size 64]
	[.text]:('[torch.cuda.LongTensor of size 14x64 (GPU 0)]', '[torch.cuda.LongTensor of size 64 (GPU 0)]')
	[.target]:[torch.cuda.LongTensor of size 64 (GPU 0)]
2
tensor([14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
        14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
        14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
        14, 14, 14, 14, 14, 14, 14, 13, 13, 13], device='cuda:0')
tensor([[  144,    23,    20,   119,    14,   144,    60,     3,   174,   773,
          4087,   104,   378,    11,   623,     4,     3,   217,     2,     3,
           180,  8773,    87,   444, 11817,   128,   118,  6583, 10224,     3,
          9586,   933,   135,   270,  4229,  9925,    53,   233,   180,     3,
           654,   107,     3,     7,    81, 28619,    46,  1267,   144,   144,
           933,   632, 11475,   144,    53,   319, 11969, 10727,  6602, 14288,
           386,   

**注意：测试集的数据顺序不能被改变（不能排序或打乱），因为比赛中需要按原始顺序提交预测结果**

### 模型训练

In [0]:
import torch.optim as optim
from tensorboardX import SummaryWriter
import time
from sklearn import metrics

In [16]:
# Word vectors aligned with the TEXT vocabulary; they initialise the embedding layer.
pretrained_embeddings = TEXT.vocab.vectors # torch.Tensor [29257,200]
info(pretrained_embeddings)

# Build the classifier from the hyper-parameters defined in the config cell.
model=TextCNN(embedding_dim,out_channels,kernel_size_list,output_dim,
               dropout,pretrained_embeddings)
# Adam with its default settings; cross-entropy loss over the model's output scores.
optimizer = optim.Adam(model.parameters())  
criterion = nn.CrossEntropyLoss()      

shape:torch.Size([29257, 200])
type:<class 'torch.Tensor'>


In [0]:
def classifiction_metric(preds, labels, label_list):
    """Compute accuracy and a per-class report for a classification task.

    Args:
        preds: predicted class ids (numpy array).
        labels: ground-truth class ids (numpy array).
        label_list: class names; class id ``i`` is reported as ``label_list[i]``.

    Returns:
        (acc, report): the accuracy score and the dict produced by
        ``metrics.classification_report(..., output_dict=True)``.
    """
    acc = metrics.accuracy_score(labels, preds)
    class_ids = list(range(len(label_list)))
    report = metrics.classification_report(
        labels,
        preds,
        labels=class_ids,
        target_names=label_list,
        digits=5,
        output_dict=True,
    )
    return acc, report
def evaluate(model, iterator, criterion, label_list):
    """Run ``model`` over ``iterator`` without gradients and score the predictions.

    The model is switched to eval mode and is left in that mode on return.

    Returns:
        (mean_loss, acc, report): the loss averaged over the number of batches,
        plus the accuracy and report from ``classifiction_metric``.
    """
    model.eval()
    num_classes = len(label_list)
    total_loss = 0
    # Seed both lists with an empty int array so the concatenation below also
    # works when the iterator yields no batches.
    pred_chunks = [np.array([], dtype=int)]
    label_chunks = [np.array([], dtype=int)]
    # Inference only: gradient tracking is disabled for the whole pass.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text)
            loss = criterion(logits.view(-1, num_classes), batch.target)

            label_chunks.append(batch.target.detach().cpu().numpy().ravel())
            pred_chunks.append(
                np.argmax(logits.detach().cpu().numpy(), axis=1).ravel())
            total_loss += loss.item()

    all_preds = np.concatenate(pred_chunks)
    all_labels = np.concatenate(label_chunks)
    acc, report = classifiction_metric(all_preds, all_labels, label_list)
    return total_loss/len(iterator), acc, report

In [0]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)


In [0]:
# Aliases used by the training loop.
train_dataloader=train_iter
dev_dataloader=dev_iter 
label_list=['0', '1']  # label names

In [23]:
# Create the checkpoint directory up front so that the first torch.save() below
# cannot fail on a fresh runtime where the directory does not exist yet.
checkpoint_dir = os.path.dirname(model_file)
if checkpoint_dir:
  os.makedirs(checkpoint_dir, exist_ok=True)

model.train()
# One TensorBoard run directory per launch, named after the current UTC time.
writer = SummaryWriter(
        log_dir=log_dir + '/' + time.strftime('%H:%M:%S', time.gmtime()))
global_step = 0
best_dev_loss = float('inf') # best validation loss seen so far
for epoch in range(epoch_num):
  print(f'---------------- Epoch: {epoch+1:02} ----------')
  epoch_loss = 0
  train_steps = 0

  # Predictions and labels accumulated since the start of this epoch.
  all_preds = np.array([], dtype=int)
  all_labels = np.array([], dtype=int)
  for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
    optimizer.zero_grad()
    logits=model(batch.text) 
    # Reshape the scores to (batch_num, class_num) so the criterion can be applied.
    loss = criterion(logits.view(-1, len(label_list)), batch.target)

    # Keep the true labels and the predicted labels for the running metrics.
    labels = batch.target.detach().cpu().numpy()
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    # Parameter update.
    loss.backward()
    optimizer.step()
    global_step += 1

    epoch_loss += loss.item() # plain Python number
    train_steps += 1

    all_preds = np.append(all_preds, preds)
    all_labels = np.append(all_labels, labels)

    if global_step % print_step == 0:
      # Running training loss and metrics for the current epoch so far.
      train_loss = epoch_loss / train_steps
      train_acc, train_report = classifiction_metric(
                    all_preds, all_labels, label_list)
      
      # Validate on the dev set (evaluate() switches the model to eval mode).
      dev_loss, dev_acc, dev_report = evaluate(
                    model, dev_dataloader, criterion, label_list)

      # Log the plotting data; c is the index of this logging round.
      c = global_step // print_step

      writer.add_scalar("loss/train", train_loss, c)
      writer.add_scalar("loss/dev", dev_loss, c)

      writer.add_scalar("acc/train", train_acc, c)
      writer.add_scalar("acc/dev", dev_acc, c)

      # Per-class F1 scores.
      for label in label_list:
        writer.add_scalar(label + ":" + "f1/train",
                    train_report[label]['f1-score'], c)
        writer.add_scalar(label + ":" + "f1/dev",
                    dev_report[label]['f1-score'], c)
        
      # Averaged F1 scores.
      print_list = ['macro avg', 'weighted avg']
      for label in print_list:
        writer.add_scalar(label + ":" + "f1/train",train_report[label]['f1-score'], c)
        writer.add_scalar(label + ":" + "f1/dev",dev_report[label]['f1-score'], c)

      # Checkpoint only when the validation loss improves.
      if dev_loss < best_dev_loss:
        best_dev_loss = dev_loss
        torch.save(model.state_dict(),model_file)
        print("new model saved")
      # Back to training mode after the evaluation pass.
      model.train()
writer.close()
      


Iteration:   0%|          | 0/108 [00:00<?, ?it/s][A

---------------- Epoch: 01 ----------



Iteration:   1%|          | 1/108 [00:00<00:52,  2.06it/s][A
Iteration:   9%|▉         | 10/108 [00:00<00:33,  2.89it/s][A
Iteration:  19%|█▊        | 20/108 [00:00<00:21,  4.06it/s][A

new model saved
new model saved



Iteration:  28%|██▊       | 30/108 [00:01<00:15,  5.20it/s][A
Iteration:  38%|███▊      | 41/108 [00:01<00:09,  7.27it/s][A

new model saved



Iteration:  46%|████▋     | 50/108 [00:02<00:06,  8.69it/s][A

new model saved



Iteration:  56%|█████▌    | 60/108 [00:02<00:04,  9.81it/s][A
Iteration:  65%|██████▍   | 70/108 [00:02<00:02, 13.36it/s][A

new model saved



Iteration:  74%|███████▍  | 80/108 [00:03<00:01, 18.02it/s][A
Iteration:  83%|████████▎ | 90/108 [00:03<00:00, 23.84it/s][A
Iteration:  93%|█████████▎| 100/108 [00:03<00:00, 28.37it/s][A
Iteration: 100%|██████████| 108/108 [00:03<00:00, 31.62it/s][A
Iteration:   0%|          | 0/108 [00:00<?, ?it/s][A
Iteration:   8%|▊         | 9/108 [00:00<00:01, 85.48it/s][A

new model saved
---------------- Epoch: 02 ----------



Iteration:  19%|█▊        | 20/108 [00:00<00:00, 91.03it/s][A
Iteration:  30%|██▉       | 32/108 [00:00<00:00, 89.74it/s][A
Iteration:  39%|███▉      | 42/108 [00:00<00:00, 85.94it/s][A

new model saved



Iteration:  48%|████▊     | 52/108 [00:01<00:01, 35.37it/s][A
Iteration:  59%|█████▉    | 64/108 [00:01<00:00, 44.80it/s][A

new model saved



Iteration:  71%|███████▏  | 77/108 [00:01<00:00, 55.46it/s][A
Iteration:  80%|███████▉  | 86/108 [00:01<00:00, 34.35it/s][A
Iteration:  90%|████████▉ | 97/108 [00:01<00:00, 43.08it/s][A

new model saved



Iteration:  98%|█████████▊| 106/108 [00:02<00:00, 50.86it/s][A
Iteration: 100%|██████████| 108/108 [00:02<00:00, 52.12it/s][A

#### 使用模型进行验证

[pytorch中detach的用法](https://www.cnblogs.com/jiangkejie/p/9981707.html)

[变量类型(cpu/gpu)](https://blog.csdn.net/g11d111/article/details/80896137)



In [24]:
test_preds = np.array([], dtype=int)
# Load the best checkpoint; `model` must already be constructed with the same
# architecture. map_location places the weights on the device in use.
model.load_state_dict(torch.load(model_file, map_location=device))
# Bug fix: switch to eval mode before predicting. The training cell leaves the
# model in train mode, so dropout would otherwise stay active and the
# predictions would be randomly perturbed.
model.eval()
with torch.no_grad(): # inference only, no gradient tracking
  for batch in tqdm(test_iter):
    logits = model(batch.text)
    # Move the scores to the CPU, convert to numpy, and take the arg-max class.
    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    test_preds = np.append(test_preds, preds)

# detach(分开，脱离)：Returns a new Tensor, detached from the current graph.
# The result will never require gradient.


  0%|          | 0/51 [00:00<?, ?it/s][A
100%|██████████| 51/51 [00:00<00:00, 524.37it/s][A

In [26]:
print(test_preds.shape)   # the test set has 3263 rows
# Fix: report the type of the test predictions; the original printed the type
# of `all_preds`, a leftover variable from the training loop.
print(type(test_preds))
path2 = root_dir+'test.csv'
test_data = pd.read_csv(path2)
# Every row of test.csv needs exactly one prediction before a submission is built.
assert len(test_preds) == len(test_data), "prediction count does not match test.csv"
test_data.head()

(3263,)
<class 'numpy.ndarray'>


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [27]:
# Pair every test id with its predicted class and write the submission file.
submission_columns = {'id': test_data.id, 'target': test_preds}
submission_df = pd.DataFrame(submission_columns)
print(submission_df.head())
# Comma separator and a header row are the to_csv defaults; only the index is dropped.
submission_df.to_csv('res.csv', index=False)

   id  target
0   0       1
1   2       1
2   3       0
3   9       0
4  11       1
