In [1]:
import os
import random
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim

# 一些工具的操作

## vscode
### 几个小技巧
1. 查看函数或者类的定义
`Ctrl`+`鼠标左键`点击函数名或者类名即可跳转到定义处，在函数名或者类名上按`F12`也可以实现同样功能


2. 命名重构：
在变量名上按`F2`即可实现重命名变量


3. 方法重构:
选中某一段代码，这个时候，代码的左侧会出现一个「灯泡图标」，点击这个图标，就可以把这段代码提取为一个单独的函数


4. python断点调试:
在行号的左边点击即可设置断点，在左边的调试界面可以查看变量的变化


5. 函数在哪被调用了：
选中`函数`（或者将光标放置在`函数`上），然后按住快捷键「Shift + F12」，就能看到`函数`在哪些地方被调用了，比较实用。

## jupyter notebook

### 恢复原来写过的代码
场景：在某个窗口写了很多代码，又删除了很多单元格，想找回原来的代码。

解决方法：直接在一个单元格中写入`history`就会展示出历史代码（前提是你运行过的，否则不会打印出来）

## win10的一些技巧
- Sticky Note：Go to the Windows Ink Workspace  > Sticky Notes to create reminders for yourself. 
- Stay focused：Select and hold the window you want to stay open, then give your mouse (or finger) a little back-and-forth shake.

# Pytorch的一些知识

## 在理解模型时一些有效的方法

### net.parameters()

In [3]:
class Generator(nn.Module):
    """LSTM generator: embedding -> single-layer LSTM -> linear -> log-softmax.

    Args:
        num_emb: vocabulary size (rows of the embedding table and number of output classes).
        emb_dim: size of each embedding vector fed to the LSTM.
        hidden_dim: LSTM hidden size.
        use_cuda: kept for backward compatibility and stored on the instance.
            The initial hidden state now follows the device of the module's
            own parameters, so a stale or misspelled flag can no longer put
            the hidden state on a different device than the weights.
    """

    def __init__(self, num_emb, emb_dim, hidden_dim, use_cuda):
        super(Generator, self).__init__()
        self.num_emb = num_emb
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.use_cuda = use_cuda
        self.emb = nn.Embedding(num_emb, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.lin = nn.Linear(hidden_dim, num_emb)
        # dim=1 is the class axis of the flattened (batch * seq_len, num_emb) scores.
        self.softmax = nn.LogSoftmax(dim=1)
        self.init_params()

    def forward(self, x):
        """Return log-probabilities of shape (batch * seq_len, num_emb).

        Args:
            x: LongTensor of token ids, shape (batch, seq_len).
        """
        emb = self.emb(x)
        h0, c0 = self.init_hidden(x.size(0))
        output, _ = self.lstm(emb, (h0, c0))
        # Flatten the batch and time axes so the linear layer scores every step.
        flat = output.contiguous().view(-1, self.hidden_dim)
        return self.softmax(self.lin(flat))

    def init_hidden(self, batch_size):
        """Return zero-filled (h0, c0), each of shape (1, batch_size, hidden_dim)."""
        weight = self.emb.weight
        shape = (1, batch_size, self.hidden_dim)
        # Allocate directly on the parameters' device/dtype instead of creating
        # CPU tensors and copying them over afterwards.
        h = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
        c = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
        return h, c

    def init_params(self):
        """Re-initialise every parameter uniformly in [-0.05, 0.05)."""
        with torch.no_grad():
            for param in self.parameters():
                param.uniform_(-0.05, 0.05)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA. The flag is a real boolean now
# (the previous value was the misspelled, always-truthy string 'Ture').
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
generator = Generator(num_emb = 5000,emb_dim = 128,hidden_dim = 64,use_cuda = device.type == 'cuda')
generator = generator.to(device)
# Two sequences of three token ids each -> input of shape (2, 3).
x = torch.LongTensor([[2,50,100],
                       [40,3,1000]]).to(device)
pred = generator(x)
params = list(generator.parameters())
len(params)

7

In [4]:
# List every learnable tensor of the generator together with its shape.
for name,parameters in generator.named_parameters():
    print(name,':',parameters.size())
    # Use parameters.data when the raw values are needed.

emb.weight : torch.Size([5000, 128])
lstm.weight_ih_l0 : torch.Size([256, 128])
lstm.weight_hh_l0 : torch.Size([256, 64])
lstm.bias_ih_l0 : torch.Size([256])
lstm.bias_hh_l0 : torch.Size([256])
lin.weight : torch.Size([5000, 64])
lin.bias : torch.Size([5000])


In [5]:
pred

tensor([[-8.5687, -8.4996, -8.4787,  ..., -8.4956, -8.5581, -8.5530],
        [-8.5707, -8.5006, -8.4792,  ..., -8.4950, -8.5617, -8.5544],
        [-8.5721, -8.5014, -8.4795,  ..., -8.4947, -8.5635, -8.5533],
        [-8.5691, -8.4994, -8.4794,  ..., -8.4967, -8.5575, -8.5529],
        [-8.5701, -8.5013, -8.4791,  ..., -8.4966, -8.5608, -8.5534],
        [-8.5718, -8.5014, -8.4790,  ..., -8.4964, -8.5636, -8.5544]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward>)

## TensorDataset()

In [5]:
x = torch.linspace(1,10,10) # linspace(start, end, steps): 10 evenly spaced values from 1 to 10 (steps is the number of points, not a stride)
y = torch.linspace(10,1,10)

import torch.utils.data as Data
# TensorDataset pairs x[i] with y[i]; the loader then serves shuffled batches of 5 pairs.
torch_dataset = Data.TensorDataset(x,y)
loader = Data.DataLoader(
    dataset = torch_dataset,
    batch_size = 5,
    shuffle = True,
    num_workers = 4)

# 10 samples / batch_size 5 -> two steps per epoch.
for epoch in range(3):
    for step,(batch_x,batch_y) in enumerate(loader):
        print('Epoch:',epoch,'|Step:',step,'|batch_x:',batch_x.numpy(),
             '|batch_y:',batch_y.numpy())

Epoch: 0 |Step: 0 |batch_x: [6. 7. 5. 3. 4.] |batch_y: [5. 4. 6. 8. 7.]
Epoch: 0 |Step: 1 |batch_x: [10.  1.  9.  2.  8.] |batch_y: [ 1. 10.  2.  9.  3.]
Epoch: 1 |Step: 0 |batch_x: [ 8.  7.  1.  2. 10.] |batch_y: [ 3.  4. 10.  9.  1.]
Epoch: 1 |Step: 1 |batch_x: [9. 4. 6. 3. 5.] |batch_y: [2. 7. 5. 8. 6.]
Epoch: 2 |Step: 0 |batch_x: [ 3.  5. 10.  6.  9.] |batch_y: [8. 6. 1. 5. 2.]
Epoch: 2 |Step: 1 |batch_x: [7. 1. 2. 8. 4.] |batch_y: [ 4. 10.  9.  3.  7.]


In [11]:
# Collect 1..10 and 10..1 as plain lists, then convert list -> ndarray -> tensor.
k, j = [], []
for i in range(10):
    k += [i + 1]
    j += [10 - i]
# from_numpy keeps the integer dtype numpy inferred for the lists.
a, b = (torch.from_numpy(np.array(seq)) for seq in (k, j))
print(a,b)

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=torch.int32) tensor([10,  9,  8,  7,  6,  5,  4,  3,  2,  1], dtype=torch.int32)


In [17]:
# Collect 1..10 and 10..1 as plain lists and turn them into numpy arrays.
k, j = [], []
for i in range(10):
    k += [i + 1]
    j += [10 - i]
a, b = np.array(k), np.array(j)

class My_Dataset(Data.Dataset):
    """Map-style dataset that pairs two equally long numpy arrays.

    The class derives from ``Data.Dataset`` rather than ``Data.TensorDataset``:
    it implements ``__getitem__``/``__len__`` itself and never initialised the
    ``TensorDataset`` base, so inheriting from it only suggested attributes
    that did not exist.

    Args:
        x: numpy array of inputs; converted with ``torch.from_numpy``.
        y: numpy array of targets with the same length as ``x``.

    Raises:
        ValueError: if ``x`` and ``y`` differ in length.
    """
    def __init__(self,x,y):
        # Fail early instead of with an IndexError in the middle of an epoch.
        if len(x) != len(y):
            raise ValueError(
                'x and y must have the same length, got %d and %d' % (len(x), len(y)))
        self.data_x = torch.from_numpy(x)
        self.data_y = torch.from_numpy(y)
        self.len = len(x)

    def __getitem__(self,index):
        """Return the (x, y) pair stored at ``index``."""
        return self.data_x[index],self.data_y[index]

    def __len__(self):
        """Return the number of (x, y) pairs."""
        return self.len
    
# Wrap the two arrays in the custom dataset and iterate over it in shuffled batches of 5.
my_dataset = My_Dataset(a,b)
my_loader = Data.DataLoader(
    dataset = my_dataset,
    batch_size = 5,
    shuffle = True, # num_workers = 2 failed in this cell; it worked once the code was placed inside a main() function (presumably because worker processes must be able to import the dataset class - TODO confirm)
    )

for epoch in range(3):
    for step,(batch_x,batch_y) in enumerate(my_loader):
        print('Epoch:',epoch,'|Step:',step,'|batch_x:',batch_x.numpy(),
             '|batch_y:',batch_y.numpy())

Epoch: 0 |Step: 0 |batch_x: [1 2 8 3 6] |batch_y: [10  9  3  8  5]
Epoch: 0 |Step: 1 |batch_x: [ 5  7  9  4 10] |batch_y: [6 4 2 7 1]
Epoch: 1 |Step: 0 |batch_x: [3 1 6 5 9] |batch_y: [ 8 10  5  6  2]
Epoch: 1 |Step: 1 |batch_x: [ 7  2  4  8 10] |batch_y: [4 9 7 3 1]
Epoch: 2 |Step: 0 |batch_x: [8 3 1 2 4] |batch_y: [ 3  8 10  9  7]
Epoch: 2 |Step: 1 |batch_x: [ 6  5  9 10  7] |batch_y: [5 6 2 1 4]


## nn.LSTM

In [2]:
# An LSTM with input_size = 10, hidden_size = 20, num_layers = 1
rnn = nn.LSTM(10,20,1,batch_first = True) # batch_first only changes the input/output layout; h and c keep the batch on their second axis
# With batch_first=True the input is (batch=3, seq_len=5, input_size=10).
input_1 = torch.randn(3, 5, 10)
# Initial states are (num_layers=1, batch=3, hidden_size=20).
h0 = torch.randn(1, 3, 20)
c0 = torch.randn(1, 3, 20)
output, (hn, cn) = rnn(input_1, (h0, c0))

In [5]:
output.size()

torch.Size([3, 5, 20])

## torch.multinomial()

> torch.multinomial(input, num_samples,replacement=False, out=None) → LongTensor

```python
>>> weights = torch.Tensor([0, 10, 3, 0]) # create a Tensor of weights
>>> torch.multinomial(weights, 4) #可以试试重复运行这条命令，发现只会有2种结果：[1 2 0 0]以及[2 1 0 0]，以[1 2 0 0]这种情况居多。
 1
 2
 0
 0
[torch.LongTensor of size 4]
 
>>> torch.multinomial(weights, 4, replacement=True)
 1
 2
 1
 2
[torch.LongTensor of size 4]
```
- input张量可以看成一个权重张量，每一个元素代表其在该行中的权重。如果有元素为0，那么在其他不为0的元素被取干净之前，这个元素是不会被取到的。
- num_samples是每一行的取样次数；当`replacement=False`（无放回）时，该值不能大于每一行的元素数，否则会报错（官方文档进一步要求它不大于该行非零元素的个数）。
- replacement指的是取样时是否是有放回的取样，True是有放回，False无放回。
- 输入二维张量，则返回的也会成为一个二维张量，行数为输入的行数，列数为n_samples，即每一行都取了n_samples次，取法和一维张量相同。

## torch.nn.Softmax  & torch.nn.LogSoftmax

Softmax函数定义：
$ f(x_i)=\frac{e^{(x_{i}-shift)}}{\sum_{j}e^{(x_{j}-shift)}} $，其中 $ shift = \max_{j}(x_j) $

**以下是官方文档中的定义，但是貌似实际按上面的方式计算。两者在数学上完全等价（分子、分母同时乘以 $e^{-shift}$），减去 shift 只是为了避免指数运算溢出，保证数值稳定。**
- 函数Softmax(X) 

其中$ X = (x_1,x_2,...) $
$$ Softmax(X) = (\frac{e^{x_1}}{\sum e^{x_i}},\frac{e^{x_2}}{\sum e^{x_i}},...) $$


- log_softmax(X)

其中$ X = (x_1,x_2,...) $
$$ log-Softmax(X) = (log\frac{e^{x_1}}{\sum e^{x_i}},log\frac{e^{x_2}}{\sum e^{x_i}},...) $$


In [6]:
# 3 x 4 input holding the values 0..11.
input1 = torch.Tensor(np.arange(12)).view(3,4)
print('input=',input1)
# dim = 0 normalises down each column, dim = 1 normalises along each row.
m = nn.Softmax(dim = 0)
n = nn.Softmax(dim = 1)
# No dim given: for this 2-D input the printed result matches dim = 1.
# NOTE(review): PyTorch flags the implicit dim as deprecated - pass dim explicitly in real code.
k = nn.Softmax()
print('output(dim=0)=\n',m(input1))
print('output(dim=1)=\n',n(input1))
print('output(dim=default)=\n',k(input1))

input= tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]])
output(dim=0)=
 tensor([[3.2932e-04, 3.2932e-04, 3.2932e-04, 3.2932e-04],
        [1.7980e-02, 1.7980e-02, 1.7980e-02, 1.7980e-02],
        [9.8169e-01, 9.8169e-01, 9.8169e-01, 9.8169e-01]])
output(dim=1)=
 tensor([[0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439]])


  


output(dim=default)=
 tensor([[0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439]])


In [6]:
# Hand-computed softmax of the second entry of the row [0, 1, 2, 3], using the
# max-shifted form: exp(1 - 3) divided by the sum of exp(k - 3) for k = 0..3.
numerator = math.exp(-2)
denominator = math.exp(-3)+math.exp(-2)+math.exp(-1)+1
a = numerator/denominator
print(a)

0.08714431874203257


In [10]:
# No dim given: for the 2-D input1 the printed rows are the log of the dim = 1 softmax rows.
# NOTE(review): PyTorch flags the implicit dim as deprecated - pass dim explicitly in real code.
t = nn.LogSoftmax()
print('output(dim=default)=\n',t(input1))

  


output(dim=default)=
 tensor([[-3.4402, -2.4402, -1.4402, -0.4402],
        [-3.4402, -2.4402, -1.4402, -0.4402],
        [-3.4402, -2.4402, -1.4402, -0.4402]])


In [9]:
print(math.log(0.08714431874203257))

-2.4401896985611953


## nn.CrossEntropyLoss()和nn.NLLLoss()

- NLLLoss(negative log likelihood loss)


- CrossEntropyLoss

$ Y = (y_1,y_2,...) $是target，而且是one-hot标签，$ P = (P_1,P_2,...) $是经过Softmax层输出的pred

$$ E = -\sum_{i=1}^{n} y_{i} \log(P_{i}) $$

- **CrossEntropyLoss() = log_softmax() + NLLLoss()**
> This criterion combines `nn.LogSoftmax()` and `nn.NLLLoss()` in one single class.

$$CrossEntropyLoss(X,class) = - log\frac{e^{x_{class}}}{\sum e^{x_i}}$$

> input:(N,C),N = Batch_size,C = num_classes #其中每个datapoint也就是每一横排，就是上面的X

> Target:(N)  #一定要注意这不是(N,1)

> Output:scalar # 只和相应的$ x_{class}$大小相关

In [18]:
# One class index per row of pred.
target = torch.LongTensor([1,0,0,1])
# Four samples x two classes of (log-)scores, written row by row.
pred = torch.Tensor([[-988,-0.01],
                     [-0.0005,-1080],
                     [-0.009,-3880],
                     [-180,-0.001]])
# Replacing the non-target entries (e.g. -98, -100, -300, -10) gives the same
# result: NLLLoss only reads the entry at the target index of each row.
dis_criterion = nn.NLLLoss(reduction='sum')
a = dis_criterion(pred,target)

In [19]:
a

tensor(0.0205)

In [20]:
pred

tensor([[-9.8800e+02, -1.0000e-02],
        [-5.0000e-04, -1.0800e+03],
        [-9.0000e-03, -3.8800e+03],
        [-1.8000e+02, -1.0000e-03]])

In [23]:
loss = nn.CrossEntropyLoss()
# Raw scores for 3 samples x 5 classes; CrossEntropyLoss applies log-softmax itself.
input1 = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5) # class indices drawn from 0..4 because every row holds 5 class scores

In [24]:
input1

tensor([[-0.3260,  1.0335,  1.0452,  1.4386, -0.9655],
        [-0.0737,  0.8103,  0.4304,  0.1323, -0.3928],
        [ 0.1206, -2.6637, -0.0357,  1.7111, -0.9470]], requires_grad=True)

In [25]:
target

tensor([0, 3, 2])

In [26]:
aaa = loss(input1,target)
aaa

tensor(2.1972, grad_fn=<NllLossBackward>)

## torch.multinomial是否可以back_propagation

**凡是有embedding层的，都不能反向传导？？？？**
并不是这样

In [2]:
class Generator(nn.Module):
    """LSTM generator: embedding -> single-layer LSTM -> linear -> log-softmax.

    Args:
        num_emb: vocabulary size (rows of the embedding table and number of output classes).
        emb_dim: size of each embedding vector fed to the LSTM.
        hidden_dim: LSTM hidden size.
        use_cuda: kept for backward compatibility and stored on the instance.
            The initial hidden state now follows the device of the module's
            own parameters, so a stale or misspelled flag can no longer put
            the hidden state on a different device than the weights.
    """

    def __init__(self, num_emb, emb_dim, hidden_dim, use_cuda):
        super(Generator, self).__init__()
        self.num_emb = num_emb
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.use_cuda = use_cuda
        self.emb = nn.Embedding(num_emb, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.lin = nn.Linear(hidden_dim, num_emb)
        # dim=1 is the class axis of the flattened (batch * seq_len, num_emb) scores.
        self.softmax = nn.LogSoftmax(dim=1)
        self.init_params()

    def forward(self, x):
        """Return log-probabilities of shape (batch * seq_len, num_emb).

        Args:
            x: LongTensor of token ids, shape (batch, seq_len).
        """
        emb = self.emb(x)
        h0, c0 = self.init_hidden(x.size(0))
        output, _ = self.lstm(emb, (h0, c0))
        # Flatten the batch and time axes so the linear layer scores every step.
        flat = output.contiguous().view(-1, self.hidden_dim)
        return self.softmax(self.lin(flat))

    def init_hidden(self, batch_size):
        """Return zero-filled (h0, c0), each of shape (1, batch_size, hidden_dim)."""
        weight = self.emb.weight
        shape = (1, batch_size, self.hidden_dim)
        # Allocate directly on the parameters' device/dtype instead of creating
        # CPU tensors and copying them over afterwards.
        h = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
        c = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
        return h, c

    def init_params(self):
        """Re-initialise every parameter uniformly in [-0.05, 0.05)."""
        with torch.no_grad():
            for param in self.parameters():
                param.uniform_(-0.05, 0.05)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA. The flag is a real boolean now
# (the previous value was the misspelled, always-truthy string 'Ture').
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
generator = Generator(num_emb = 1000,emb_dim = 128,hidden_dim = 64,use_cuda = device.type == 'cuda')
generator = generator.to(device)
x = torch.LongTensor([[2,50,100],
                       [40,3,900]]).to(device)
# NLLLoss expects class-index targets of shape (N,), not (N, 1); without the
# flattening view(-1) the loss call fails.
target = torch.LongTensor([[600],[233],[111],[500],[700],[132]]).to(device).contiguous().view(-1)
loss = nn.NLLLoss()
optimizer = optim.Adam(generator.parameters())

In [3]:
# 重复这个cell可以让最后的结果一直减少
for i in range(20):
    pred = generator(x)
    loss_scale = loss(pred,target)
    optimizer.zero_grad()
    loss_scale.backward()
    optimizer.step()
    print(loss_scale)

tensor(6.9089, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.9039, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8989, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8939, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8888, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8835, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8780, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8722, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8662, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8598, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8530, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8456, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8377, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8292, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8199, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8098, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.7988, device='cuda:0', grad_fn=<NllLossBackward

加入pred_index = pred.multinomial(1)可以有反向传播吗？没有反向传播会怎么报错？

In [2]:
class Generator(nn.Module):
    """LSTM generator whose forward pass samples one token index per time step.

    Pipeline: embedding -> single-layer LSTM -> linear -> log-softmax ->
    multinomial sampling. The sampled indices are integers and carry no
    gradient, so this variant cannot be trained by back-propagating a loss
    computed on its output.

    Args:
        num_emb: vocabulary size (rows of the embedding table and number of output classes).
        emb_dim: size of each embedding vector fed to the LSTM.
        hidden_dim: LSTM hidden size.
        use_cuda: kept for backward compatibility and stored on the instance.
            The initial hidden state follows the device of the module's own
            parameters.
    """

    def __init__(self, num_emb, emb_dim, hidden_dim, use_cuda):
        super(Generator, self).__init__()
        self.num_emb = num_emb
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.use_cuda = use_cuda
        self.emb = nn.Embedding(num_emb, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.lin = nn.Linear(hidden_dim, num_emb)
        # dim=1 is the class axis of the flattened (batch * seq_len, num_emb) scores.
        self.softmax = nn.LogSoftmax(dim=1)
        self.init_params()

    def forward(self, x):
        """Sample one token index per position.

        Args:
            x: LongTensor of token ids, shape (batch, seq_len).

        Returns:
            LongTensor of shape (batch * seq_len, 1) with the sampled indices.
        """
        emb = self.emb(x)
        h0, c0 = self.init_hidden(x.size(0))
        output, _ = self.lstm(emb, (h0, c0))
        log_probs = self.softmax(self.lin(output.contiguous().view(-1, self.hidden_dim)))
        # multinomial treats its input as sampling weights, so it must receive
        # the probabilities exp(log p). The previous weights, -log p, are
        # largest for the least likely tokens and inverted the distribution.
        pred_index = log_probs.exp().multinomial(1)
        return pred_index

    def init_hidden(self, batch_size):
        """Return zero-filled (h0, c0), each of shape (1, batch_size, hidden_dim)."""
        weight = self.emb.weight
        shape = (1, batch_size, self.hidden_dim)
        h = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
        c = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
        return h, c

    def init_params(self):
        """Re-initialise every parameter uniformly in [-0.05, 0.05)."""
        with torch.no_grad():
            for param in self.parameters():
                param.uniform_(-0.05, 0.05)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA. The flag is a real boolean now
# (the previous value was the misspelled, always-truthy string 'Ture').
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
generator = Generator(num_emb = 1000,emb_dim = 64,hidden_dim = 64,use_cuda = device.type == 'cuda')
generator = generator.to(device)

x = torch.LongTensor([[2,50,100],
                       [40,3,900]]).to(device)
# Shape (6, 1): one target index per sampled position of the (2, 3) input.
target = torch.LongTensor([[600],[233],[111],[500],[700],[132]]).to(device)
optimizer = optim.Adam(generator.parameters())
Loss = nn.MSELoss()

In [3]:
for i in range(10):
    # This generator variant returns indices sampled with multinomial().
    pred = generator(x)
    # NOTE: wrapping the sampled indices in a new Variable with
    # requires_grad=True creates a fresh leaf tensor. backward() below fills
    # pred.grad only; nothing links this tensor to the generator's parameters,
    # so optimizer.step() has no parameter gradients to apply and the printed
    # loss just reflects a new random sample on every iteration.
    pred=Variable(pred.float(),requires_grad=True)
    loss_scale = Loss(pred,target.float())
    optimizer.zero_grad()
    loss_scale.backward()
    optimizer.step()
    print(loss_scale)

tensor(263116., device='cuda:0', grad_fn=<MseLossBackward>)
tensor(122201.1641, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(173140.5000, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(184143.8281, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(160360.6719, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(84713.8359, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(121222., device='cuda:0', grad_fn=<MseLossBackward>)
tensor(89633.1641, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(190037.5000, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(247259.5000, device='cuda:0', grad_fn=<MseLossBackward>)


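以上不收敛是因为multinomial()导致的吗？怎么验证我的观点？？？？？

说明：`multinomial()`返回的是采样得到的整数下标，本身不可导；而`Variable(pred.float(), requires_grad=True)`又新建了一个与generator参数无关的叶子张量，所以`backward()`不会给generator的任何参数计算梯度，`optimizer.step()`实际上没有更新模型。验证方法：在`loss_scale.backward()`之后打印`generator.lin.weight.grad`，结果为`None`。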

## nn.Embedding()出错

> RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got CPUType instead (while checking arguments for embedding)

结果是因为，`nn.Embedding()`的输入必须是LongTensor型，所以在用numpy处理时，数组要指定`dtype = np.int64`，注意这个不是`int64`，哈哈哈

## contiguous()

**Tensor.contiguous().view( , ):**

有些tensor并不是占用一整块内存，而是由不同的数据块组成，而tensor的view()操作依赖于内存是整块的，这时只需要执行contiguous()这个函数，把tensor变成在内存中连续分布的形式。
判断是否contiguous用torch.Tensor.is_contiguous()函数。
```python
x = torch.ones(10, 10)
x.is_contiguous()  # True
x.transpose(0, 1).is_contiguous()  # False
x.transpose(0, 1).contiguous().is_contiguous()  # True
```

*目前只看过lstm函数的输出如果要view()，则先要contiguous.*

# 清洗数据和文件读取

## 保存和读取list

```python
# open() 是Python的内置函数，无需import
def text_save(content,filename,mode='a'):
    """Write every element of ``content`` to a text file, one element per line.

    Args:
        content: iterable of values; each one is converted with ``str()``.
        filename: path of the file to write.
        mode: mode passed to ``open()``; the default ``'a'`` appends to an
            existing file, ``'w'`` overwrites it.
    """
    # The context manager closes the file even when a write raises.
    with open(filename,mode) as file:
        for item in content:
            file.write(str(item)+'\n')

def text_read(filename):
    """Read a text file and return its lines as a list of strings.

    Only the line terminator is stripped from each line, so a final line
    without a trailing newline keeps its last character.

    Args:
        filename: path of the file to read.

    Returns:
        The lines of the file without their newline, or ``[]`` when the file
        cannot be opened.
    """
    try:
        file = open(filename,'r')
    except IOError:
        return []
    # The context manager closes the file even when reading raises.
    with file:
        return [line.rstrip('\n') for line in file]
```

实际运行：
```python
test_text = ['just','for','test']
text_save(test_text,'1.txt')
```
得到1.txt文档：
> just

> for

> test

读取1.txt：
```python
test_content = text_read('1.txt')
print(test_content)
```
得到：
`['just', 'for', 'test']`


## 读取txt文件
```python
from os.path import join
from codecs import open
raw_nomal_dataset_address = '.\\training dataset v4\\01-一般项目\\'
with open(join(raw_nomal_dataset_address,'一般项目-'+str(i+1)+'.txtoriginal.txt'), 'r', encoding='utf-8') as f:
    @#$%^&*#$%^&*#$%^&*
```
**这里备注一下：**

在《保存和读取list》这个经验里不用写`encoding='utf-8'`，否则会报错

## startswith()函数
```python
>>> str = "this is string example....wow!!!"
print (str.startswith( 'this' ))   # 字符串是否以 this 开头
print (str.startswith( 'string', 8 ))  # 从第八个字符开始的字符串是否以 string 开头
print (str.startswith( 'this', 2, 4 )) # 从第2个字符开始到第四个字符结束的字符串是否以 this 开头
>>> True
True
False
```

## python删除和替换字符串中指定字符

###  str.maketrans(x[, y[, z]])和str.translate()的组合
`static str.maketrans(x[, y[, z]])`
This static method returns a translation table usable for `str.translate()`.

- If there is only one argument, 
it must be a dictionary mapping Unicode ordinals (integers) or characters (strings of length 1) to Unicode ordinals, 
strings (of arbitrary lengths) or None. Character keys will then be converted to ordinals.

- If there are two arguments, 
they must be strings of equal length, 
and in the resulting dictionary, 
each character in x will be mapped to the character at the same position in y. 
- If there is a third argument, it must be a string, whose characters will be mapped to None in the result.

In [3]:
import string
i = 'Hello, how are you!'
i.translate(str.maketrans('', '', string.punctuation))

'Hello how are you'

In [4]:
i = 'hello world i am li'
i.translate(str.maketrans('','','l'))

'heo word i am i'

In [10]:
# Two-argument form: each character of intab is mapped to the character at the
# same position in outtab.
intab = "aeiou"
outtab = "12345"
trantab = str.maketrans(intab, outtab)

# Named `text` rather than `str` so the built-in type is not shadowed for the
# cells that run afterwards.
text = "this is string example....wow!!!"
print(text.translate(trantab))

th3s 3s str3ng 2x1mpl2....w4w!!!


In [17]:
a = str.maketrans({'t': 'K', 'w': 'P'}) # each dictionary key must be a single character; the values may have any length
stra = "this is string example....wow!!!";
print(stra.translate(a))

Khis is sKring example....PoP!!!


In [9]:
# Three-argument form: map the vowels to digits and delete the characters 'x' and 'm'.
intab = "aeiou"
outtab = "12345"
trantab = str.maketrans(intab, outtab, 'xm')

# Named `text` rather than `str` so the built-in type is not shadowed for the
# cells that run afterwards.
text = "this is string example....wow!!!"
print(text.translate(trantab))

th3s 3s str3ng 21pl2....w4w!!!


### str.replace(old, new[, max])
- old -- 将被替换的子字符串；
- new -- 新字符串，用于替换old子字符串；
- max -- 可选整数，替换不超过 max 次；

In [20]:
# Named `text` rather than `str` so the built-in type is not shadowed for the
# cells that run afterwards.
text = "this is string example....wow!!! this is really string"
print(text.replace("is", "was"))     # replace every occurrence
print(text.replace("is", "was", 3))  # replace at most the first 3 occurrences
print(text.replace("is", ""))        # an empty replacement deletes the substring

thwas was string example....wow!!! thwas was really string
thwas was string example....wow!!! thwas is really string
th  string example....wow!!! th  really string


## os.walk()

`os.walk(top[, topdown=True[, onerror=None[, followlinks=False]]])`

参数:
- top -- 是你所要遍历的目录的地址, 返回的是一个三元组(root,dirs,files)。
- root 所指的是当前正在遍历的这个文件夹的本身的地址
- dirs 是一个 list ，内容是该文件夹中所有的目录的名字(不包括子目录)
- files 同样是 list , 内容是该文件夹中所有的文件(不包括子目录)
- topdown -- 可选，为 True（默认）时先遍历 top 目录本身，再遍历它的子目录；为 False 时先遍历子目录，最后才遍历 top 目录。无论取何值，walk 都会遍历 top 文件夹以及 top 文件夹中的每一个子目录。

- onerror -- 可选，需要一个 callable 对象，当 walk 发生异常时，会调用。

- followlinks -- 可选，如果为 True，则会遍历目录下的快捷方式(linux 下是软连接 symbolic link )实际所指的目录(默认关闭)；如果为 False，则不会进入符号链接所指向的目录。

示例：
```python
import os
origin_path = 'E:\\github lab\\data\\ren_task\\training dataset v4\\02-病史特点\\'
tag_list = []
for root,dirs,files in os.walk(origin_path):
    for file in files:
        label_filepath = os.path.join(root,file)
        if 'original' not in label_filepath:
            for line in open(label_filepath, 'r', encoding='utf-8'):
                *term,b,e,tag = line.strip().split()
                tag_list.extend(term)
```

# 一些常见的函数

## squeeze()和unsqueeze()

```python
>>> x = torch.zeros(2, 1, 2, 1, 2)
>>> x.size()
torch.Size([2, 1, 2, 1, 2])
>>> y = torch.squeeze(x)
>>> y.size()
torch.Size([2, 2, 2])
>>> y = torch.squeeze(x, 0)
>>> y.size()
torch.Size([2, 1, 2, 1, 2])
>>> y = torch.squeeze(x, 1)
>>> y.size()
torch.Size([2, 2, 1, 2])
```
------
```python
>>> x = np.array([[[0], [1], [2]]])
>>> x.shape
(1, 3, 1)
>>> np.squeeze(x).shape
(3,)
>>> np.squeeze(x, axis=0).shape
(3, 1)
>>> np.squeeze(x, axis=1).shape
Traceback (most recent call last):
...
ValueError: cannot select an axis to squeeze out which has size not equal to one
>>> np.squeeze(x, axis=2).shape
(1, 3)
```

## @classmethod

In [28]:
class Data_test2(object):
    """Minimal date holder used to demonstrate ``@classmethod`` as an alternative constructor."""
    # Class-level defaults; __init__ always overrides them on the instance.
    day=0
    month=0
    year=0
    def __init__(self,year=0,month=0,day=0):
        self.year, self.month, self.day = year, month, day

    @classmethod
    def get_date(cls,data_as_string):
        """Build an instance from a ``'year-month-day'`` string such as ``'2019-7-13'``."""
        # cls is the class the method was called on, so subclasses build their own type.
        fields = [int(piece) for piece in data_as_string.split('-')]
        year,month,day = fields
        return cls(year,month,day)

    def out_date(self):
        """Print the label ``year :`` and the stored year on two lines."""
        print("year :", self.year, sep="\n")
       

In [29]:
t = Data_test2.get_date('2019-7-13')

In [30]:
t.out_date()

year :
2019


# 不同type的一些操作

## DataFrame格式的一些操作

### 读取时添加header
```python
x = pd.read_csv(ls,header = None,names = ["num","term","context"]) # 给列表加上‘num’‘term’‘context'几个表头
```

### dataFrame删除一行或一列：drop函数（相应的行标和列表也会跟着删除，特别的，删除行，行标就不再是连续的了）

用法：DataFrame.drop(labels=None,axis=0, index=None, columns=None, inplace=False)

------
参数说明：
- labels 就是要删除的行列的名字，用列表给定
- axis 默认为0，指删除行，因此删除columns时要指定axis=1；
- index 直接指定要删除的行
- columns 直接指定要删除的列
- inplace=False，默认该删除操作不改变原数据，而是返回一个执行删除操作后的新dataframe；
- inplace=True，则会直接在原数据上进行删除操作，删除后无法返回。

因此，删除行列有两种方式：
1. labels 与 axis 的组合（例如 `labels=['B', 'C'], axis=1`）
2. index或columns直接指定要删除的行或列

例子：
```python
>>>df = pd.DataFrame(np.arange(12).reshape(3,4), columns=['A', 'B', 'C', 'D'])

>>>df

   A   B   C   D

0  0   1   2   3

1  4   5   6   7

2  8   9  10  11

#Drop columns,两种方法等价

>>>df.drop(['B', 'C'], axis=1)

   A   D

0  0   3

1  4   7

2  8  11

>>>df.drop(columns=['B', 'C'])

   A   D

0  0   3

1  4   7

2  8  11

# 第一种方法下删除column一定要指定axis=1,否则会报错
>>> df.drop(['B', 'C'])

ValueError: labels ['B' 'C'] not contained in axis

#Drop rows
>>>df.drop([0, 1])

   A  B   C   D

2  8  9  10  11

>>> df.drop(index=[0, 1])

   A  B   C   D
   
2  8  9  10  11
```


### 遍历DataFrame的行：DataFrame.iterrows()

```python
>>>import pandas as pd
inp = [{'c1':10, 'c2':100}, {'c1':11,'c2':110}, {'c1':12,'c2':120}]
df = pd.DataFrame(inp)
print(df)
>>>
   c1   c2
0  10  100
1  11  110
2  12  120
>>>for index, row in df.iterrows():
    print(row["c1"], row["c2"])
>>>10 100
11 110
12 120
```

### DataFrame某一列（Series）的value_counts()

In [12]:
import pandas as pd
ls = 'E:\\reading_books\\PyTorchNLPBook-master\\data\\surnames\\surnames_with_splits.csv'
x = pd.read_csv(ls,header = 0)
clip = x.head()

In [13]:
clip

Unnamed: 0,surname,nationality,split
0,Totah,Arabic,train
1,Abboud,Arabic,train
2,Fakhoury,Arabic,train
3,Srour,Arabic,train
4,Sayegh,Arabic,train


In [24]:
c_1 = clip.nationality
d_1 = clip['nationality']

In [25]:
c_1,d_1

(0    Arabic
 1    Arabic
 2    Arabic
 3    Arabic
 4    Arabic
 Name: nationality, dtype: object, 0    Arabic
 1    Arabic
 2    Arabic
 3    Arabic
 4    Arabic
 Name: nationality, dtype: object)

In [19]:
nationality = x.nationality

In [20]:
nationality.value_counts()

English       2972
Russian       2373
Arabic        1603
Japanese       775
Italian        600
German         576
Czech          414
Spanish        258
Dutch          236
French         229
Chinese        220
Irish          183
Greek          156
Polish         120
Korean          77
Scottish        75
Vietnamese      58
Portuguese      55
Name: nationality, dtype: int64

In [21]:
nationality.value_counts().to_dict()

{'English': 2972,
 'Russian': 2373,
 'Arabic': 1603,
 'Japanese': 775,
 'Italian': 600,
 'German': 576,
 'Czech': 414,
 'Spanish': 258,
 'Dutch': 236,
 'French': 229,
 'Chinese': 220,
 'Irish': 183,
 'Greek': 156,
 'Polish': 120,
 'Korean': 77,
 'Scottish': 75,
 'Vietnamese': 58,
 'Portuguese': 55}

## generator类的一些操作

In [1]:
import pandas as pd
ls = 'E:\\reading_books\\PyTorchNLPBook-master\\data\\surnames\\surnames_with_splits.csv'
x = pd.read_csv(ls,header = 0)

In [4]:
x[x.split=='train']

Unnamed: 0,surname,nationality,split
0,Totah,Arabic,train
1,Abboud,Arabic,train
2,Fakhoury,Arabic,train
3,Srour,Arabic,train
4,Sayegh,Arabic,train
5,Cham,Arabic,train
6,Haik,Arabic,train
7,Kattan,Arabic,train
8,Khouri,Arabic,train
9,Antoun,Arabic,train


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
a = x[x.split=='train']  
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will
      ensure each tensor is on the right device location.

    Args:
        dataset: object accepted by ``DataLoader`` whose samples collate into
            a dict mapping names to tensors.
        batch_size: number of samples per batch.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``; when true a final, smaller
            batch is discarded.
        device: target passed to ``Tensor.to`` for every tensor of a batch.

    Yields:
        A new dict per batch with the same keys and every tensor moved to ``device``.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        # Build a fresh dict so the loader's own batch object is left untouched.
        yield {name: tensor.to(device) for name, tensor in data_dict.items()}
        
# NOTE(review): `a` is a pandas DataFrame here, not a torch Dataset. The
# DataLoader asks the dataset for integer positions, which a DataFrame treats
# as column labels - this is the likely cause of the KeyError raised as soon
# as the generator is consumed. Wrapping the frame in a Dataset subclass that
# returns a dict of tensors per row would be needed - TODO confirm.
batch_generator = generate_batches(a, batch_size=64, device='cuda')

In [11]:
batch_generator.__next__()

KeyError: 882

In [3]:
i = 0
b = []
for batch_index, batch_dict in enumerate(batch_generator):
    print(batch_dict)


KeyError: 4140

In [13]:
b

[]

In [23]:
import numpy as np
b_size = (5,)
b = np.zeros(b_size, dtype=np.int64)

In [18]:
b

array([0., 0., 0., 0., 0.], dtype=float32)

In [20]:
b.size

5

## TorchTensor类的操作

### 维度转换
```python
import torch
import numpy as np

a=np.array([[[1,2,3],[4,5,6]]])

unpermuted=torch.tensor(a)
print(unpermuted.size())  #  ——>  torch.Size([1, 2, 3])

permuted=unpermuted.permute(2,0,1)
print(permuted.size())     #  ——>  torch.Size([3, 1, 2])
```

### torch.stack()函数

In [5]:
import torch
# Two 2 x 3 tensors; stack() joins them along a NEW axis inserted at position dim.
a = torch.IntTensor([[1,2,3],[11,22,33]])
b = torch.IntTensor([[4,5,6],[44,55,66]])

c = torch.stack([a,b],dim = 0)  # c = [ a, b]
d = torch.stack([a,b],dim = 1)  # d = [ [a[0] , b[0] ] , [a[1], b[1] ] ]
e = torch.stack([a,b],dim = 2)  # e = [[[ a[0][0], b[0][0]],[a[0][1],b[0][1]],[a[0][2],b[0][2]]] ,
#                         [[ a[1][0], b[1][0]],[a[1][1],b[1][1]],[a[1][2],b[1][2]]]]
print('\na:\n',a,'\nb:\n',b,'\nc:\n',c,'\nd:\n',d,'\ne:\n',e)


a:
 tensor([[ 1,  2,  3],
        [11, 22, 33]], dtype=torch.int32) 
b:
 tensor([[ 4,  5,  6],
        [44, 55, 66]], dtype=torch.int32) 
c:
 tensor([[[ 1,  2,  3],
         [11, 22, 33]],

        [[ 4,  5,  6],
         [44, 55, 66]]], dtype=torch.int32) 
d:
 tensor([[[ 1,  2,  3],
         [ 4,  5,  6]],

        [[11, 22, 33],
         [44, 55, 66]]], dtype=torch.int32) 
e:
 tensor([[[ 1,  4],
         [ 2,  5],
         [ 3,  6]],

        [[11, 44],
         [22, 55],
         [33, 66]]], dtype=torch.int32)


### tensor.chunk()函数
> torch.chunk(input, chunks, dim=0) → List of Tensors

Splits a tensor into a specific number of chunks.
Last chunk will be smaller if the tensor size along the given dimension `dim` is not divisible by `chunks`.

> Parameters
- input(Tensor):the tensor to split
- chunks(int):number of chunks to return
- dim(int):dimension along which to split the tensor

In [20]:
a

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]], dtype=torch.int32)

In [21]:
lis = a.chunk(4,dim=1)
lis

(tensor([[0],
         [4],
         [8]], dtype=torch.int32), tensor([[1],
         [5],
         [9]], dtype=torch.int32), tensor([[ 2],
         [ 6],
         [10]], dtype=torch.int32), tensor([[ 3],
         [ 7],
         [11]], dtype=torch.int32))

## python的列表（list）、字符串

### 反向输出和拼接

In [2]:
string = 'abcd'
my_list = [1,2,3,4,5]
print(string[::-1],my_list[::-1])

dcba [5, 4, 3, 2, 1]


In [8]:
word_list = ['awesome','is','this']
print(' '.join(word_list[::-1])+'!') # the string before .join is the separator (a space here); any other string works too

this is awesome!


### 列表推导式

In [10]:
def my_func(x):
    """Return the square of ``x`` increased by 5."""
    squared = x**2
    return squared + 5
my_list = [1,2,3,4,5]
# The string literal below holds the equivalent explicit loop for comparison;
# it is a bare expression and is never executed.
'''
new_list = []
for x in my_list:
    if x % 2 != 0:
        new_list.append(my_func(x))
print(new_list) # [6,14,30]
'''
print([my_func(x) for x in my_list if x % 2 != 0]) # this comprehension is equivalent to the loop quoted above

[6, 14, 30]


In [11]:
print([x ** 2 +5 for x in my_list if x % 2 != 0]) #这个更加直接

[6, 14, 30]


**综上：**

```python
expression for item in list if conditional
```
等价于
```python
for item in list:
    if conditional:
        expression
```

## Numpy

### numpy对象切片的差别

In [8]:
# 0..11 laid out as 6 rows x 2 columns.
a = np.arange(12).reshape((6,2))
# An integer column index drops the axis, a one-element slice keeps it.
b = a[:,1]
c = a[:,0:1]
print('b.shape=',b.shape,'\t','c.shape',c.shape)

b.shape= (6,) 	 c.shape (6, 1)


# 机器学习知识

## 机器学习中的AUC

AUC 这一衡量标准对二分类问题非常适用（特别是数据本身不平衡（unbalanced）的时候），参见：<https://cloud.tencent.com/developer/news/253344>

# 一些模型的细节

## 注意力机制(Attention)

### 在seq2seq网络中的使用

In [4]:
import random

v = [random.gauss(0,1) for z in range(40)]

In [2]:
a = np.array([i for i in range(12)])
a = a.reshape((6,2))

In [4]:
b = a[:,1]
b

array([ 1,  3,  5,  7,  9, 11])

In [9]:
b.size

6

In [10]:
b.shape

(6,)

In [12]:
d = np.random.randint(0,10,(6,))
d

array([7, 0, 5, 1, 4, 3])

In [14]:
reward = []
reward.append(b)
reward.append(d)
reward

[array([ 1,  3,  5,  7,  9, 11]), array([7, 0, 5, 1, 4, 3])]

In [17]:
# `reward` is a plain Python list of arrays and a list has no .shape attribute,
# so this call raises AttributeError. np.array(reward).shape or
# np.shape(reward) reports the stacked shape instead (no call parentheses).
reward.shape()

AttributeError: 'list' object has no attribute 'shape'

In [20]:
for i in range(1,12):
    print(i)

1
2
3
4
5
6
7
8
9
10
11
