In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer
import json
from tqdm import tqdm
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import copy

In [None]:
# ---- Runtime -----------------------------------------------------------
# Use the first CUDA device when one is visible, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# ---- Data --------------------------------------------------------------
path = "/kaggle/input/gptdata/train.txt"  # training corpus, read line by line
batch_size = 1
max_length = 512                          # cap on tokens per training sample

# ---- Model -------------------------------------------------------------
vocab_size = 22000
embeding_size = 1560                      # model width
num_heads = 24                            # 1560 / 24 = 65 dimensions per head
num_layers = 24
mid_size = embeding_size * 2              # hidden width of the feed-forward blocks

# ---- Optimisation ------------------------------------------------------
epoch = 1000
lr = 3e-7
step_size = 1000000                       # StepLR period, counted in optimizer steps

In [None]:
class MyDataset(Dataset):
    """Corpus dataset that packs cleaned text lines into string samples.

    Every item is a plain ``str`` that is longer than 256 characters.
    """

    def __init__(self, path):
        """Read the UTF-8 file at ``path`` and build ``self.data``.

        Each line has spaces, newlines and the curly quotes “ ” removed.
        Lines that end up empty or that start with '—' are skipped. The
        remaining lines are concatenated until the buffer exceeds 256
        characters, at which point the buffer is stored as one sample.
        Text left in the buffer at end of file is not stored.
        """
        self.data = []
        pending = ""
        with open(path, "r", encoding="utf-8") as corpus:
            for raw_line in tqdm(corpus):
                cleaned = raw_line
                for unwanted in (" ", "\n", "“", "”"):
                    cleaned = cleaned.replace(unwanted, "")
                # Skipped lines leave the buffer untouched, so no length
                # check is needed for them.
                if not cleaned or cleaned[0] == "—":
                    continue
                pending += cleaned
                if len(pending) > 256:
                    self.data.append(pending)
                    pending = ""

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample string at position ``index``."""
        return self.data[index]

In [None]:
# Load the corpus once, then iterate it in file order (no shuffling).
dataset = MyDataset(path)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Tokenizer files are read from the attached Kaggle dataset directory.
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/bert-model")
# Trainable token-embedding table, created directly on the training device.
embedings = nn.Embedding(vocab_size, embeding_size).to(device)

In [None]:
class position_embeding(nn.Module):
  """Fixed sinusoidal positional encoding added to a batch of embeddings.

  For position ``i`` and column ``j`` the table holds
  ``sin(i / 10000**(j/dim))`` when ``j`` is even and
  ``cos(i / 10000**((j-1)/dim))`` when ``j`` is odd.

  Args:
    max_positions: number of rows in the table (default 512).
    dim: number of columns; defaults to the notebook-level ``embeding_size``.
    target_device: where the table is placed; defaults to the notebook-level
      ``device``.
  """
  def __init__(self,max_positions=512,dim=None,target_device=None):
    super(position_embeding,self).__init__()
    if dim is None:
      dim=embeding_size
    if target_device is None:
      target_device=device
    # Build the whole table with tensor ops instead of a Python double loop.
    # float64 is used for the intermediate math and cast to float32 at the end.
    positions=torch.arange(max_positions,dtype=torch.float64).unsqueeze(1)
    columns=torch.arange(dim)
    is_even=columns%2==0
    # Columns 2k and 2k+1 share the exponent 2k/dim.
    exponent=(columns-columns%2).to(torch.float64)/dim
    inv_freq=1.0/torch.pow(torch.tensor(10000.0,dtype=torch.float64),exponent)
    angles=positions*inv_freq
    table=torch.where(is_even,torch.sin(angles),torch.cos(angles)).to(torch.float32)
    # Non-persistent buffer: follows the module across .to()/.cuda() calls
    # but is not written into state_dict().
    self.register_buffer("ans",table.to(target_device),persistent=False)
  def forward(self,X):
    """Return ``X`` plus the first ``X.size(1)`` rows of the table.

    The slice is broadcast over dimension 0 of ``X``; ``X.size(1)`` must not
    exceed the number of table rows.
    """
    return X+self.ans[0:X.size()[1]]

In [None]:
# Build the positional-encoding module once; get_tensor() applies it to every batch.
pm=position_embeding()

In [None]:
def get_tensor(input):
    """Turn a batch of strings into model inputs and next-token targets.

    The batch is tokenized with the notebook-level ``tokenizer``. For each
    sequence the model input is the token ids without the last token and the
    target is the token ids without the first token, so position ``t`` of the
    target is the token that follows position ``t`` of the input. Both are cut
    to ``max_length`` tokens and right-padded with id 0 to the longest
    sequence in the batch.

    Args:
        input: batch of text as yielded by the DataLoader (presumably a list
            of ``str`` -- confirm against the collate function in use).

    Returns:
        ``(ans, output)`` where ``ans`` is the embedded input with positional
        encoding applied (on ``device``) and ``output`` is a LongTensor of
        target ids, left on the CPU for the caller to move.
    """
    token_ids=tokenizer(input)["input_ids"]
    # Iterate over what the tokenizer actually returned rather than over the
    # global batch_size, so a short final batch does not raise IndexError.
    # Source and target always have equal length, so one unconditional slice
    # to max_length is equivalent to truncating both when the source is long.
    sources=[ids[:-1][:max_length] for ids in token_ids]
    targets=[ids[1:][:max_length] for ids in token_ids]
    longest=max((len(ids) for ids in sources),default=0)
    # Pad with id 0; the loss is configured elsewhere to give class 0 zero weight.
    sources=[ids+[0]*(longest-len(ids)) for ids in sources]
    targets=[ids+[0]*(longest-len(ids)) for ids in targets]
    source_tensor=torch.LongTensor(sources).to(device)
    output=torch.LongTensor(targets)
    ans=pm(embedings(source_tensor))
    return ans,output

In [None]:
def gelu(x):
  """Tanh approximation of GELU.

  Computes ``x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))) / 2``
  element-wise on the tensor ``x``.
  """
  cubic=x+0.044715*torch.pow(x,3)
  gate=1+torch.tanh(np.sqrt(2/np.pi)*cubic)
  return gate*x/2

In [None]:
def get_mask_matrix(length):
  """Return a ``(batch_size, num_heads, length, length)`` tensor of 0s and 1s.

  Entries strictly above the main diagonal of the last two dimensions are 1
  and every other entry is 0. The tensor is created on the CPU.
  """
  shape=(batch_size,num_heads,length,length)
  return torch.ones(shape).triu(diagonal=1)

In [None]:
class feedforward(nn.Module):
  """Two linear layers, each followed by GELU, plus a projected skip path.

  ``forward`` returns ``LayerNorm(gelu(second(gelu(first(X))))) + W_O(X)``.
  """
  def __init__(self):
    super().__init__()
    # The creation order is kept as-is: it fixes the parameter order and the
    # order in which random initial values are drawn.
    self.first_layer=nn.Linear(embeding_size,mid_size)
    self.second_layer=nn.Linear(mid_size,embeding_size)
    self.W_O=nn.Linear(embeding_size,embeding_size)
    self.layer_norm=nn.LayerNorm(embeding_size)
  def forward(self,X):
    """Apply the block to ``X``; the last dimension must be ``embeding_size``."""
    hidden=gelu(self.first_layer(X))
    projected=gelu(self.second_layer(hidden))
    normed=self.layer_norm(projected)
    return normed+self.W_O(X)

In [None]:
class Attention_layer(nn.Module):
  """Multi-head causal self-attention with a projected skip path.

  Q, K and V are linear projections of the input passed through GELU. The
  attention scores are the raw ``Q @ K^T`` products (this layer applies no
  1/sqrt(d) scaling), masked so that a position cannot attend to later
  positions. The output is the concatenated head outputs plus
  ``gelu(W_O(X))``.
  """
  def __init__(self):
    super(Attention_layer,self).__init__()
    self.W_Q=nn.Linear(embeding_size,embeding_size)
    self.W_K=nn.Linear(embeding_size,embeding_size)
    self.W_V=nn.Linear(embeding_size,embeding_size)
    self.W_O=nn.Linear(embeding_size,embeding_size)
  def forward(self,X):
    """Apply attention to ``X`` of shape ``(batch, length, embeding_size)``.

    Returns a tensor of the same shape as ``X``.
    """
    # Read the batch size from the input instead of the global constant, so
    # a batch of any size (e.g. a short final batch) works.
    batch,length=X.size(0),X.size(1)
    Q=gelu(self.W_Q(X))
    K=gelu(self.W_K(X))
    V=gelu(self.W_V(X))
    # (batch, length, E) -> (batch, heads, length, E/heads)
    Q_heads=Q.reshape(batch,length,num_heads,-1).transpose(1,2)
    # Keys end up as (batch, heads, E/heads, length), ready for Q @ K^T.
    K_heads=K.reshape(batch,length,num_heads,-1).transpose(1,2).transpose(2,3)
    V_heads=V.reshape(batch,length,num_heads,-1).transpose(1,2)
    scores=torch.matmul(Q_heads,K_heads)
    # One (length, length) upper-triangular mask, built on the scores' device
    # and broadcast over batch and heads; adding -1e9 drives the softmax
    # weight of future positions to zero.
    future=torch.triu(torch.ones(length,length,dtype=scores.dtype,device=scores.device),diagonal=1)
    A_matrix=torch.softmax(scores+future*(-1e9),dim=-1)
    mid_output=torch.matmul(A_matrix,V_heads).transpose(1,2).reshape(batch,length,embeding_size)
    endoutput=mid_output+gelu(self.W_O(X))
    return endoutput

In [None]:
class chineseGPT(nn.Module):
  """Stack of ``num_layers`` (Attention_layer, feedforward) pairs run in order."""
  def __init__(self):
    super().__init__()
    stack=nn.Sequential()
    for layer_index in range(num_layers):
      # These sub-module names become state_dict keys, so they are kept
      # exactly as spelled, including "feedworward".
      stack.add_module(f"attention{layer_index}",Attention_layer())
      stack.add_module(f"feedworward{layer_index}",feedforward())
    self.attention=stack
  def forward(self,X):
    """Pass ``X`` through every layer of the stack and return the result."""
    return self.attention(X)

In [None]:
class classifier(nn.Module):
  """Output head mapping hidden states to ``vocab_size`` logits.

  Two GELU-activated linear layers (``embeding_size -> mid_size ->
  embeding_size``) followed by a final linear projection to the vocabulary.
  """
  def __init__(self):
    super().__init__()
    self.first_layer=nn.Linear(embeding_size,mid_size)
    self.second_layer=nn.Linear(mid_size,embeding_size)
    self.third_layer=nn.Linear(embeding_size,vocab_size)
  def forward(self,X):
    """Return un-normalised logits; the last dimension of ``X`` must be ``embeding_size``."""
    hidden=gelu(self.first_layer(X))
    hidden=gelu(self.second_layer(hidden))
    return self.third_layer(hidden)

In [None]:
# Instantiate the transformer stack and the output head on the training device.
model = chineseGPT().to(device)
classify = classifier().to(device)

In [None]:
# Checkpoint locations: ipath* are read-only inputs from a previous run,
# path* are where this run writes its own checkpoints.
ipath1='/kaggle/input/gpt-model2/embeding.pth'
ipath2='/kaggle/input/gpt-model2/model.pth'
ipath3='/kaggle/input/gpt-model2/classify.pth'
path1='/kaggle/working/embeding.pth'
path2='/kaggle/working/model.pth'
path3='/kaggle/working/classify.pth'
# Best-effort resume: on any failure, report it and continue with whatever
# weights the modules currently hold.
# NOTE(review): torch.load unpickles the file -- only point these paths at
# checkpoints you trust.
try:
    # Read all three files before touching any module, so a missing or
    # unreadable file cannot leave only part of the model restored.
    loaded_states=[torch.load(p,map_location='cpu') for p in (ipath1,ipath2,ipath3)]
    embedings.load_state_dict(loaded_states[0])
    model.load_state_dict(loaded_states[1])
    classify.load_state_dict(loaded_states[2])
except Exception as load_error:
    # `except Exception` lets KeyboardInterrupt/SystemExit propagate, unlike
    # a bare `except:`. The reason is printed so a shape mismatch is visible.
    print("none")
    print(f"checkpoint not loaded: {load_error!r}")

In [None]:
# Write the current weights of all three modules to the working directory.
for state_owner, target in ((embedings, path1), (model, path2), (classify, path3)):
    torch.save(state_owner.state_dict(), target)

In [None]:
# Per-class loss weights: every token id counts equally except id 0 (the
# padding id used by get_tensor), which contributes nothing to the loss.
# CrossEntropyLoss documents `weight` as a 1-D tensor of size C, so the
# vector is built 1-D rather than with a leading singleton dimension.
weights=torch.ones(vocab_size)
weights[0]=0
class_weights=weights.to(device)
loss=nn.CrossEntropyLoss(weight=class_weights)
# One optimizer over the embedding table, the transformer stack and the
# output head, all with the same learning rate.
optimizer=torch.optim.Adam([{'params':embedings.parameters()},{'params':model.parameters()},{'params':classify.parameters()}],lr=lr)
# gamma=1 multiplies the learning rate by 1 at every step boundary, so this
# scheduler currently leaves the learning rate unchanged.
scheduler=torch.optim.lr_scheduler.StepLR(optimizer,step_size=step_size,gamma=1)

In [None]:
# Training loop.
# `jishu` counts optimizer steps across all epochs. Every 100 steps a sample
# prediction and the mean loss since the previous log are printed; every
# 1000 steps the three state dicts are saved.
jishu=0
# Named running_loss/inputs rather than sum/input so the builtins stay
# usable in later cells of the same kernel session.
running_loss=0.0
steps_since_log=0
for www in range(epoch):
  for i,j in enumerate(dataloader):
    inputs,pre=get_tensor(j)
    pre=pre.to(device)
    mid_output=model(inputs)
    output=classify(mid_output)
    # CrossEntropyLoss expects the class dimension second: (batch, vocab, length).
    end_output=torch.transpose(output,1,2)
    optimizer.zero_grad()
    end_loss=loss(end_output,pre)
    end_loss.backward()
    # Gradient norm is clipped separately for each of the three modules.
    nn.utils.clip_grad_norm_(embedings.parameters(),max_norm=2,norm_type=2)
    nn.utils.clip_grad_norm_(model.parameters(),max_norm=2,norm_type=2)
    nn.utils.clip_grad_norm_(classify.parameters(),max_norm=2,norm_type=2)
    optimizer.step()
    scheduler.step()
    running_loss+=end_loss.item()
    steps_since_log+=1
    if jishu%100==0:
      # Logging works on detached logits; argmax of the logits picks the
      # same token as argmax of their softmax.
      logits=output.detach()
      ids=torch.argmax(logits,dim=-1)
      # Logit of the predicted token at the first few positions.
      for kkk in range(min(6,pre.size()[1])):
        print(logits[0][kkk][ids[0][kkk]])
      ans=ids.tolist()
      print("预测")
      print(tokenizer.decode(ans[0]))
      print("真实")
      print(tokenizer.decode(pre[0]))
      print("loss")
      # Average over the steps actually accumulated; the first log covers a
      # single step, so dividing by a fixed 100 would under-report it.
      print(running_loss/steps_since_log)
      running_loss=0.0
      steps_since_log=0
    if jishu%1000==0:
      print("-------save"+str(jishu)+"--------")
      torch.save(embedings.state_dict(),path1)
      torch.save(model.state_dict(),path2)
      torch.save(classify.state_dict(),path3)
    jishu=jishu+1


In [None]:
def get_grad(model):
    """Print the name and gradient of every parameter of ``model`` whose ``.grad`` is set.

    Parameters whose gradient is ``None`` are skipped. Returns ``None``.
    """
    populated = (
        (param_name, tensor.grad)
        for param_name, tensor in model.named_parameters()
        if tensor.grad is not None
    )
    for param_name, grad in populated:
        print(param_name, grad)

In [None]:
# Debug: for the first sequence of the most recent batch, print the
# probability the model gives to token id 102 at every position.
# NOTE(review): 102 is presumably the tokenizer's end-of-sequence id -- confirm.
# The softmax is computed once, outside the loop, not once per position.
token_probs=nn.functional.softmax(output,dim=-1)
for jj in range(output.size()[1]):
    print(token_probs[0][jj][102])

In [None]:
# Debugging aid: print the gradients currently stored on the transformer stack's parameters.
get_grad(model)