In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the project folder is reachable.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

In [None]:
import pandas as pd
import os

# Work from the competition folder on the mounted drive so relative paths
# (such as the CSV read in a later cell) resolve against it.
PROJECT_DIR = '/content/gdrive/My Drive/특허경진대회/'
os.chdir(PROJECT_DIR)
# Echo the working directory as the cell output to confirm the switch.
os.getcwd()



In [None]:
# Load the dataset from the current working directory.
DATA_CSV = 'data.csv'
data = pd.read_csv(DATA_CSV)

In [None]:
# Number of rows in the loaded frame.
data.shape[0]

In [None]:
# Show the column labels of the loaded frame.
data.columns

In [None]:
# Keep only the two columns selected for the rest of the notebook.
TITLE_WORD_COLUMNS = ['invention_title', 'tech_word']
title_word = data.loc[:, TITLE_WORD_COLUMNS]

In [None]:
!pip install ratsnlp

In [None]:
from transformers import  GPT2LMHeadModel, GPT2Tokenizer,AdamW
import pandas as pd
from torch.utils.data import Dataset , DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split

In [None]:
# Load the tokenizer and the language-model head from the same pretrained checkpoint.
PRETRAINED_NAME = "gpt2-large"
tokenizer = GPT2Tokenizer.from_pretrained(PRETRAINED_NAME)
gpt2 = GPT2LMHeadModel.from_pretrained(PRETRAINED_NAME)

In [None]:
# Reuse the end-of-sequence token as the padding token so the padded `encode` calls
# in this notebook (padding = "max_length") have a pad token to use.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Baseline sample from the pretrained model, before any fine-tuning in this notebook.
# The prompt is encoded WITHOUT padding: right-padding it to a fixed length would append
# pad (= EOS) tokens after the text, and generate() would then continue from that run of
# padding instead of from the prompt itself.
prompt = tokenizer.encode("machine learning", return_tensors = "pt").to(gpt2.device)
# pad_token_id is passed explicitly because the model config does not define one.
output = gpt2.generate(prompt, do_sample = True, max_length = 100, top_k = 10, temperature = 0.8, pad_token_id = tokenizer.eos_token_id)
# Last expression of the cell: the decoded sample is shown as the cell output.
tokenizer.decode(output[0], skip_special_tokens = True)

In [None]:
class TitleDataset(Dataset):
    """Dataset that turns each title into a fixed-length 1-D tensor of token ids.

    Args:
        titles: Indexable, sized collection of titles (each item is passed to
            ``tokenizer.encode``).
        max_length: Length every encoded title is padded or truncated to.
            Defaults to 30.
    """

    def __init__(self, titles, max_length = 30):
        # Capture the notebook-level tokenizer once; __getitem__ uses this
        # reference instead of looking the global up again on every access.
        self.tokenizer = tokenizer
        self.titles = titles
        self.max_length = max_length

    def __len__(self):
        """Return the number of titles."""
        return len(self.titles)

    def __getitem__(self, index):
        """Return the token ids of ``titles[index]`` as a 1-D tensor of ``max_length`` entries."""
        title = self.titles[index]
        encoded = self.tokenizer.encode(
            title,
            max_length = self.max_length,
            padding = "max_length",
            truncation = True,
            return_tensors = "pt",
        )
        # encode(..., return_tensors="pt") yields a (1, max_length) tensor; flatten it so the
        # DataLoader's default collation stacks items into (batch, max_length).
        return encoded.reshape(-1)

In [None]:
# Smoke test: push one randomly chosen tokenized title through a DataLoader and show it.
dset = TitleDataset(data['invention_title'].values)
preview_loader = DataLoader(dset, batch_size = 1, shuffle = True)
title = next(iter(preview_loader))
display(title)

In [None]:
# Hold out 30% of the rows; the fixed seed keeps the split identical across runs.
HOLDOUT_FRACTION = 0.3
SPLIT_SEED = 42
x_train, x_test = train_test_split(
    title_word,
    test_size = HOLDOUT_FRACTION,
    random_state = SPLIT_SEED,
)

In [None]:
class TitleDataModule(pl.LightningDataModule):
    """Lightning data module serving the title splits built earlier in the notebook.

    Reads the notebook-level ``x_train`` / ``x_test`` frames and wraps their
    ``invention_title`` column in ``TitleDataset``.

    NOTE(review): the validation and test datasets are both built from
    ``x_test``, so validation metrics are not independent of test metrics.

    Args:
        batch_size: Number of titles per batch for all three loaders.
            Defaults to 1, the value previously hard-coded.
    """

    def __init__(self, batch_size = 1):
        super().__init__()
        self.batch_size = batch_size
        self.train = TitleDataset(x_train["invention_title"].values)
        self.test = TitleDataset(x_test["invention_title"].values)
        self.val = TitleDataset(x_test["invention_title"].values)

    def train_dataloader(self):
        """Return the training loader; shuffling is enabled."""
        return DataLoader(self.train, batch_size = self.batch_size, shuffle = True)

    def test_dataloader(self):
        """Return the test loader; items are served in dataset order."""
        return DataLoader(self.test, batch_size = self.batch_size, shuffle = False)

    def val_dataloader(self):
        """Return the validation loader; items are served in dataset order."""
        return DataLoader(self.val, batch_size = self.batch_size, shuffle = False)

In [None]:
# Plain alias: `gpt2_model` and `gpt2` refer to the same model object (no copy is made),
# so anything that updates one is visible through the other.
gpt2_model = gpt2
print("done")

In [None]:
class TitleGenerator(pl.LightningModule):
    """Lightning wrapper that fine-tunes the notebook-level GPT-2 model as a causal LM.

    The wrapped network is the shared ``gpt2_model`` object itself (not a copy),
    so training through this module updates that model in place.

    Args:
        learning_rate: Learning rate handed to AdamW. Defaults to 1e-4.
        pad_token_id: Token id used to pad the batches. Defaults to the
            notebook-level ``tokenizer.pad_token_id``. If it resolves to
            ``None`` no label masking is applied.
    """

    def __init__(self, learning_rate = 1e-4, pad_token_id = None):
        super().__init__()
        gpt2_model.train()
        self.neural_net = gpt2_model
        self.learning_rate = learning_rate
        self.pad_token_id = tokenizer.pad_token_id if pad_token_id is None else pad_token_id

    def forward(self, x):
        """Run the language model on token ids ``x`` and return its output (including ``.loss``).

        Padding positions are excluded from the loss by setting their labels
        to -100. Only the first pad of each sequence is kept as a target:
        in this notebook the pad token is the EOS token, so that single
        position teaches the model where a title ends, while the remaining
        padding no longer dominates the loss.
        """
        if self.pad_token_id is None:
            return self.neural_net(x, labels = x)

        is_pad = x.eq(self.pad_token_id)
        # True only at the first pad position along the sequence axis.
        first_pad = is_pad & is_pad.cumsum(dim = -1).eq(1)
        ignored = is_pad & ~first_pad
        labels = x.masked_fill(ignored, -100)
        attention_mask = (~ignored).long()
        return self.neural_net(x, attention_mask = attention_mask, labels = labels)

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters of this module."""
        return AdamW(self.parameters(), self.learning_rate)

    def _shared_step(self, batch, stage):
        """Compute the LM loss for ``batch`` and log it as ``<stage>_loss``."""
        loss = self(batch).loss
        self.log(f"{stage}_loss", loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the loss Lightning back-propagates for this batch."""
        return self._shared_step(batch, "train")

    def test_step(self, batch, batch_idx):
        """Return (and log) the loss on a test batch."""
        return self._shared_step(batch, "test")

    def validation_step(self, batch, batch_idx):
        """Return (and log) the loss on a validation batch."""
        return self._shared_step(batch, "val")

In [None]:
from pytorch_lightning import Trainer
# Build the data module and the Lightning wrapper, then fine-tune for two epochs on one GPU.
module = TitleDataModule()
model = TitleGenerator()
trainer = Trainer(gpus = 1, max_epochs = 2)
trainer.fit(model, datamodule = module)

In [None]:
## Test: sample text from the fine-tuned model ##

In [None]:
# `model.neural_net` is the very same object as `gpt2` (the Lightning wrapper holds the model
# without copying it), so the fine-tuned weights are already in `gpt2`. Rebinding
# `gpt2.state_dict` would only replace that method on the instance and copy no weights,
# so nothing is assigned here.
assert model.neural_net is gpt2, "fine-tuned wrapper and `gpt2` are expected to share one model object"
# Switch to evaluation mode so dropout is disabled while sampling.
# The trailing semicolon suppresses the (very long) module repr in the cell output.
gpt2.eval();

In [None]:
# Sample one continuation per prompt from the fine-tuned model.
raw_text = ['use', 'machine learning']
output_text = []
for x in raw_text:
    # Put the prompt on the same device as the model: after GPU training the model
    # is not guaranteed to be back on the CPU, and a device mismatch makes generate() fail.
    prompt = tokenizer.encode(x, return_tensors = "pt").to(gpt2.device)
    # pad_token_id is passed explicitly because the model config does not define one.
    output = gpt2.generate(prompt, do_sample = True, max_length = 100, top_k = 10, temperature = 0.8, pad_token_id = tokenizer.eos_token_id)
    output_text.append(tokenizer.decode(output[0], skip_special_tokens = True))

In [None]:
# Show the generated texts, one entry per prompt in `raw_text`.
display(output_text)