In [19]:
import pandas as pd
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import warnings
warnings.filterwarnings(action='ignore')

print("Cuda available", torch.cuda.is_available())  # Should return True if CUDA is available
print("Cuda version", torch.version.cuda)         # Should print the CUDA version PyTorch was built with
print("Device count", torch.cuda.device_count())

Cuda available True
Cuda version 12.1
Device count 1


In [3]:
# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    raise EnvironmentError("CUDA is not available. Check your installation and try again.")

In [None]:
df1 = pd.read_csv('training_data/QnA_intent_processed_amazon_reviews_us_Beauty.tsv', sep= '\t')
df2 = pd.read_csv('training_data/QnA_intent_processed_amazon_reviews_us_Health.tsv', sep= '\t')


print(df1.shape, df2.shape)

df = pd.concat([df1,df2], ignore_index= True, axis=0)

X = df[['review_id','QnA']]
y = df.verified_purchase
X_train, X_test, _, _ = train_test_split(X, y, random_state= 56, test_size= 0.4) 

X_train.to_csv('training_data/train.tsv', sep= '\t', index= False)
X_test.to_csv('training_data/test.tsv', sep= '\t', index= False)

In [4]:
X_train = pd.read_csv('training_data/train.tsv', sep= '\t')
X_train.shape

(1078930, 2)

In [5]:
class DataFrameDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        qa_pair = str(self.dataframe.iloc[idx]['QnA'])
        encoding = self.tokenizer.encode_plus(qa_pair, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length')
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return {'input_ids': input_ids, 'attention_mask': attention_mask}

In [12]:
training_args = TrainingArguments(
    output_dir='./gpt2-finetuned',
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    logging_dir= './logs',
    save_steps=10_000,
    save_total_limit= 2,
    learning_rate= 0.01,
    logging_steps= 1000
)

In [7]:
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
max_length = 128

dataset = DataFrameDataset(X_train, tokenizer, max_length)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [None]:
# chunks = np.array_split(X_train, 100)

for idx, chunk in tqdm(enumerate(chunks_2)):
    # if idx == 0:
    #     path = model_name
    # else:
    path = './gpt2-finetuned'
    dataset = DataFrameDataset(chunk, tokenizer, max_length)
    model = GPT2LMHeadModel.from_pretrained(path).to(device)
    model.resize_token_embeddings(len(tokenizer))

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()

    trainer.save_model('./gpt2-finetuned')

0it [00:00, ?it/s]

Step,Training Loss
1000,4.5644
2000,4.5497
3000,4.4346
4000,4.2213
5000,4.1719


1it [27:00, 1620.45s/it]

Step,Training Loss
1000,4.5831
2000,4.5943
3000,4.4124
4000,4.253
5000,4.1921


2it [1:31:29, 2943.24s/it]

Step,Training Loss
1000,4.6674
2000,4.6703
3000,4.4944
4000,4.3188
5000,4.2325


3it [2:02:07, 2438.58s/it]

Step,Training Loss
1000,4.6403
2000,4.57
3000,4.4309
4000,4.2432
5000,4.1557


4it [2:55:56, 2750.65s/it]

Step,Training Loss
1000,4.5301
2000,4.4252
3000,4.3142
4000,4.1431
5000,4.0915


5it [3:26:59, 2430.44s/it]

Step,Training Loss
1000,4.508
2000,4.5083
3000,4.3252
4000,4.1588
5000,4.0767


6it [4:28:13, 2853.16s/it]

Step,Training Loss
1000,4.5322
2000,4.4459
3000,4.3142
4000,4.1189
5000,4.0291


7it [5:02:20, 2589.79s/it]

Step,Training Loss
1000,4.5249
2000,4.4642
3000,4.3086
4000,4.1339
5000,4.1137


8it [6:04:52, 2959.64s/it]

Step,Training Loss
1000,4.5809
2000,4.5402
3000,4.3648
4000,4.1195
5000,4.0744


9it [6:37:09, 2640.01s/it]

Step,Training Loss
1000,4.5447
2000,4.4539
3000,4.3324
4000,4.116
5000,4.0403


10it [7:41:46, 3021.94s/it]

Step,Training Loss
1000,4.5384
2000,4.4235
3000,4.255
4000,4.0965
5000,4.0017


11it [8:13:28, 2679.02s/it]

Step,Training Loss
1000,4.4622
2000,4.3657
3000,4.2675
4000,4.068
5000,3.9752


12it [9:18:27, 3050.41s/it]

Step,Training Loss
1000,4.4627
2000,4.3747
3000,4.251
4000,4.0325
5000,3.9787


13it [9:50:32, 2709.29s/it]

Step,Training Loss
1000,4.427
2000,4.336
3000,4.1786
4000,4.0225
5000,3.9385


14it [10:55:00, 3059.21s/it]

Step,Training Loss
1000,4.3797
2000,4.3213
3000,4.1901
4000,3.9926
5000,3.9445


15it [11:26:41, 2710.07s/it]

Step,Training Loss
1000,4.388
2000,4.3341
3000,4.1974
4000,3.9825
5000,3.9371


16it [12:29:07, 3022.13s/it]

Step,Training Loss
1000,4.4489
2000,4.4118
3000,4.1973
4000,4.0652
5000,3.9567


17it [13:00:46, 2684.32s/it]

Step,Training Loss
1000,4.4166
2000,4.3745
3000,4.1757
4000,3.9834
5000,3.9216


18it [13:59:57, 2944.66s/it]

Step,Training Loss
1000,4.3881
2000,4.3052
3000,4.1925
4000,3.9863
5000,3.9116


19it [14:31:34, 2629.94s/it]

Step,Training Loss
1000,4.3438
2000,4.3178
3000,4.1675
4000,3.9778
5000,3.893


20it [15:36:44, 3014.29s/it]

Step,Training Loss
1000,4.3678
2000,4.2921
3000,4.2024
4000,3.9587
5000,3.9122


21it [16:08:44, 2685.79s/it]

Step,Training Loss
1000,4.3527
2000,4.3103
3000,4.1677
4000,3.9527
5000,3.8886


22it [17:12:48, 3033.47s/it]

Step,Training Loss
1000,4.3287
2000,4.2787
3000,4.1259
4000,3.9445
5000,3.8707


23it [17:44:22, 2691.69s/it]

Step,Training Loss
1000,4.3792
2000,4.2837
3000,4.1475
4000,3.952
5000,3.9046


24it [18:44:40, 2969.50s/it]

Step,Training Loss
1000,4.3295
2000,4.2669
3000,4.0996
4000,3.9225
5000,3.8412


25it [19:15:38, 2635.93s/it]

Step,Training Loss
1000,4.2945
2000,4.2511
3000,4.1125
4000,3.903
5000,3.8377


26it [20:13:57, 2895.05s/it]

Step,Training Loss
1000,4.3111
2000,4.2724
3000,4.1436
4000,3.9363
5000,3.8531


27it [20:45:14, 2589.43s/it]

Step,Training Loss
1000,4.3367
2000,4.2508
3000,4.1319
4000,3.9675
5000,3.9071


28it [21:44:26, 2878.36s/it]

Step,Training Loss
1000,4.308
2000,4.2054
3000,4.0691
4000,3.8612
5000,3.8148


29it [22:16:05, 2584.49s/it]

Step,Training Loss
1000,4.2977
2000,4.1646
3000,4.0305
4000,3.8463
5000,3.7488


30it [23:17:52, 2921.21s/it]

Step,Training Loss
1000,4.2929
2000,4.1937
3000,4.0376
4000,3.8428
5000,3.7807


31it [23:49:15, 2609.72s/it]

Step,Training Loss
1000,4.275
2000,4.2025
3000,4.0834
4000,3.8775
5000,3.7967


32it [24:49:34, 2912.47s/it]

Step,Training Loss
1000,4.2659
2000,4.1853
3000,4.0359
4000,3.824
5000,3.7797


33it [25:21:00, 2604.73s/it]

Step,Training Loss
1000,4.2289
2000,4.187
3000,4.034
4000,3.7981
5000,3.7802


34it [26:20:13, 2889.21s/it]

Step,Training Loss
1000,4.2161
2000,4.1479
3000,4.0289
4000,3.7943
5000,3.7554


35it [26:51:38, 2587.86s/it]

Step,Training Loss
1000,4.2062
2000,4.1236
3000,3.9827
4000,3.7915
5000,3.7257


36it [27:50:49, 2876.76s/it]

Step,Training Loss
1000,4.1775
2000,4.1711
3000,3.9921
4000,3.8012
5000,3.7253


37it [28:22:04, 2576.33s/it]

Step,Training Loss
1000,4.2304
2000,4.1916
3000,3.9969
4000,3.8035
5000,3.7442


38it [29:21:05, 2865.83s/it]

Step,Training Loss
1000,4.1803
2000,4.1706
3000,3.9776
4000,3.7689
5000,3.7329


39it [29:52:30, 2571.51s/it]

Step,Training Loss
1000,4.1838
2000,4.1447
3000,3.9865
4000,3.7752
5000,3.7587


40it [30:52:44, 2884.32s/it]

Step,Training Loss
1000,4.2239
2000,4.1977
3000,4.0149
4000,3.8226
5000,3.7478


41it [31:24:18, 2587.08s/it]

Step,Training Loss
1000,4.2286
2000,4.166
3000,4.0125
4000,3.7964
5000,3.7821


42it [32:23:32, 2877.14s/it]

Step,Training Loss
1000,4.1879
2000,4.1482
3000,3.9655
4000,3.7656
5000,3.7101


43it [32:54:57, 2579.53s/it]

Step,Training Loss
1000,4.1538
2000,4.1262
3000,3.968
4000,3.7686
5000,3.7071


44it [33:54:05, 2870.15s/it]

Step,Training Loss
1000,4.2043
2000,4.1949
3000,3.959
4000,3.7453
5000,3.6639


45it [34:25:37, 2576.68s/it]

Step,Training Loss
1000,4.1602
2000,4.1085
3000,3.9939
4000,3.7711
5000,3.7295


46it [35:25:04, 2873.87s/it]

Step,Training Loss
1000,4.1564
2000,4.1131
3000,3.9737
4000,3.7459
5000,3.6859


47it [35:56:36, 2579.02s/it]

Step,Training Loss
1000,4.1582
2000,4.1148
3000,3.9069
4000,3.7286
5000,3.6745


48it [36:56:01, 2875.08s/it]

Step,Training Loss
1000,4.1385
2000,4.1206
3000,3.9718
4000,3.7458
5000,3.7021


49it [37:27:32, 2579.77s/it]

Step,Training Loss
1000,4.1456
2000,4.1101
3000,3.9537
4000,3.7573
5000,3.6885


50it [38:26:32, 2867.82s/it]

Step,Training Loss
1000,4.1841
2000,4.1089
3000,3.9586
4000,3.7249
5000,3.6678


51it [38:58:27, 2581.96s/it]

Step,Training Loss
1000,4.1277
2000,4.0509


In [23]:
chunks_2 = chunks[46:]

107893