In [1]:
import pandas as pd
from transformers import GPT2Tokenizer
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The conversation pairs are read from data/conversational_data.csv (relative to the notebook);
# adjust the path if your file lives elsewhere.
# Rows with any missing value are discarded and the index is renumbered from 0.
df = (
    pd.read_csv("data/conversational_data.csv", encoding='utf-8')
    .dropna()
    .reset_index(drop=True)
)

In [3]:
# Preview the first five rows as a quick sanity check of the loaded frame
df.head(n=5)

Unnamed: 0,input,target
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [4]:
# GPT2Tokenizer is already imported in the first cell, so it is not re-imported here.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 has no dedicated PAD token; reuse the end-of-sequence token so padding can be applied.
tokenizer.pad_token = tokenizer.eos_token
# Persist the configured tokenizer; the value returned by save_pretrained is the cell's output.
tokenizer.save_pretrained("models/gpt2_tokenizer/")



('models/gpt2_tokenizer/tokenizer_config.json',
 'models/gpt2_tokenizer/special_tokens_map.json',
 'models/gpt2_tokenizer/vocab.json',
 'models/gpt2_tokenizer/merges.txt',
 'models/gpt2_tokenizer/added_tokens.json')

In [5]:
from torch.utils.data import Dataset

class ChatDataset(Dataset):
    """Map-style dataset that tokenizes (input, target) text pairs on access.

    Each item is a dict with three entries:

    * ``input_ids`` / ``attention_mask`` -- encoding of the row's ``input`` text
    * ``labels`` -- token ids of the row's ``target`` text

    NOTE(review): ``labels`` are tokenized independently of ``input_ids`` and are
    padded with the tokenizer's pad token (no ignore index is applied). Confirm
    that this is what the training loop expects.

    Args:
        dataframe: DataFrame providing ``input`` and ``target`` columns.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=..., return_tensors="pt")``
            and returning a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is truncated / padded to.
    """

    def __init__(self, dataframe, tokenizer, max_length=64):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize one text to exactly ``self.max_length`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx`` and return its tensors.

        Returns:
            dict with keys ``input_ids``, ``attention_mask`` and ``labels``.
        """
        # Fetch the row once instead of performing one positional lookup per column.
        row = self.data.iloc[idx]

        input_encoding = self._encode(row['input'])
        target_encoding = self._encode(row['target'])

        # Drop only the leading batch dimension. A bare .squeeze() would also remove
        # the sequence dimension when max_length == 1, yielding 0-d tensors.
        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': target_encoding['input_ids'].squeeze(0)
        }

In [6]:
# Wrap the cleaned frame and the tokenizer in a ChatDataset; rows are tokenized when indexed
dataset = ChatDataset(dataframe=df, tokenizer=tokenizer, max_length=64)

In [7]:
# Serialize the whole ChatDataset object (its DataFrame, tokenizer and settings) to disk.
# NOTE(review): pickle stores classes by reference, so whatever loads this file must be able
# to resolve ChatDataset under the same module path -- confirm the consumer does.
import pickle

with open("data/processed_dataset.pkl", mode="wb") as pkl_file:
    pickle.dump(dataset, pkl_file)
    print("successfully dump ho gya")

successfully dump ho gya


In [8]:
# import pandas as pd
# from transformers import GPT2Tokenizer
# import torch
# import os


In [9]:
# # Make sure your CSV is inside a folder named "data"
# df = pd.read_csv("data/conversational_data.csv", encoding='utf-8')  # 👈 replace with your actual path
# df.dropna(inplace=True)
# df.reset_index(drop=True, inplace=True)

# # Show sample
# df.head()


#### Initialize GPT-2 Tokenizer

In [10]:
# from transformers import GPT2Tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token  # ✅ Important for GPT2 which doesn't have PAD token
# tokenizer.save_pretrained("models/gpt2_tokenizer/")


#### Tokenize input & target

In [11]:
# from torch.utils.data import Dataset

# class ChatDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_length=64):
#         self.tokenizer = tokenizer
#         self.data = dataframe
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         input_text = self.data.iloc[idx]['input']
#         target_text = self.data.iloc[idx]['target']

#         input_encoding = self.tokenizer(
#             input_text,
#             truncation=True,
#             padding='max_length',
#             max_length=self.max_length,
#             return_tensors="pt"
#         )

#         target_encoding = self.tokenizer(
#             target_text,
#             truncation=True,
#             padding='max_length',
#             max_length=self.max_length,
#             return_tensors="pt"
#         )

#         return {
#             'input_ids': input_encoding['input_ids'].squeeze(),
#             'attention_mask': input_encoding['attention_mask'].squeeze(),
#             'labels': target_encoding['input_ids'].squeeze()
#         }


#### Stack & Save Tensor Data

In [12]:
# import pickle

# with open("data/processed_dataset.pkl", "wb") as f:
#     pickle.dump(dataset, f)


In [13]:

# # Function to tokenize the dataset
# def tokenize_data(data):
#     input_texts = data['input'].tolist()  # 'input' column
#     target_texts = data['target'].tolist()  # 'target' column
    
#     # Tokenize both input and output texts
#     input_ids = tokenizer(input_texts, truncation=True, padding=True, max_length=50, return_tensors="pt")
#     target_ids = tokenizer(target_texts, truncation=True, padding=True, max_length=50, return_tensors="pt")
    
#     return input_ids, target_ids

In [14]:
# # Tokenize the data
# input_ids, target_ids = tokenize_data(data)

In [15]:
# # Check the tokenized outputs
# print(input_ids)
# print(target_ids)

In [16]:
# # Optional: Save tokenizer
# tokenizer.save_pretrained("models/gpt2_tokenizer/")