# **Environment Setup**

# First, install necessary libraries:

In [1]:
!pip install kaggle kagglehub transformers torch pandas


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# **Upload Kaggle API Key**

In [2]:
from google.colab import files
files.upload()  # Upload kaggle.json


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"samwelmotari","key":"8338058c06b166bfd740b2e4955a3bb7"}'}

# **Move it to the correct directory:**

In [4]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


mv: cannot stat 'kaggle.json': No such file or directory


# **📌 Step 2: Download the Dataset**

In [8]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("tariqsays/chatgpt-twitter-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/tariqsays/chatgpt-twitter-dataset?dataset_version_number=1...


100%|██████████| 8.56M/8.56M [00:01<00:00, 7.91MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/tariqsays/chatgpt-twitter-dataset/versions/1


In [9]:
import os

dataset_path = "/root/.cache/kagglehub/datasets/tariqsays/chatgpt-twitter-dataset/versions/1"
print(os.listdir(dataset_path))


['chatgpt1.csv']


# **Step 1: Load the Dataset**

In [10]:
import pandas as pd

dataset_path = "/root/.cache/kagglehub/datasets/tariqsays/chatgpt-twitter-dataset/versions/1/chatgpt1.csv"
df = pd.read_csv(dataset_path)

# Show first few rows and column names
print(df.head())
print(df.columns)


                    Datetime             Tweet Id  \
0  2023-01-22 13:44:34+00:00  1617156270871699456   
1  2023-01-22 13:44:39+00:00  1617156291046133761   
2  2023-01-22 13:44:44+00:00  1617156308926349312   
3  2023-01-22 13:44:49+00:00  1617156332297256961   
4  2023-01-22 13:44:52+00:00  1617156345064570880   

                                                Text         Username  \
0  ChatGPTで遊ぶの忘れてた！！\n書類作るコード書いてみてほしいのと、\nどこまで思考整...      mochico0123   
1  @AlexandrovnaIng Prohibition of ChatGPT has be...  Caput_LupinumSG   
2  Schaut Euch an, was @fobizz @DianaKnodel alles...            ciffi   
3  Bow down to chatGPT 🫡..... https://t.co/ENTSzi...    Vishwasrisiri   
4  Profilinde vatan, Türkiye falan yazan bireyler...   0xGenetikciniz   

                                           Permalink  \
0  https://twitter.com/mochico0123/status/1617156...   
1  https://twitter.com/Caput_LupinumSG/status/161...   
2  https://twitter.com/ciffi/status/1617156308926...   
3  https://twitter

# ** Step 2: Check for Missing Values**

In [11]:
print(df.isnull().sum())


Datetime              0
Tweet Id              0
Text                  0
Username              0
Permalink             0
User                  0
Outlinks          30059
CountLinks        30059
ReplyCount            0
RetweetCount          0
LikeCount             0
QuoteCount            0
ConversationId        0
Language              0
Source                0
Media             40499
QuotedTweet       46438
MentionedUsers    32832
hashtag               0
hastag_counts         0
dtype: int64


📌 Step 3: Preprocess the Text Data

We'll clean the text by:




*   Converting to lowercase
*   Removing URLs, mentions, hashtags, and special characters
*   Removing extra spaces
*   List item


✅


In [12]:
import re

def preprocess_text(text):
    """Clean and normalize text data."""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r"#\w+", "", text)  # Remove hashtags
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# Apply preprocessing
df["clean_text"] = df["Text"].apply(preprocess_text)

# Show processed data
print(df[["Text", "clean_text"]].head())


                                                Text  \
0  ChatGPTで遊ぶの忘れてた！！\n書類作るコード書いてみてほしいのと、\nどこまで思考整...   
1  @AlexandrovnaIng Prohibition of ChatGPT has be...   
2  Schaut Euch an, was @fobizz @DianaKnodel alles...   
3  Bow down to chatGPT 🫡..... https://t.co/ENTSzi...   
4  Profilinde vatan, Türkiye falan yazan bireyler...   

                                          clean_text  
0                                            chatgpt  
1  prohibition of chatgpt has been added to the h...  
2  schaut euch an was alles auf die piste bringt ...  
3                                bow down to chatgpt  
4  profilinde vatan trkiye falan yazan bireylerin...  


# **Step 4: Tokenization**

Now, we'll tokenize the cleaned text using a pre-trained tokenizer (e.g., bert-base-uncased).

In [13]:
from transformers import AutoTokenizer

# Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the text
df["tokenized"] = df["clean_text"].apply(lambda x: tokenizer.encode(x, truncation=True, max_length=50))

# Show processed tokens
print(df[["clean_text", "tokenized"]].head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

                                          clean_text  \
0                                            chatgpt   
1  prohibition of chatgpt has been added to the h...   
2  schaut euch an was alles auf die piste bringt ...   
3                                bow down to chatgpt   
4  profilinde vatan trkiye falan yazan bireylerin...   

                                           tokenized  
0                     [101, 11834, 21600, 2102, 102]  
1  [101, 13574, 1997, 11834, 21600, 2102, 2038, 2...  
2  [101, 8040, 3270, 4904, 7327, 2818, 2019, 2001...  
3   [101, 6812, 2091, 2000, 11834, 21600, 2102, 102]  
4  [101, 11268, 18622, 13629, 12436, 5794, 19817,...  


Step 5: Create a PyTorch Dataset

PyTorch requires data to be in Dataset format. We will:
*   Convert text tokens into tensors

*   Pad sequences to a fixed length
*   Create a custom TweetDataset

In [14]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Define dataset class
class TweetDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=50):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return encoding["input_ids"].squeeze(0), encoding["attention_mask"].squeeze(0)

# Convert text to dataset
tweet_dataset = TweetDataset(df["clean_text"].tolist(), tokenizer)

# Create DataLoader
train_loader = DataLoader(tweet_dataset, batch_size=16, shuffle=True)

# Check sample batch
for batch in train_loader:
    input_ids, attention_mask = batch
    print("Input IDs:", input_ids.shape)
    print("Attention Mask:", attention_mask.shape)
    break  # Print only first batch


Input IDs: torch.Size([16, 50])
Attention Mask: torch.Size([16, 50])


# ** Step 6: Define a Small Transformer Model
 **
We'll build a lightweight transformer-based model with:
✅ Embedding layer to represent words as vectors
✅ Transformer encoder to process input sequences
✅ Linear layer for output predictions

In [15]:
import torch.nn as nn
import torch

class SmallLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_heads=4, hidden_dim=256, num_layers=2):
        super(SmallLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim),
            num_layers=num_layers
        )
        self.fc_out = nn.Linear(embed_dim, vocab_size)  # Predict next token

    def forward(self, x, mask=None):
        x = self.embedding(x)  # Convert input IDs to embeddings
        x = self.transformer(x, src_key_padding_mask=mask)  # Transformer encoder
        x = self.fc_out(x)  # Output layer
        return x

# Define model parameters
VOCAB_SIZE = tokenizer.vocab_size  # Use BERT's vocab size

# Initialize model
model = SmallLanguageModel(VOCAB_SIZE)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Print model architecture
print(model)


SmallLanguageModel(
  (embedding): Embedding(30522, 128)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc_out): Linear(in_features=128, out_features=30522, bias=True)
)




# **Step 7: Define Loss and Optimizer**

Since this is a language model, we use:
✅ CrossEntropyLoss (for predicting next words)
✅ Adam optimizer

In [16]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


# **Step 8: Train the Model**
Now, we set up the training loop:
* Forward pass through the model
* Compute loss
* Backpropagation
* Update weights

In [17]:
EPOCHS = 3  # Adjust based on dataset size
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids)
        loss = criterion(outputs.view(-1, VOCAB_SIZE), input_ids.view(-1))  # Compare predicted vs actual

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

print("Training complete! 🚀")


KeyboardInterrupt: 

# **Step 9: Generate Text from the Model**

In [None]:
import torch.nn.functional as F

def generate_text(model, tokenizer, prompt, max_length=20):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    for _ in range(max_length):
        with torch.no_grad():
            output = model(input_ids)  # Get model predictions
            next_token_logits = output[:, -1, :]  # Last token predictions
            next_token = torch.argmax(F.softmax(next_token_logits, dim=-1), dim=-1)  # Pick most likely word

            # Stop if EOS (end-of-sequence) token is generated
            if next_token.item() == tokenizer.eos_token_id:
                break

            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)  # Append to input

    return tokenizer.decode(input_ids.squeeze(), skip_special_tokens=True)

# Example prompt
prompt_text = "Artificial intelligence is"
generated_text = generate_text(model, tokenizer, prompt_text)

print("Generated Text:", generated_text)


Happy coding