## Imports

In [1]:
import pandas as pd
import dgl
import torch
import nltk
from nltk.tokenize import word_tokenize
import plotly.graph_objects as go
import networkx as nx
import dgl.nn as dglnn
from dgl import function as fn
from transformers import GPT2Tokenizer, GPT2Model
import dgl.nn as dglnn
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


## Submission Flag

In [2]:
is_submission = False

## Read Datasets

In [15]:
train_path = r"data\train_essays.csv" if not is_submission else r"/kaggle/input/llm-detect-ai-generated-text/train_essays.csv"
test_path = r"data\test_essays.csv" if not is_submission else r"/kaggle/input/llm-detect-ai-generated-text/test_essays.csv"
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [19]:
train_data['prompt_id'].value_counts()

prompt_id
0    708
1    670
Name: count, dtype: int64

In [4]:
train_texts, test_texts, train_labels, test_labels = train_test_split(train_data['text'].values, train_data['generated'].values, test_size=0.33, random_state=42)

## Build Graph

In [5]:
# Sample text
text = train_texts[0]

# Tokenize the text using NLTK
tokens = word_tokenize(text)

# Create a directed graph with DGL
g = dgl.DGLGraph()
g.add_nodes(len(tokens))  # Add nodes to the graph based on the number of tokens

# Add edges between consecutive tokens to create a sequential graph
src = list(range(len(tokens) - 1))
dst = list(range(1, len(tokens)))
g.add_edges(src, dst)

# Convert DGL graph to NetworkX graph
nx_g = g.to_networkx().to_undirected()

# Create Plotly figure
fig = go.Figure()

# Add nodes
pos = nx.spring_layout(nx_g)
node_trace = go.Scatter(
    x=[pos[node][0] for node in nx_g],
    y=[pos[node][1] for node in nx_g],
    text=tokens,
    mode='markers+text',
    marker=dict(size=10, color='blue'),
    textposition='top center'
)
fig.add_trace(node_trace)

# Add edges
for edge in nx_g.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    fig.add_trace(go.Scatter(
        x=[x0, x1, None],
        y=[y0, y1, None],
        mode='lines',
        line=dict(color='black', width=1)
    ))

# Update layout
fig.update_layout(
    title='Tokenized Text Visualization',
    showlegend=False,
    hovermode='closest',
    margin=dict(b=20, l=5, r=5, t=40),
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
)

# Show plot
fig.show()


Global warming is a huge concern, and most of the blame is on cars. In Paris bans driving due to smog" by Robert Duffer, Andrew Selsky's Carfree day is spinning into a big hit in Bogota , and elizabeth Rosenthal's In German Suburb, Life Goes On Without Cars and The End of Car Culture they explain everything happens in areas where they suspend car usage, regulate it, ban it and seeing trends of areas where they just do not care about cars. Limiting the use of vehicles can have a enormous beneficial impact.

One reason for a minial car usage, is that people tend to get stressed with cars in order to do errands or go to work. Following with Rosenthal's article, she mentions a media trainer with two childern named Heidrun. The mother states "When I had a car I was always tense. I'm much happier this way." In Vauban, many people move there with having sold a car that particular area has 30% of families using cars. Many people want to worry less about having to errands. It is much easier to 



In [13]:
num_nodes = 539
# Define a graph convolutional layer
conv = dglnn.GraphConv(in_feats=1, out_feats=1)

# Define a pooling function
def simple_pool(g, nodes):
    return {'h': torch.mean(nodes.mailbox['m'], dim=1)}  # Calculate mean of node features

# Add self-loops to the graph
g = dgl.add_self_loop(g)

# Perform graph convolution
h = torch.ones(num_nodes, 1)  # Initialize node features
for i in range(5):  # Example: Perform 5 iterations of graph convolution
    g.ndata['h'] = h
    g.update_all(fn.copy_u('h', 'm'), fn.mean('m', 'h_neigh'))  # Updated message passing functions
    h = torch.relu(conv(g, h))

# Convert DGL graph to NetworkX graph
nx_g = g.to_networkx().to_undirected()

# Get the layout for plotting
pos = nx.spring_layout(nx_g)

# Create edge traces
edge_x = []
edge_y = []
for edge in nx_g.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Create node traces
node_trace = go.Scatter(
    x=[],  # Initialize 'x' as an empty list
    y=[],  # Initialize 'y' as an empty list
    text=list(nx_g.nodes()),  # Show node IDs as text
    mode='markers+text',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        reversescale=True,
        color=[],
        size=10,
        colorbar=dict(
            thickness=15,
            title='Node Connections',
            xanchor='left',
            titleside='right'
        ),
        line=dict(width=2)))

# Extract node coordinates and store them in lists
x_list = []
y_list = []
for node in nx_g.nodes():
    x, y = pos[node]
    x_list.append(x)
    y_list.append(y)

# Update node_trace 'x' and 'y' with the extracted coordinates
node_trace['x'] = x_list
node_trace['y'] = y_list


fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='Graph Visualization',
                titlefont=dict(size=16),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20, l=5, r=5, t=40),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
             ))

fig.show()

## Create Dataset

## Create Model

In [32]:
batch_size = 32  # Define your desired batch size

# Split train_texts, train_labels into batches
train_text_batches = [train_texts[i:i+batch_size] for i in range(0, len(train_texts), batch_size)]
train_label_batches = [train_labels[i:i+batch_size] for i in range(0, len(train_labels), batch_size)]

# Split test_texts, test_labels into batches
test_text_batches = [test_texts[i:i+batch_size] for i in range(0, len(test_texts), batch_size)]
test_label_batches = [test_labels[i:i+batch_size] for i in range(0, len(test_labels), batch_size)]

# Now, iterate through the batches and perform your processing for each batch
for train_texts_batch, train_labels_batch in zip(train_text_batches, train_label_batches):
    # Initialize GPT-2 tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2Model.from_pretrained('gpt2')
    # default to left padding
    tokenizer.padding_side = "left"
    # Define PAD Token = EOS Token = 50256
    tokenizer.pad_token = tokenizer.eos_token


    # Tokenize texts and get token embeddings
    encoded_texts = tokenizer(train_texts_batch, padding=True, return_tensors='pt', truncation=True)
    
    print(encoded_texts)

    # Create a graph where tokens are nodes and their embeddings are node features
    graphs = []
    for i, text in enumerate(train_texts_batch):
        num_tokens = len(tokenizer.tokenize(text))
        g = dgl.graph(([i] * num_tokens, list(range(num_tokens))), num_nodes=num_tokens)
        # Convert the NumPy array to a PyTorch tensor
        embeddings_tensor = torch.tensor(embeddings[i][:num_tokens])

        # Ensure the tensor is placed on the correct device (CPU or GPU)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        embeddings_tensor = embeddings_tensor.to(device)

        # Assign the tensor to the node data in the DGL graph
        g.ndata['embeddings'] = embeddings_tensor
        graphs.append(g)

    # Concatenate graphs into a single batched graph
    batched_graph = dgl.batch(graphs)

    # Create a synthetic dataset with labels
    labels = torch.tensor(train_labels_batch)
    dataset = dgl.data.DGLDataset({'graph': batched_graph, 'label': labels})

    # Define and train the GraphConv-based classifier
    class GraphConvClassifier(nn.Module):
        def __init__(self, in_feats, hidden_feats, out_feats):
            super(GraphConvClassifier, self).__init__()
            self.conv1 = dglnn.GraphConv(in_feats, hidden_feats)
            self.conv2 = dglnn.GraphConv(hidden_feats, out_feats)

        def forward(self, g, features):
            h = self.conv1(g, features)
            h = torch.relu(h)
            h = self.conv2(g, h)
            return h

    # Model parameters
    input_dim = embeddings.size(-1)
    hidden_dim = 64
    output_dim = 1  # Single output for binary classification

    # Initialize the graph classification model
    model = GraphConvClassifier(input_dim, hidden_dim, output_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCEWithLogitsLoss()

    # Training loop
    epochs = 50
    for epoch in range(epochs):
        model.train()
        logits = model(batched_graph, batched_graph.ndata['embeddings'])
        loss = criterion(logits.squeeze(), labels.float())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")


for test_texts_batch, test_labels_batch in zip(test_text_batches, test_label_batches):
    # Perform processing for testing batches

    # Predict labels for test set 
    # Tokenize and obtain embeddings for new texts
    encoded_new_texts = tokenizer(test_texts_batch, padding=True, return_tensors='pt', truncation=True)
    with torch.no_grad():
        outputs = model(**encoded_new_texts)
    new_embeddings = outputs.last_hidden_state

    # Create graphs for new texts
    new_graphs = []
    for i, text in enumerate(test_texts_batch):
        num_tokens = len(tokenizer.tokenize(text))
        g = dgl.graph(([i] * num_tokens, list(range(num_tokens))), num_nodes=num_tokens)
        g.ndata['embeddings'] = new_embeddings[i][:num_tokens].detach().numpy()
        new_graphs.append(g)

    # Concatenate graphs into a single batched graph for prediction
    batched_new_graph = dgl.batch(new_graphs)

    # Make predictions for the new texts
    predictions_logits = model(batched_new_graph, batched_new_graph.ndata['embeddings']).squeeze()
    predictions = torch.round(torch.sigmoid(predictions_logits)).cpu().detach().numpy()

    print("Predictions:", predictions)

{'input_ids': tensor([[50256, 50256, 50256,  ...,  1365,   995,    13],
        [50256, 50256, 50256,  ...,  1342, 12231,    13],
        [50256, 50256, 50256,  ..., 11675,  3071,    13],
        ...,
        [50256, 50256, 50256,  ..., 21565,  6942,    13],
        [50256, 50256, 50256,  ...,   262,  4675,    13],
        [50256, 50256, 50256,  ..., 15596,   935,    13]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]])}



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



DGLError: Expect number of features to match number of nodes (len(u)). Got 1024 and 1494 instead.

## Fit Model

In [None]:
# Loss function
loss_func = nn.CrossEntropyLoss()
#Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
# Assume your dataset consists of graphs and labels
num_epochs = 10
for epoch in range(num_epochs):
    for graph, label in your_dataset:  # Iterate through the dataset
        # Perform a forward pass
        features = graph.ndata['h']  # Assuming node features are stored in 'feat'
        prediction = model(graph, features)
        
        # Calculate loss
        loss = loss_func(prediction, label.unsqueeze(0))  # Assuming label is a tensor
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Optional: Print loss for each epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")


## Predict Test Set

## Create Submission

In [None]:
submission = pd.DataFrame({"id": test_data["id"], "generated": predictions})
submission_path = r"data\submission.csv" if not is_submission else r"/kaggle/working/submission.csv"
submission.to_csv(submission_path, index=False)