<a href="https://colab.research.google.com/github/pythonuzgit/elmurodov/blob/master/Graph%20Neural%20Network/Graph_Neural_Network_in_NLP_with_Movies_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910454 sha256=4a1173eff1c699fe25bc55024335f165c52eb9b810c6f846411825c364a455dc
  Stored in directory: /root/.cache/pip/wheels/ac/dc/30/e2874821ff308ee67dcd7a66dbde912411e19e35a1addda028
Successfully built torch_geometric
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.3.1


Import

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

Load the Movies dataset

In [None]:
# Read the movies CSV; the absolute /content path assumes the file has been
# uploaded to the Colab session storage.
df = pd.read_csv('/content/Movies_dataset.csv')

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,index,title,original_language,release_date,popularity,vote_average,vote_count,overview
0,0,Spider-Man: Across the Spider-Verse,en,31-05-2023,3368.627,8.5,3386,"After reuniting with Gwen Stacy, Brooklyn’s fu..."
1,1,Transformers: Rise of the Beasts,en,06-06-2023,2160.316,7.5,2747,When a new threat capable of destroying the en...
2,2,The Flash,en,13-06-2023,2108.713,7.0,2305,When his attempt to save his family inadverten...
3,3,No Hard Feelings,en,15-06-2023,1978.358,7.1,562,"On the brink of losing her childhood home, Mad..."
4,4,Barbie,en,19-07-2023,1976.513,7.5,3013,Barbie and Ken are having the time of their li...


In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(10000, 8)

In [None]:
# Column labels available for the selection step further down.
df.columns

Index(['index', 'title', 'original_language', 'release_date', 'popularity',
       'vote_average', 'vote_count', 'overview'],
      dtype='object')

In [None]:
# Count the missing values in each column.
df.isnull().sum()

index                 0
title                 0
original_language     0
release_date         18
popularity            0
vote_average          0
vote_count            0
overview             95
dtype: int64

In [None]:
# Replace every missing entry with an empty string so the text vectoriser never
# sees NaN. Rebinding the name (instead of inplace=True) avoids mutating the
# object that earlier cells already displayed.
df = df.fillna(value='')

Select the relevant columns

In [None]:
# Keep only the identifier and the two free-text fields used downstream.
selected_columns = ['index', 'title', 'overview']
df = df.loc[:, selected_columns]

In [None]:
def preprocess_data(data):
    """Vectorise the text columns, split train/test and build the training frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'index', 'title' and 'overview'. The text
        columns must not contain missing values.

    Returns
    -------
    processed_data : pandas.DataFrame
        Training rows. Holds one column per name in ``data.columns[:-1]``
        filled from the leading columns of the training feature matrix, with
        the 'index' column then overwritten by the training targets.
    X_test : numpy.ndarray
        Scaled feature matrix of the held-out rows (all feature columns).
    y_test : pandas.Series
        'index' values of the held-out rows.
    """
    # Bag-of-words counts. The vectoriser is re-fitted for each column, so the
    # two matrices have independent vocabularies.
    vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
    title_features = vectorizer.fit_transform(data['title'])
    overview_features = vectorizer.fit_transform(data['overview'])

    # Standardise each block. `.toarray()` materialises dense matrices, which
    # is memory-hungry for large vocabularies.
    # NOTE(review): vectoriser and scaler are fitted on all rows before the
    # split, so test-set statistics leak into the training features.
    scaler = StandardScaler()
    title_features = scaler.fit_transform(title_features.toarray())
    overview_features = scaler.fit_transform(overview_features.toarray())

    # Features are the concatenated blocks; the target is the 'index' column.
    X = np.concatenate((title_features, overview_features), axis=1)
    y = data['index']

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Every input column name except the last is reused as a feature label.
    # NOTE(review): this keeps only the first len(column_names) columns of
    # X_train, and the one stored under 'index' is overwritten just below, so
    # very few features survive. Confirm this is intended.
    column_names = list(data.columns)[:-1]
    processed_data = pd.DataFrame({col: X_train[:, i] for i, col in enumerate(column_names)})

    # Assign the targets positionally. `y_train` still carries the shuffled row
    # labels of `data`; assigning the Series directly would align on those
    # labels against the fresh RangeIndex, misplacing targets and inserting NaN.
    processed_data['index'] = np.asarray(y_train)

    return processed_data, X_test, y_test

In [None]:
processed_data, X_test, y_test = preprocess_data(df)

Create graph representation

In [None]:

def create_graph_representation(data):
    """Wrap the processed frame in a torch_geometric ``Data`` object.

    Each row of ``data`` becomes one node. Node features are all columns except
    'index'; the 'index' column becomes the node label ``y``. The only edges
    are self-loops (node i -> node i).

    NOTE(review): with self-loops only, no information flows between different
    nodes during graph convolution. Confirm this is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an 'index' column plus at least one feature column.

    Returns
    -------
    torch_geometric.data.Data
        Graph with ``x`` (float), ``edge_index`` (long, shape (2, N)) and
        ``y`` (long).
    """
    # Build the self-loop edge list directly as a (2, N) long tensor instead of
    # a NumPy array assembled by a Python loop. An empty frame yields (2, 0).
    node_ids = torch.arange(len(data), dtype=torch.long)
    edge_index = torch.stack((node_ids, node_ids), dim=0)

    x = torch.tensor(data.drop(['index'], axis=1).values, dtype=torch.float)
    y = torch.tensor(data['index'].values, dtype=torch.long)
    return Data(x=x, edge_index=edge_index, y=y)

In [None]:
graph_data = create_graph_representation(processed_data)

Define the GNN model

In [None]:

class GNNModel(nn.Module):
    """Two stacked GCNConv layers that emit per-node log-probabilities."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the layers: input_dim -> hidden_dim -> output_dim."""
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        """Return log-softmax scores over the last layer's outputs (dim=1)."""
        hidden = F.relu(self.conv1(x, edge_index))
        logits = self.conv2(hidden, edge_index)
        return F.log_softmax(logits, dim=1)

Set the device for computation

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Initialize the GNN model

In [None]:

# Width of the hidden GCN layer.
hidden_dim = 64
# One input unit per node feature in the graph.
input_dim = graph_data.num_node_features
# One output unit per distinct value found in the 'index' column.
output_dim = len(processed_data['index'].unique())

model = GNNModel(input_dim, hidden_dim, output_dim)
model = model.to(device)

Define the loss function and optimizer

In [None]:

# Negative log-likelihood loss; the model's forward pass ends in log_softmax.
criterion = nn.NLLLoss()
# Adam over every model parameter with a learning rate of 0.01.
optimizer = optim.Adam(model.parameters(), lr=1e-2)

Define the custom collate function

In [None]:
def collate_fn(batch):
    """Identity collate: hand the list of samples back unchanged."""
    return batch

# Create the DataLoader
# NOTE: both `collate_fn` and `data_loader` are redefined in the following cell.
data_loader = DataLoader(
    [graph_data],
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
)

Redefine the custom collate function so each batch is a dict of fields (this replaces the version in the previous cell)

In [None]:

def collate_fn(batch):
    """Unpack a batch of graph objects into parallel lists keyed by field name.

    Returns a dict with the keys 'x', 'edge_index' and 'y'; each value is a
    list holding that attribute of every item in ``batch``, in batch order.
    """
    xs, edge_indices, ys = [], [], []
    for graph in batch:
        xs.append(graph.x)
        edge_indices.append(graph.edge_index)
        ys.append(graph.y)
    return {'x': xs, 'edge_index': edge_indices, 'y': ys}

# Convert the graph data to PyTorch DataLoader
data_loader = DataLoader(dataset=[graph_data], batch_size=1, collate_fn=collate_fn)

In [None]:
# Single optimisation pass over the loader (one full-graph batch).
model.train()
for data in data_loader:
    # `collate_fn` wraps every field in a one-element list. as_tensor converts
    # without the extra copy (and UserWarning) that torch.tensor() incurs when
    # handed an existing tensor, and it also accepts a NumPy edge list.
    x = torch.as_tensor(data['x'][0], dtype=torch.float).to(device)
    edge_index = torch.as_tensor(data['edge_index'][0], dtype=torch.long).to(device)
    y = torch.as_tensor(data['y'][0], dtype=torch.long).to(device)

    # Re-encode the raw labels as contiguous class ids 0..K-1, where K is the
    # number of distinct labels (the same count used for the model's
    # `output_dim`). Min-max scaling to [0, 1] followed by .long() would
    # truncate every label below the maximum to class 0.
    _, targets = torch.unique(y, return_inverse=True)

    optimizer.zero_grad()
    out = model(x, edge_index)

    # Verify the target values range
    print(f"Min target value: {torch.min(targets).item()}")
    print(f"Max target value: {torch.max(targets).item()}")

    loss = criterion(out, targets)
    loss.backward()
    optimizer.step()

  x = torch.tensor(data['x'][0], dtype=torch.float).to(device)
  y = torch.tensor(data['y'][0], dtype=torch.float).to(device)


Min target value: 0.0
Max target value: 1.0


Testing the model

In [None]:

# NOTE(review): this iterates the same `data_loader` that was used for
# training, so the figure is training accuracy; X_test / y_test are unused.
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for data in data_loader:
        # as_tensor avoids the copy/UserWarning of torch.tensor() on tensors.
        x = torch.as_tensor(data['x'][0], dtype=torch.float).to(device)
        edge_index = torch.as_tensor(data['edge_index'][0], dtype=torch.long).to(device)
        y = torch.as_tensor(data['y'][0], dtype=torch.long).to(device)

        # Contiguous class ids 0..K-1 (K = number of distinct labels, i.e. the
        # model's `output_dim`). Predictions are integer class ids, so they
        # must be compared with integer class ids, not with float targets
        # min-max scaled into [0, 1].
        _, targets = torch.unique(y, return_inverse=True)

        out = model(x, edge_index)
        predicted = out.argmax(dim=1)
        total_correct += (predicted == targets).sum().item()
        total_samples += targets.size(0)

# Guard against an empty loader instead of dividing by zero.
accuracy = total_correct / total_samples if total_samples else 0.0
print(f"Accuracy: {accuracy * 100:.2f}%")



  x = torch.tensor(data['x'][0], dtype=torch.float).to(device)
  y = torch.tensor(data['y'][0], dtype=torch.float).to(device)


Accuracy: 79.84%
