<a href="https://colab.research.google.com/github/plue1011/GNN/blob/master/RecSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric](https://towardsdatascience.com/hands-on-graph-neural-networks-with-pytorch-pytorch-geometric-359487e221a8)

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# PyTorch Geometric

## install

[公式リファレンス](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html)

In [0]:
!pip install torch-scatter==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.4.0.html
!pip install torch-sparse==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.4.0.html
!pip install torch-cluster==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.4.0.html
!pip install torch-spline-conv==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.4.0.html
!pip install torch-geometric

Looking in links: https://pytorch-geometric.com/whl/torch-1.4.0.html
Collecting torch-scatter==latest+cu101
[?25l  Downloading https://pytorch-geometric.com/whl/torch-1.4.0/torch_scatter-latest%2Bcu101-cp36-cp36m-linux_x86_64.whl (10.6MB)
[K     |████████████████████████████████| 10.6MB 2.5MB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.4
Looking in links: https://pytorch-geometric.com/whl/torch-1.4.0.html
Collecting torch-sparse==latest+cu101
[?25l  Downloading https://pytorch-geometric.com/whl/torch-1.4.0/torch_sparse-latest%2Bcu101-cp36-cp36m-linux_x86_64.whl (15.2MB)
[K     |████████████████████████████████| 15.2MB 207kB/s 
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.1
Looking in links: https://pytorch-geometric.com/whl/torch-1.4.0.html
Collecting torch-cluster==latest+cu101
[?25l  Downloading https://pytorch-geometric.com/whl/torch-1.4.0/torch_cluster-latest%2Bcu101-cp36-cp36m-linux_x86_

In [0]:
import numpy as np
import pandas as pd
import itertools
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import torch
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset
from torch_geometric.data import DataLoader
from torch_geometric.nn import TopKPooling
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import global_mean_pool as gap
from torch_geometric.nn import global_max_pool as gmp
# from torch_geometric.nn import MessagePassing
# from torch_geometric.utils import remove_self_loops, add_self_loops

from tqdm.notebook import tqdm as tqdm

%matplotlib inline

# データの読み込み

In [0]:
path = "/content/drive/My Drive/fueki/GNN/dataset/RecSys/"

In [0]:
df = pd.read_csv(path+"yoochoose-clicks.dat", header=None)
df.columns=['session_id','timestamp','item_id','category']

buy_df = pd.read_csv(path+"yoochoose-buys.dat", header=None)
buy_df.columns=['session_id','timestamp','item_id','price','quantity']

# https://qiita.com/yoshimo123/items/3717bd17ba74764dbc69
item_encoder = LabelEncoder()
df['item_id'] = item_encoder.fit_transform(df.item_id)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,2053,0
1,1,2014-04-07T10:54:09.868Z,2052,0
2,1,2014-04-07T10:54:46.998Z,2054,0
3,1,2014-04-07T10:57:00.306Z,9876,0
4,2,2014-04-07T13:56:37.614Z,19448,0


In [0]:
buy_df.head()

Unnamed: 0,session_id,timestamp,item_id,price,quantity
0,420374,2014-04-06T18:44:58.314Z,214537888,12462,1
1,420374,2014-04-06T18:44:58.325Z,214537850,10471,1
2,281626,2014-04-06T09:40:13.032Z,214535653,1883,1
3,420368,2014-04-04T06:13:28.848Z,214530572,6073,1
4,420368,2014-04-04T06:13:28.858Z,214835025,2617,1


# データの前処理

In [0]:
# データ数が多いため、サンプリングする
sampled_session_id = np.random.choice(df.session_id.unique(), 1000000, replace=False)
df = df.loc[df.session_id.isin(sampled_session_id)]
df.nunique()

session_id    1000000
timestamp     3564874
item_id         35651
category          228
dtype: int64

In [0]:
# 購入フラグ
df['label'] = df.session_id.isin(buy_df.session_id)
df.head()

Unnamed: 0,session_id,timestamp,item_id,category,label
0,1,2014-04-07T10:51:09.277Z,2053,0,False
1,1,2014-04-07T10:54:09.868Z,2052,0,False
2,1,2014-04-07T10:54:46.998Z,2054,0,False
3,1,2014-04-07T10:57:00.306Z,9876,0,False
4,2,2014-04-07T13:56:37.614Z,19448,0,False


# Dataset作成

In [0]:
class YooChooseDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(YooChooseDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return []
    @property
    def processed_file_names(self):
        return ["/content/drive/My Drive/fueki/GNN/dataset/RecSys/yoochoose_click_binary_1M_sess.dataset"]

    def download(self):
        pass
    
    def process(self):
        
        data_list = []

        # process by session_id
        grouped = df.groupby('session_id')
        for session_id, group in tqdm(grouped):
            # itemをidに変換
            sess_item_id = LabelEncoder().fit_transform(group.item_id)
            group = group.reset_index(drop=True)
            group['sess_item_id'] = sess_item_id

            # 同じsession中のitem番号を抽出
            node_features = group.loc[group.session_id==session_id,['sess_item_id','item_id']].sort_values('sess_item_id').item_id.drop_duplicates().values

            # node_features=[1, 2] => [[1],[2]]
            node_features = torch.LongTensor(node_features).unsqueeze(1)

            # itemを頂点として、itemの閲覧遷移で枝を接続させている [0,1,2,3] => [[0,1,2], [1,2,3]]
            source_nodes = group.sess_item_id.values[:-1]
            target_nodes = group.sess_item_id.values[1:]
            edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

            x = node_features

            # session内でlabelの値は一緒なため、0番目を見るだけ
            y = torch.FloatTensor([group.label.values[0]])

            data = Data(x=x, edge_index=edge_index, y=y)
            data_list.append(data)
        
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

process by session_idで行っていること

In [0]:
# grouped = df.groupby('session_id')
# data_list = []
# for session_id, group in tqdm(grouped):
#     # itemをidに変換
#     sess_item_id = LabelEncoder().fit_transform(group.item_id)
#     group = group.reset_index(drop=True)
#     group['sess_item_id'] = sess_item_id

#     # 同じsession中のitem番号を抽出
#     node_features = group.loc[group.session_id==session_id,['sess_item_id','item_id']].sort_values('sess_item_id').item_id.drop_duplicates().values

#     # node_features=[1, 2] => [[1],[2]]
#     node_features = torch.LongTensor(node_features).unsqueeze(1)

#     # itemを頂点として、itemの閲覧遷移で枝を接続させている [0,1,2,3] => [[0,1,2], [1,2,3]]
#     source_nodes = group.sess_item_id.values[:-1]
#     target_nodes = group.sess_item_id.values[1:]
#     edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
    
#     x = node_features

#     # session内でlabelの値は一緒なため、0番目を見るだけ
#     y = torch.FloatTensor([group.label.values[0]])

#     data = Data(x=x, edge_index=edge_index, y=y)
#     data_list.append(data)

In [0]:
# パスにデータが存在しないときデータを作成し、存在するときは読み込んでくれる
dataset = YooChooseDataset("/content/drive/My Drive/fueki/GNN/dataset/RecSys/")

Processing...


HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))

In [0]:
# train, val, testにデータを分割する
dataset = dataset.shuffle()
data_len = len(dataset)
train_pos = int(data_len * 0.8)
val_pos = train_pos + int(data_len * 0.1)
train_dataset = dataset[:train_pos]
val_dataset = dataset[train_pos:val_pos]
test_dataset = dataset[val_pos:]
print(len(train_dataset), len(val_dataset), len(test_dataset))

In [0]:
# DataLoaderの作成
batch_size= 512
train_loader = DataLoader(train_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# モデル

In [0]:
# modelの定義
embed_dim = 128
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = SAGEConv(embed_dim, 128)
        self.pool1 = TopKPooling(128, ratio=0.8)
        self.conv2 = SAGEConv(128, 128)
        self.pool2 = TopKPooling(128, ratio=0.8)
        self.conv3 = SAGEConv(128, 128)
        self.pool3 = TopKPooling(128, ratio=0.8)
        self.item_embedding = torch.nn.Embedding(num_embeddings=df.item_id.max() +1, embedding_dim=embed_dim)
        self.lin1 = torch.nn.Linear(256, 128)
        self.lin2 = torch.nn.Linear(128, 64)
        self.lin3 = torch.nn.Linear(64, 1)
        self.bn1 = torch.nn.BatchNorm1d(128)
        self.bn2 = torch.nn.BatchNorm1d(64)
        self.act1 = torch.nn.ReLU()
        self.act2 = torch.nn.ReLU()        
  
    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.item_embedding(x)
        x = x.squeeze(1)  # 1次元の特徴量       

        # 入力層から畳み込んでRelu
        x = F.relu(self.conv1(x, edge_index))

        # 
        x, edge_index, _, batch, _, _ = self.pool1(x, edge_index, None, batch)
        x1 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.conv2(x, edge_index))
     
        x, edge_index, _, batch, _, _ = self.pool2(x, edge_index, None, batch)
        x2 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.conv3(x, edge_index))

        x, edge_index, _, batch, _, _ = self.pool3(x, edge_index, None, batch)
        x3 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = x1 + x2 + x3

        x = self.lin1(x)
        x = self.act1(x)
        x = self.lin2(x)
        x = self.act2(x)      
        x = F.dropout(x, p=0.5, training=self.training)

        x = torch.sigmoid(self.lin3(x)).squeeze(1)

        return x

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
crit = torch.nn.BCELoss()

# 学習

In [0]:
# 学習関数
def train():
    model.train()
    loss_all = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        label = data.y.to(device)
        loss = crit(output, label)
        loss.backward()
        loss_all += data.num_graphs * loss.item()
        optimizer.step()
    return loss_all / len(train_dataset)

In [0]:
def evaluate(loader):
    model.eval()

    predictions = []
    labels = []

    with torch.no_grad():
        for data in loader:

            data = data.to(device)
            pred = model(data).detach().cpu().numpy()

            label = data.y.detach().cpu().numpy()
            predictions.append(pred)
            labels.append(label)

    predictions = np.hstack(predictions)
    labels = np.hstack(labels)
    
    return roc_auc_score(labels, predictions)

In [0]:
for epoch in tqdm(range(20)):
    loss = train()
    train_acc = evaluate(train_loader)
    val_acc = evaluate(val_loader)
    test_acc = evaluate(test_loader)
    print('Epoch: {:03d}, Loss: {:.5f}, Train Auc: {:.5f}, Val Auc: {:.5f}, Test Auc: {:.5f}'.
          format(epoch, loss, train_acc, val_acc, test_acc))