# Setup
Install necessary packages, mount drive

Notes and reminders below

* I belive this version of the DGL API only works on CPU. We need to find a version that works on GPU. Follow this setup guide here. However, it looks like most google collab notebooks use CUDA version 11.2+, which I don't think there's DGL support for yet (need to confirm). Check cuda version with !nvidia-smi

In [170]:
!pip install dgl
import dgl
import dgl.nn as dglnn
import dgl.function as fn
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import scipy.sparse as sp
import itertools

from sklearn.preprocessing import StandardScaler

from tqdm import tqdm, tqdm_notebook, tnrange



In [171]:
device = 'cpu'      # change to 'cuda' for GPU

In [172]:
#note if anyone other than Nick is running this, you may need to update paths
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [173]:
# Import the pre-build Austin TX dataset
users_ = pd.read_pickle('/content/drive/MyDrive/210: Capstone/w210: Capstone Team Folder/notebooks/data/austin_users.pkl')
ratings_ = pd.read_pickle('/content/drive/MyDrive/210: Capstone/w210: Capstone Team Folder/notebooks/data/austin_review_v2.pkl')
business_ = pd.read_pickle('/content/drive/MyDrive/210: Capstone/w210: Capstone Team Folder/notebooks/data/austin_business.pkl')


In [174]:
ratings_.head()

Unnamed: 0,user_id,business_id,rating,date,useful,funny,cool,text,rating_binary,split_label
20,1RCRKuHgP3FskGUVnmFdxg,mOnesB4IF9j6-ZmHoOHOig,4.0,2017-05-26 03:05:46,0,0,0,I think their rice dishes are way better than ...,1,train
23,d01SZKYmReEar7varZB0HQ,I2OblwJG8_XzFxxoHU0vVQ,4.0,2010-05-14 14:33:54,5,1,1,I just had my lasik done last week and I am re...,1,train
30,pRPT3vqhqpU7kHgmKJamvw,-_GnwXmzC3DXsHR9nyaC2g,3.0,2012-11-06 07:09:57,0,0,0,3.5 stars! I got the avocado margarita and it ...,0,test
41,uUrXZ2guG27PQUeR6u8K3w,WtDOs3a6k_oPJmwiDh4qBQ,2.0,2009-02-28 22:47:35,3,1,2,"I wanted this to be a great place, but I wasn'...",0,test
66,-5qiq9PWVeb0IICefvAHCQ,n66LuZ8NooZIcAfYvI4s5A,1.0,2010-08-20 20:35:26,3,4,2,So my roommate borrowed my clock radio and app...,0,train


In [175]:
#subset data to only have reviews of places where the rating was greater than 3
# indicating that the person "liked" the restaurant
ratings_subset = ratings_[ratings_['rating_binary'] == 1]
print("# of reviews in reduced dataset: {}".format(len(ratings_subset)))
print("# of reviews in full dataset: {}".format(len(ratings_)))

# of reviews in reduced dataset: 647007
# of reviews in full dataset: 922780


In [176]:
# Get a unique ID for each node starting at 0
user_ids = ratings_subset['user_id'].unique()
biz_ids = ratings_subset['business_id'].unique()
all_ids = np.append(user_ids, biz_ids)

#note - there are some user ids that are also business ids, removing these
all_ids = np.unique(all_ids)

global_id_map = {ele: i for i, ele in enumerate(all_ids)}
id_lookup = {v:k for k,v in global_id_map.items()}

In [177]:
#set up tuple of rating pairs (user, business) IDs
rating_pairs = (np.array([global_id_map[ele] for ele in ratings_subset["user_id"]],
                          dtype=np.int64),
                np.array([global_id_map[ele] for ele in ratings_subset["business_id"]],
                          dtype=np.int64))

In [178]:
g = dgl.graph(rating_pairs)

In [179]:
#subset user data to only have users who's IDs are covered in this new review subset
user_id_subset = ratings_subset['user_id'].unique()
users_subset = users_[users_['user_id'].isin(user_id_subset)]

scaler = StandardScaler()

#process user features - scale all
users_subset['user_review_count'] = scaler.fit_transform(users_subset['user_review_count'].values.astype(np.float32).reshape(-1,1))
users_subset['funny_reviews'] = scaler.fit_transform(users_subset['funny_reviews'].values.astype(np.float32).reshape(-1,1))
users_subset['cool_reviews'] = scaler.fit_transform(users_subset['cool_reviews'].values.astype(np.float32).reshape(-1,1))
users_subset['n_fans'] = scaler.fit_transform(users_subset['n_fans'].values.astype(np.float32).reshape(-1,1))
users_subset['average_stars'] = scaler.fit_transform(users_subset['average_stars'].values.astype(np.float32).reshape(-1,1))
users_subset['useful_reviews'] = scaler.fit_transform(users_subset['useful_reviews'].values.astype(np.float32).reshape(-1,1))

#simplified version just counting the number of years elite
users_subset['years_elite'] = scaler.fit_transform(np.asarray([len(x) for x in users_subset['years_elite']], dtype = np.float32).reshape(-1,1))

users_subset.drop(['user_name', 'user_yelp_since', 'friends'], axis=1, inplace=True)
users_subset.set_index('user_id', inplace=True)
users_subset.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

S

Unnamed: 0_level_0,user_review_count,useful_reviews,funny_reviews,cool_reviews,n_fans,years_elite,average_stars
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dIIKEfOgo0KqUfGQvGikPg,9.361635,13.362535,9.362913,13.001664,14.603428,7.577426,0.367995


In [180]:
#subset business data to only have biz who's IDs are covered in this new review subset
biz_id_subset = ratings_subset['business_id'].unique()
business_subset = business_[business_['business_id'].isin(biz_id_subset)]

drop_list = ['name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'is_open']

business_subset.drop(drop_list, axis=1, inplace=True)
business_subset.set_index('business_id', inplace=True)
business_subset.head(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0_level_0,stars,review_count,RestaurantsPriceRange2_2,RestaurantsPriceRange2_3,RestaurantsPriceRange2_4,RestaurantsPriceRange2_None,ByAppointmentOnly_None,ByAppointmentOnly_True,BusinessAcceptsCreditCards_None,BusinessAcceptsCreditCards_True,DogsAllowed_None,DogsAllowed_True,RestaurantsDelivery_None,RestaurantsDelivery_True,BusinessAcceptsBitcoin_True,BikeParking_None,BikeParking_True,RestaurantsTakeOut_None,RestaurantsTakeOut_True,WheelchairAccessible_None,WheelchairAccessible_True,WiFi_'no',WiFi_'paid',WiFi_None,WiFi_u'free',WiFi_u'no',WiFi_u'paid',AcceptsInsurance_None,AcceptsInsurance_True,RestaurantsGoodForGroups_None,RestaurantsGoodForGroups_True,HasTV_None,HasTV_True,RestaurantsReservations_None,RestaurantsReservations_True,OutdoorSeating_None,OutdoorSeating_True,NoiseLevel_'loud',NoiseLevel_'quiet',NoiseLevel_'very_loud',...,"BestNights_{'monday': True, 'tuesday': True, 'friday': False, 'wednesday': True, 'thursday': False, 'sunday': False, 'saturday': False}","BestNights_{'monday': True, 'tuesday': True, 'friday': True, 'wednesday': False, 'thursday': False, 'sunday': False, 'saturday': False}",BYOBCorkage_'yes_corkage',BYOBCorkage_'yes_free',BYOBCorkage_None,BYOBCorkage_u'yes_corkage',BYOBCorkage_u'yes_free',DriveThru_None,DriveThru_True,BYOB_None,BYOB_True,Corkage_None,Corkage_True,RestaurantsCounterService_True,AgesAllowed_u'21plus',AgesAllowed_u'allages',"DietaryRestrictions_{'dairy-free': False, 'gluten-free': False, 'vegan': False, 'kosher': False, 'halal': False, 'soy-free': False, 'vegetarian': True}","DietaryRestrictions_{'dairy-free': False, 'gluten-free': False, 'vegan': True, 'kosher': False, 'halal': False, 'soy-free': False, 'vegetarian': False}","DietaryRestrictions_{'dairy-free': False, 'gluten-free': True, 'vegan': False, 'kosher': False, 'halal': False, 'soy-free': False, 'vegetarian': False}","DietaryRestrictions_{'dairy-free': False, 'gluten-free': True, 'vegan': True, 'kosher': False, 'halal': False, 'soy-free': False, 'vegetarian': False}",garage_True,street_True,validated_True,lot_True,valet_True,romantic_True,intimate_True,classy_True,hipster_True,divey_True,touristy_True,trendy_True,upscale_True,casual_True,dessert_True,latenight_True,lunch_True,dinner_True,brunch_True,breakfast_True
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
N3_Gs3DnX4k9SgpwJxdEfw,5.0,30,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [181]:
user_ids = set(user_ids)
biz_ids = set(biz_ids)

feats = []

for key,value in tqdm(id_lookup.items()):
  if value in user_ids:
    u_feats = users_subset.loc[value]
    u_feats_torch = torch.tensor(u_feats)
    u_feats_torch = F.pad(input=u_feats_torch, 
                           pad=(0,156), 
                           mode='constant', 
                           value=0)
    feats.append(u_feats_torch)
  else:
    b_feats = business_subset.loc[value]
    b_feats_torch = torch.tensor(b_feats)
    feats.append(b_feats_torch)

#convert to pytorch tensor
torch_feats = torch.stack(feats)

100%|██████████| 83283/83283 [00:33<00:00, 2488.56it/s]


In [182]:
#add node features to graph
g.ndata['feat'] = torch_feats
g

Graph(num_nodes=83283, num_edges=647007,
      ndata_schemes={'feat': Scheme(shape=(163,), dtype=torch.float64)}
      edata_schemes={})

In [183]:
# Split edge set for training and testing
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size

#create subgraph with training data
train_g = dgl.remove_edges(g, eids[:test_size])
test_g = dgl.remove_edges(g, eids[test_size:])

In [184]:
#set up dataloader samplers
negative_sampler = dgl.dataloading.negative_sampler.Uniform(5)
sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])

In [185]:
# Build train and train dataloaders
train_dataloader = dgl.dataloading.EdgeDataLoader(
    # The following arguments are specific to NodeDataLoader.
    train_g,                                  # The training graph
    torch.arange(train_g.number_of_edges()),  # The edges to iterate over
    sampler,                                # The neighbor sampler
    negative_sampler=negative_sampler,      # The negative sampler
    device=device,                          # Put the MFGs on CPU or GPU
    # The following arguments are inherited from PyTorch DataLoader.
    batch_size=1024,    # Batch size
    shuffle=True,       # Whether to shuffle the nodes for every epoch
    drop_last=False,    # Whether to drop the last incomplete batch
    num_workers=0       # Number of sampler processes
)

In [186]:
# Build train and test dataloaders
test_dataloader = dgl.dataloading.EdgeDataLoader(
    # The following arguments are specific to NodeDataLoader.
    test_g,                                  # The training graph
    torch.arange(test_g.number_of_edges()),  # The edges to iterate over
    sampler,                                # The neighbor sampler
    negative_sampler=negative_sampler,      # The negative sampler
    device=device,                          # Put the MFGs on CPU or GPU
    # The following arguments are inherited from PyTorch DataLoader.
    batch_size=1024,    # Batch size
    shuffle=True,       # Whether to shuffle the nodes for every epoch
    drop_last=False,    # Whether to drop the last incomplete batch
    num_workers=0       # Number of sampler processes
)

In [187]:
input_nodes, pos_graph, neg_graph, mfgs = next(iter(train_dataloader))
print('Number of input nodes:', len(input_nodes))
print('Positive graph # nodes:', pos_graph.number_of_nodes(), '# edges:', pos_graph.number_of_edges())
print('Negative graph # nodes:', neg_graph.number_of_nodes(), '# edges:', neg_graph.number_of_edges())
print(mfgs)

Number of input nodes: 15535
Positive graph # nodes: 6748 # edges: 1024
Negative graph # nodes: 6748 # edges: 5120
[Block(num_src_nodes=15535, num_dst_nodes=12472, num_edges=7695), Block(num_src_nodes=12472, num_dst_nodes=6748, num_edges=7695)]


In [188]:
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import SAGEConv

class Model(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(Model, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type='mean')
        self.conv2 = SAGEConv(h_feats, h_feats, aggregator_type='mean')
        self.h_feats = h_feats

    def forward(self, mfgs, x):
        h_dst = x[:mfgs[0].num_dst_nodes()]
        h = self.conv1(mfgs[0], (x, h_dst))
        h = F.relu(h)
        h_dst = h[:mfgs[1].num_dst_nodes()]
        h = self.conv2(mfgs[1], (h, h_dst))
        return h

In [189]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [190]:
#optional score predictor to try out
class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        """
        Computes a scalar score for each edge of the given graph.

        Parameters
        ----------
        edges :
            Has three members ``src``, ``dst`` and ``data``, each of
            which is a dictionary representing the features of the
            source nodes, the destination nodes, and the edges
            themselves.

        Returns
        -------
        dict
            A dictionary of new edge features.
        """
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

In [191]:
#set up functions for loss and eval
def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

def compute_accuracy(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    scores = [1 if score > .5 else 0 for score in scores]
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return accuracy_score(labels, scores)

In [192]:
model = Model(train_g.ndata['feat'].shape[1], 163).to(device) #163 features, in and out layers the same shape

# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(163)
pred = DotPredictor()

In [193]:
import tqdm

# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

# ----------- 4. training -------------------------------- #
for epoch in range(1):
    model.train()
    with tqdm.tqdm(train_dataloader) as tq:
        for step, (input_nodes, train_pos_graph, train_neg_graph, mfgs) in enumerate(tq):
            
            # forward pass
            inputs = mfgs[0].srcdata['feat']
            h = model(mfgs, inputs.float())
            pos_score = pred(train_pos_graph, h)
            neg_score = pred(train_neg_graph, h)
            loss = compute_loss(pos_score, neg_score)

            #backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            tq.set_postfix({'loss': '%.03f' % loss.item()}, refresh=False)

100%|██████████| 569/569 [01:55<00:00,  4.92it/s, loss=0.621]


In [194]:
from sklearn.metrics import roc_auc_score, accuracy_score

auc_scores = []
accuracy_scores = []

#select which scoreing fuction you want to use
# pred = MLPPredictor(163)
pred = DotPredictor()

pos_scores = []
neg_scores = []

#set model into evaluation mode
model.eval()
with torch.no_grad():
  for input_nodes, test_pos_g, test_neg_g, mfgs in test_dataloader:
    # feature copy from CPU to GPU takes place here
    inputs = mfgs[0].srcdata['feat']
    h = model(mfgs, inputs.float())

    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)

    pos_scores.append(pos_score)
    neg_scores.append(neg_score)

    block_auc = compute_auc(pos_score, neg_score)
    auc_scores.append(block_auc)

    block_accuracy = compute_accuracy(pos_score, neg_score)
    accuracy_scores.append(block_accuracy)

# Print scores for test set
print('Accuracy: {:.04f}'.format(np.mean(accuracy_scores)))
print('AURoC: {:.04f}'.format(np.mean(auc_scores)))

Accuracy: 0.9105
AURoC: 0.7260


In [195]:
auc_scores

[0.7056476593017579,
 0.7103742599487305,
 0.7422914505004883,
 0.7116771697998048,
 0.7395185470581055,
 0.7354099273681641,
 0.716645622253418,
 0.7007894515991211,
 0.7271276473999022,
 0.7260248184204101,
 0.724253273010254,
 0.7188493728637695,
 0.7214777946472168,
 0.734033203125,
 0.728388786315918,
 0.7397422790527344,
 0.7119598388671875,
 0.7368038177490235,
 0.6853460311889648,
 0.7226156234741211,
 0.7177248001098633,
 0.7076971054077148,
 0.7235158920288086,
 0.7264865875244141,
 0.7281171798706054,
 0.71782865524292,
 0.7339494705200196,
 0.7260492324829102,
 0.7274847030639648,
 0.71634521484375,
 0.7285631179809571,
 0.733920669555664,
 0.7366113662719727,
 0.7015331268310547,
 0.7279899597167969,
 0.7188461303710938,
 0.7340316772460938,
 0.7216946601867676,
 0.7317811965942383,
 0.7330778121948243,
 0.7100517272949218,
 0.7105819702148437,
 0.7236518859863281,
 0.723274040222168,
 0.7091470718383789,
 0.7006849288940429,
 0.7292215347290039,
 0.7115530014038086,
 0.74