# Download link for ML-10M dataset (User-Tag): 

http://networkrepository.com/ia-movielens-user2tags-10m.php

In [1]:

# coding: utf-8


import dill
from collections import defaultdict
from datetime import datetime, timedelta


node_idx = {}
idx_node = []


users = []
movies = []
with open('movielens-10m_rating') as f:
    
    lines = f.read().splitlines()
    for l in lines:
        if l[0] == '%':
            continue
            
        u_id, m_id, r, t = map(float, l.split(" "))
        
        users.append(int(u_id))
        movies.append(int(m_id))
        
user = list(set(users))


node_cnt = 0

for u in users:
    if "u_"+str(u) not in node_idx:
        node_idx["u_"+str(u)] = node_cnt
        node_cnt += 1

for t in movies: 
    if "m_"+str(t) not in node_idx:
        node_idx["m_"+str(t)] = node_cnt
        node_cnt += 1
print ("# nodes", node_cnt)


# nodes 80555


In [2]:
links = []
ts = []
ctr = 0
with open('movielens-10m_rating') as f:
    
    lines = f.read().splitlines()
    for l in lines:
        if l[0] == '%':
            continue
            
        x, y, r, t = map(float, l.split(" "))
        
        assert (r > 0)
        
        timestamp = datetime.fromtimestamp(int(t))
        ts.append(timestamp)
        
        ctr += 1
        if ctr % 1000000 == 0:
            print (ctr)
        links.append((node_idx["u_"+str(int(x))],node_idx["m_"+str(int(y))], r, timestamp))
            

print ("Min ts", min(ts), "max ts", max(ts))
print ("Total time span: {} days".format((max(ts) - min(ts)).days))
links.sort(key =lambda x: x[3])
print ("# temporal links", len(links))

import networkx as nx
agg_G = nx.Graph()
for a,b,r,t in links:
    agg_G.add_edge(a,b, weight=r)

print ("Agg graph {}:{}".format(agg_G.number_of_nodes(), agg_G.number_of_edges()))



1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
Min ts 1995-01-09 12:46:49 max ts 2009-01-05 06:02:16
Total time span: 5109 days
# temporal links 10000054
Agg graph 80555:10000054


In [3]:
import networkx as nx
import numpy as np
from datetime import datetime, timedelta
SLICE_DAYS = 30*12
START_DATE = min(ts) + timedelta(days=SLICE_DAYS)
END_DATE = max(ts) - timedelta(20)

slices_links = defaultdict(lambda : nx.MultiGraph())
slices_features = defaultdict(lambda : {})

print ("Start date", START_DATE)
print ("End date", END_DATE)


slice_id = 0
# Split the set of links in order by slices to create the graphs. 

for (a, b, r, time) in links:
    prev_slice_id = slice_id
    datetime_object = time

    days_diff = (datetime_object - START_DATE).days
        
    slice_id = days_diff // SLICE_DAYS
    if slice_id >= 0:
        if slice_id == 1+prev_slice_id and slice_id > 0:
            slices_links[slice_id] = nx.MultiGraph()
            slices_links[slice_id].add_nodes_from(slices_links[slice_id-1].nodes(data=True))
            assert (len(slices_links[slice_id].edges()) ==0)
            #assert len(slices_links[slice_id].nodes()) >0

        if slice_id == 1+prev_slice_id and slice_id ==0:
            slices_links[slice_id] = nx.MultiGraph()

        if a not in slices_links[slice_id]:
            slices_links[slice_id].add_node(a)
        if b not in slices_links[slice_id]:
            slices_links[slice_id].add_node(b)    
        slices_links[slice_id].add_edge(a,b, weight= float(r/5))


for slice_id in slices_links:
    print ("# nodes in slice", slice_id, len(slices_links[slice_id].nodes()))
    print ("# edges in slice", slice_id, len(slices_links[slice_id].edges()))
    
    # temp = np.identity(len(slices_links[max(slices_links.keys())].nodes()))
    # print ("Shape of temp matrix", temp.shape)
    # slices_features[slice_id] = {}
    # for idx, node in enumerate(slices_links[slice_id].nodes()):
    #     slices_features[slice_id][node] = temp[idx]



Start date 1996-01-04 12:46:49
End date 2008-12-16 06:02:16
# nodes in slice 0 18102
# edges in slice 0 1041157
# nodes in slice 1 24441
# edges in slice 1 462658
# nodes in slice 2 26826
# edges in slice 2 199023
# nodes in slice 3 32736
# edges in slice 3 650607
# nodes in slice 4 41847
# edges in slice 4 1347282
# nodes in slice 5 47370
# edges in slice 5 780626
# nodes in slice 6 51281
# edges in slice 6 572669
# nodes in slice 7 55656
# edges in slice 7 683930
# nodes in slice 8 60279
# edges in slice 8 746178
# nodes in slice 9 66506
# edges in slice 9 1150766
# nodes in slice 10 70715
# edges in slice 10 759183
# nodes in slice 11 74611
# edges in slice 11 707197
# nodes in slice 12 78690
# edges in slice 12 660098
# nodes in slice 13 80555
# edges in slice 13 238677


In [4]:
from scipy.sparse import csr_matrix

def remap(slices_graph, slices_features):
    all_nodes = []
    for slice_id in slices_graph:
        # assert len(slices_graph[slice_id].nodes()) == len(slices_features[slice_id])
        all_nodes.extend(slices_graph[slice_id].nodes())
    all_nodes = list(set(all_nodes))
    print ("Total # nodes", len(all_nodes), "max idx", max(all_nodes))
    ctr = 0
    node_idx = {}
    idx_node = []
    for slice_id in slices_graph:
        for node in slices_graph[slice_id].nodes():
            if node not in node_idx:
                node_idx[node] = ctr
                idx_node.append(node)
                ctr += 1
    slices_graph_remap = []
    slices_features_remap = []
    for slice_id in slices_graph:
        G = nx.Graph()
        for x in slices_graph[slice_id].nodes():
            G.add_node(node_idx[x])
        for x in slices_graph[slice_id].edges(data=True):
            G.add_edge(node_idx[x[0]], node_idx[x[1]], weight=x[2]['weight'])
        assert (len(G.nodes()) == len(slices_graph[slice_id].nodes()))
        assert (len(G.edges()) == len(slices_graph[slice_id].edges()))
        print(G.number_of_nodes())
        slices_graph_remap.append(G)
    
    # for slice_id in slices_features:
    #     features_remap = []
    #     for x in slices_graph_remap[slice_id].nodes():
    #         features_remap.append(slices_features[slice_id][idx_node[x]])
    #         #features_remap.append(np.array(slices_features[slice_id][idx_node[x]]).flatten())
    #     features_remap = csr_matrix(np.squeeze(np.array(features_remap)))
    #     slices_features_remap.append(features_remap)
    return (slices_graph_remap, slices_features_remap)



In [5]:
slices_links_remap, slices_features_remap = remap(slices_links, slices_features)


Total # nodes 80555 max idx 80554
18102
24441
26826
32736
41847
47370
51281
55656
60279
66506
70715
74611
78690
80555


In [6]:
for i,graph in enumerate(slices_links_remap):
    nx.write_gpickle(graph, f'ml-10m_new/graph_{i}.npz')

In [7]:
import random
def random_walk(graph, start_node, walk_length, rand):
    walk = [start_node]
    while len(walk) < walk_length:
        cur = walk[-1]
        if len(graph[cur]) > 0:
            neighbors = [n for n in graph.neighbors(cur)]
            rand.shuffle(neighbors)
            walk.append(neighbors[0])
        else:
            print("No neighbors")
            break
    return walk


def build_walk_corpus(graph, num_walks, walk_length):
    nodes = list(graph.nodes())
    rand = random.Random(0)
    walks = []
    for cnt in range(num_walks):
        rand.shuffle(nodes)
        for i, node in enumerate(nodes):
            walks.append(random_walk(graph, node, walk_length,rand))
    return walks

In [8]:
import pickle as pkl
for i, graph in enumerate(slices_links_remap):
    print(f"Random walks for graph {i}")
    nx_G = nx.Graph()
    nx_G.add_weighted_edges_from([(edge[0],edge[1],edge[2]['weight']) for edge in graph.edges(data=True)])
    walks = build_walk_corpus(nx_G, num_walks=10, walk_length=40)
    WINDOW_SIZE = 10
    pairs = defaultdict(lambda: [])
    pairs_cnt = 0
    for walk in walks:
        for word_index, word in enumerate(walk):
            for nb_word in walk[max(word_index - WINDOW_SIZE, 0): min(word_index + WINDOW_SIZE, len(walk)) + 1]:
                if nb_word != word:
                    pairs[word].append(nb_word)
                    pairs_cnt += 1
    pkl.dump(dict(pairs), open(f'ml-10m_new/pairs_{i}.pkl', 'wb'))

Random walks for graph 0


PicklingError: Can't pickle <function <lambda> at 0x7ff6910e5280>: attribute lookup <lambda> on __main__ failed

In [None]:
# np.savez('ml-10m_new/graphs.npz', graph=slices_links_remap)
# np.savez('ml-10m_new/features.npz', feats=slices_features_remap)



In [11]:
pkl.dump(dict(pairs), open(f'ml-10m_new/pairs_{i}.pkl', 'wb'))

In [13]:
pkl.load(open(f'ml-10m_new/pairs_{i}.pkl', 'rb'))

1724,
  118,
  2744,
  357,
  3201,
  591,
  11670,
  6707,
  221,
  6883,
  308,
  6903,
  210,
  3437,
  167,
  17551,
  173,
  2,
  5502,
  3795,
  6969,
  691,
  8050,
  368,
  1328,
  179,
  1862,
  11007,
  27,
  11518,
  484,
  12819,
  43,
  6882,
  43,
  13956,
  313,
  6259,
  17167,
  27,
  2568,
  93,
  2899,
  216,
  5687,
  344,
  10870,
  8890,
  305,
  9534,
  202,
  10063,
  2351,
  6526,
  10,
  2828,
  344,
  164,
  10416,
  179,
  11228,
  463,
  3778,
  117,
  15433,
  163,
  52,
  1565,
  446,
  13646,
  9892,
  10147,
  120,
  1883,
  235,
  4790,
  12032,
  10,
  15329,
  3047,
  13636,
  275,
  8562,
  210,
  4615,
  173,
  306,
  3635,
  1783,
  16253,
  152,
  17314,
  14520,
  18026,
  9889,
  14400,
  236,
  15111,
  361,
  344,
  6698,
  163,
  2346,
  316,
  11335,
  144,
  4503,
  27,
  17338,
  12054,
  305,
  15579,
  342,
  10502,
  2,
  3984,
  235,
  655,
  357,
  7,
  5955,
  205,
  1069,
  6259,
  17329,
  10944,
  11645,
  384,
  1882,
  3054,
  