# Facebook SNAP

In [1]:
import tarfile

# Path to your downloaded file
file_path = 'data/facebook.tar.gz'  # adjust if needed

# Extract the tar.gz file
with tarfile.open(file_path, 'r:gz') as tar:
    tar.extractall('facebook_data')  # extracts to a folder called facebook_data

In [7]:
import os

# List the extracted files
extracted_files = os.listdir('facebook_data')
print("Extracted files:", extracted_files)

Extracted files: ['facebook', 'facebook_combined.txt']


In [5]:
import pandas as pd

# Load edge list (adjust filename if different)
edges = pd.read_csv('facebook_data/facebook_combined.txt', 
                    sep=' ', 
                    header=None, 
                    names=['node1', 'node2'])

# Display first few rows
edges.head()

Unnamed: 0,node1,node2
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5


### Reading Metadata

In [9]:
import pandas as pd

# Example: Load edges for ego network '0'
edges = pd.read_csv('facebook_data/facebook/0.edges', sep=' ', header=None, names=['node1', 'node2'])
print("Edges (first 5):")
print(edges.head())

Edges (first 5):
   node1  node2
0    236    186
1    122    285
2     24    346
3    271    304
4    176      9


In [10]:
# Load node features (rows = nodes, columns = features)
feat = pd.read_csv('facebook_data/facebook/0.feat', sep=' ', header=None)
feat.columns = ['node_id'] + [f'feat_{i}' for i in range(1, len(feat.columns))]
print("\nNode features (first 5):")
print(feat.head())


Node features (first 5):
   node_id  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  \
0        1       0       0       0       0       0       0       0       0   
1        2       0       0       0       0       0       0       0       0   
2        3       0       0       0       0       0       0       0       1   
3        4       0       0       0       0       0       0       0       0   
4        5       0       0       0       0       0       0       0       0   

   feat_9  ...  feat_215  feat_216  feat_217  feat_218  feat_219  feat_220  \
0       0  ...         0         0         0         0         0         0   
1       0  ...         0         0         0         0         0         0   
2       0  ...         0         0         0         1         0         0   
3       0  ...         0         0         0         0         0         0   
4       0  ...         0         0         0         0         0         0   

   feat_221  feat_222  feat_223  fea

In [11]:
# Load feature names (metadata for features)
with open('facebook_data/facebook/0.featnames', 'r') as f:
    featnames = [line.strip().split(' ', 1) for line in f]
    featnames = pd.DataFrame(featnames, columns=['feature_id', 'feature_description'])
print("\nFeature descriptions:")
print(featnames.head())


Feature descriptions:
  feature_id            feature_description
0          0  birthday;anonymized feature 0
1          1  birthday;anonymized feature 1
2          2  birthday;anonymized feature 2
3          3  birthday;anonymized feature 3
4          4  birthday;anonymized feature 4


In [14]:
import pandas as pd

# Load feature names and split into ID + description
with open('facebook_data/facebook/0.featnames', 'r') as f:
    featnames = [line.strip().split(' ', 1) for line in f]
    featnames_df = pd.DataFrame(featnames, columns=['feature_id', 'feature_description'])

# Extract unique descriptions (drop duplicates)
unique_descriptions = featnames_df['feature_description'].drop_duplicates()

print(f"Total features: {len(featnames_df)}")
print(f"Unique descriptions: {len(unique_descriptions)}")
print("\nSample unique descriptions:")
print(unique_descriptions.head(10))  # Show first 10 unique descriptions

Total features: 224
Unique descriptions: 224

Sample unique descriptions:
0                birthday;anonymized feature 0
1                birthday;anonymized feature 1
2                birthday;anonymized feature 2
3                birthday;anonymized feature 3
4                birthday;anonymized feature 4
5                birthday;anonymized feature 5
6                birthday;anonymized feature 6
7                birthday;anonymized feature 7
8    education;classes;id;anonymized feature 8
9    education;classes;id;anonymized feature 9
Name: feature_description, dtype: object


In [12]:
# Load ego node features (same structure as .feat but only for the ego)
egofeat = pd.read_csv('facebook_data/facebook/0.egofeat', sep=' ', header=None)
egofeat.columns = [f'feat_{i}' for i in range(len(egofeat.columns))]
print("\nEgo node features:")
print(egofeat)


Ego node features:
   feat_0  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  \
0       0       0       0       0       0       0       0       0       0   

   feat_9  ...  feat_214  feat_215  feat_216  feat_217  feat_218  feat_219  \
0       1  ...         0         0         0         0         0         1   

   feat_220  feat_221  feat_222  feat_223  
0         0         0         0         0  

[1 rows x 224 columns]


In [13]:
# Load circles (each line is a circle with its members)
with open('facebook_data/facebook/0.circles', 'r') as f:
    circles = [line.strip().split() for line in f]
    circles = {circle[0]: circle[1:] for circle in circles}  # {circle_name: [nodes]}
print("\nCircles:")
print(circles)


Circles:
{'circle0': ['71', '215', '54', '61', '298', '229', '81', '253', '193', '97', '264', '29', '132', '110', '163', '259', '183', '334', '245', '222'], 'circle1': ['173'], 'circle2': ['155', '99', '327', '140', '116', '147', '144', '150', '270'], 'circle3': ['51', '83', '237'], 'circle4': ['125', '344', '295', '257', '55', '122', '223', '59', '268', '280', '84', '156', '258', '236', '250', '239', '69'], 'circle5': ['23'], 'circle6': ['337', '289', '93', '17', '111', '52', '137', '343', '192', '35', '326', '310', '214', '32', '115', '321', '209', '312', '41', '20'], 'circle7': ['225', '46'], 'circle8': ['282'], 'circle9': ['336', '204', '74', '206', '292', '146', '154', '164', '279', '73'], 'circle10': ['42', '14', '216', '2'], 'circle11': ['324', '265', '54', '161', '298', '76', '165', '199', '203', '13', '66', '113', '97', '252', '313', '238', '158', '240', '331', '332', '134', '218', '118', '235', '311', '151', '308', '212', '70', '211'], 'circle12': ['278'], 'circle13': ['138'

#  LDBC Social Network Benchmark - SNB Dataset

In [16]:
!pip install networkx gqlalchemy faker
!pip freeze > requirements.txt

Collecting networkx
  Using cached networkx-3.4.2-py3-none-any.whl (1.7 MB)
Collecting gqlalchemy
  Downloading gqlalchemy-1.7.0-py3-none-any.whl (94 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.7/94.7 kB[0m [31m762.2 kB/s[0m eta [36m0:00:00[0m1m835.6 kB/s[0m eta [36m0:00:01[0m
[?25hCollecting faker
  Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting adlfs<2025.0.0,>=2023.9.0
  Downloading adlfs-2024.12.0-py3-none-any.whl (41 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dacite<2.0.0,>=1.6.0
  Downloading dacite-1.9.2-py3-none-any.whl (16 kB)
Collecting neo4j<6.0.0,>=4.4.3
  Using cached neo4j-5.28.1-py3-none-any.whl (312 kB)
Collecting numpy<2.0.0,>

In [1]:
!pip install -r requirements.txt



In [2]:
import random
import uuid
import faker
import networkx as nx
from gqlalchemy import Memgraph

In [3]:
# Initialize Faker
fake = faker.Faker()

# Create NetworkX Graph
G = nx.Graph()

# Utility function to create a fake user
def create_user(platform):
    return {
        "user_id": str(uuid.uuid4()),
        "platform": platform,
        "name": fake.name(),
        "email": fake.email(),
        "phone": fake.phone_number(),
        "age": random.randint(18, 65),
        "gender": random.choice(["Male", "Female", "Other"]),
        "nationality": fake.country(),
        "friendList": [],
        "connectionList": [],
        "emailList": [],
        "twitterInteraction": {
            "post_keywords": random.sample(
                ["floods", "company event", "conference", "catering", "parties"],
                k=random.randint(1, 5)
            ),
            "post_sentiment": [random.choice([0, 1]) for _ in range(4)],
            "time_spent_per_post": [f"{random.randint(400, 700)}ms" for _ in range(4)],
        }
    }

In [4]:
print("Generating Users")
# Generate Users
users = []

for platform in ["Facebook", "Twitter", "LinkedIn"]:
    for _ in range(100):
        users.append(create_user(platform))

# Map user_id to user for quick access
user_map = {user["user_id"]: user for user in users}

# Now randomly create relationships
user_ids = list(user_map.keys())
print("User generation complete\n")


Generating Users
User generation complete



In [5]:
print("Create the connection lists")
for user in users:
    # Random friends for Facebook
    if user["platform"] == "Facebook":
        friends = random.sample(user_ids, k=random.randint(5, 20))
        user["friendList"] = friends
    
    # Random connections for LinkedIn
    if user["platform"] == "LinkedIn":
        connections = random.sample(user_ids, k=random.randint(5, 20))
        user["connectionList"] = connections
    
    # Random email communications
    emails = random.sample(user_ids, k=random.randint(5, 20))
    user["emailList"] = emails
print("Connection list creation complete\n")

Create the connection lists
Connection list creation complete



In [6]:
print("Add users to graph")
# Add nodes and edges to NetworkX Graph
for user in users:
    G.add_node(user["user_id"], **user)

for user in users:
    for friend_id in user["friendList"]:
        if G.has_node(friend_id):
            G.add_edge(user["user_id"], friend_id, relationship="FRIEND")

    for connection_id in user["connectionList"]:
        if G.has_node(connection_id):
            G.add_edge(user["user_id"], connection_id, relationship="LINKEDIN_CONNECTION")

    for email_id in user["emailList"]:
        if G.has_node(email_id):
            G.add_edge(user["user_id"], email_id, relationship="EMAIL_CONTACT")
print("Addition of users to graph complete\n")

Add users to graph
Addition of users to graph complete



In [7]:
print("Dump data into memgraph")
# Connect to Memgraph and push the data
memgraph = Memgraph()

# Optional: Clear database
memgraph.drop_database()

# Create Nodes
for node_id, data in G.nodes(data=True):
    query = f"""
    CREATE (:User {{
        user_id: "{data['user_id']}",
        platform: "{data['platform']}",
        name: "{data['name']}",
        email: "{data['email']}",
        phone: "{data['phone']}",
        age: {data['age']},
        gender: "{data['gender']}",
        nationality: "{data['nationality']}",
        twitter_post_keywords: {data['twitterInteraction']['post_keywords']},
        twitter_post_sentiment: {data['twitterInteraction']['post_sentiment']},
        twitter_time_spent_per_post: {data['twitterInteraction']['time_spent_per_post']}
    }})
    """
    memgraph.execute(query)

# Create Edges
for source, target, data in G.edges(data=True):
    relationship = data["relationship"]
    query = f"""
    MATCH (a:User {{user_id: "{source}"}})
    MATCH (b:User {{user_id: "{target}"}})
    CREATE (a)-[:{relationship}]->(b)
    """
    memgraph.execute(query)

print("Data pushed to Memgraph successfully!")


Dump data into memgraph
Data pushed to Memgraph successfully!
