### Bishoy Sokkar
### Project: Network Analysis of the Facebook Page-Page (MUSAE) Dataset

#### Dataset Selection and Description




In [14]:
import networkx as nx
import numpy as np
from scipy.stats import f_oneway
import pandas as pd
import matplotlib.pyplot as plt
import os

# Locate the two input files: the edge list and the per-node target table.
# data_dir is the author's local checkout; change it to wherever the files
# live on your machine.
data_dir = 'C:\\Users\\PC\\Documents\\DS MS\\DATA 620\\Data 620 Github\\DATA620'
edges_path = os.path.join(data_dir, 'musae_facebook_edges')
target_path = os.path.join(data_dir, 'musae_facebook_target')

# The files are commonly saved with a .csv extension. When the bare name is
# absent but "<name>.csv" exists, use that instead of failing.
if not os.path.exists(edges_path) and os.path.exists(edges_path + '.csv'):
    edges_path += '.csv'
if not os.path.exists(target_path) and os.path.exists(target_path + '.csv'):
    target_path += '.csv'

# Debug: Print working directory and file existence
print(f"Working directory: {os.getcwd()}")
print(f"Checking {edges_path}: {os.path.exists(edges_path)}")
print(f"Checking {target_path}: {os.path.exists(target_path)}")

# Stop early with a message that names exactly which file(s) are missing and
# the directory that was actually searched.
missing_paths = [p for p in (edges_path, target_path) if not os.path.exists(p)]
if missing_paths:
    raise FileNotFoundError(
        f"Files not found: {', '.join(missing_paths)}. "
        "Please ensure musae_facebook_edges and musae_facebook_target "
        f"(optionally with a .csv extension) are in {data_dir}, "
        "or update data_dir above."
    )

def _read_table(path, **kwargs):
    """Read a delimited text file, sniffing the delimiter first.

    Falls back to whitespace-delimited parsing when the sniffing parser
    raises ``pd.errors.ParserError``. Extra keyword arguments are passed
    through to ``pd.read_csv``.
    """
    try:
        return pd.read_csv(path, sep=None, engine='python', **kwargs)
    except pd.errors.ParserError:
        return pd.read_csv(path, sep=r'\s+', engine='python', **kwargs)


# Load the edge list. It is read without a header so that header-less files
# work; if the first row turns out to be column names (non-numeric), it is
# dropped. Otherwise that row would become a bogus edge and every node id
# would be a string that never matches the integer ids in the target file.
edges = _read_table(edges_path, header=None, names=['source', 'target'])
if len(edges) and pd.to_numeric(edges.iloc[0], errors='coerce').isna().any():
    edges = edges.iloc[1:]
edges = edges.apply(pd.to_numeric).reset_index(drop=True)

G = nx.from_pandas_edgelist(edges, 'source', 'target')

# Load labels and set as node attributes.
# The rest of the notebook expects integer category codes 0-3 in the order
# Politicians, Governmental, TV Shows, Companies.
# NOTE(review): the SNAP release of the Facebook Page-Page data is believed to
# store the category as text in a `page_type` column rather than a numeric
# `ml_target` column -- confirm against your copy. Both layouts are accepted.
PAGE_TYPE_CODES = {'politician': 0, 'government': 1, 'tvshow': 2, 'company': 3}

targets = _read_table(target_path)
if 'ml_target' in targets.columns:
    labels = targets.set_index('id')['ml_target']
else:
    page_types = targets.set_index('id')['page_type'].astype(str)
    labels = page_types.str.strip().str.lower().map(PAGE_TYPE_CODES)
    # Rows with an unrecognised page type stay unlabelled instead of
    # receiving a NaN label.
    labels = labels.dropna().astype(int)
label_dict = labels.to_dict()
nx.set_node_attributes(G, label_dict, 'label')

# Calculate centrality measures (both return a dict keyed by node).
degree = nx.degree_centrality(G)
eigen = nx.eigenvector_centrality(G, max_iter=500)

# Category code -> display name. Keeping both in one mapping guarantees the
# table rows below line up with the groups they describe.
category_names = {0: 'Politicians', 1: 'Governmental', 2: 'TV Shows', 3: 'Companies'}

# Group nodes by category code. Nodes without a label are skipped; labels
# outside the known codes are counted and reported rather than raising
# KeyError part-way through the loop.
groups = {code: [] for code in category_names}
unrecognised_labels = 0
for node, label in G.nodes(data='label'):
    if label is None:
        continue
    if label in groups:
        groups[label].append(node)
    else:
        unrecognised_labels += 1
if unrecognised_labels:
    print(f"Note: skipped {unrecognised_labels} nodes with unrecognised labels")

# Extract centrality values for each group (same order as category_names).
deg_groups = [[degree[n] for n in members] for members in groups.values()]
eig_groups = [[eigen[n] for n in members] for members in groups.values()]


def _mean_or_nan(values):
    """Return the mean of *values*, or NaN for an empty group (no warning)."""
    return np.mean(values) if values else float('nan')


def _anova(samples):
    """Run one-way ANOVA over the non-empty samples.

    Returns ``(nan, nan)`` when fewer than two non-empty groups are present,
    because the test is undefined in that case.
    """
    usable = [s for s in samples if s]
    if len(usable) < 2:
        return float('nan'), float('nan')
    return f_oneway(*usable)


# Compute means
results = {
    'Category': list(category_names.values()),
    'Nodes': [len(members) for members in groups.values()],
    'Mean Degree Centrality': [_mean_or_nan(d) for d in deg_groups],
    'Mean Eigenvector Centrality': [_mean_or_nan(e) for e in eig_groups],
}
df = pd.DataFrame(results)
print("Centrality Comparison:")
print(df)

# ANOVA tests. p-values use general format so that very small values are
# shown in scientific notation instead of being rounded to 0.000.
f_deg, p_deg = _anova(deg_groups)
f_eig, p_eig = _anova(eig_groups)
print(f"\nDegree Centrality ANOVA: F = {f_deg:.3f}, p = {p_deg:.3g}")
print(f"Eigenvector Centrality ANOVA: F = {f_eig:.3f}, p = {p_eig:.3g}")

# Visualize a subsample (full graph too large)
from matplotlib.lines import Line2D  # proxy artists for the legend

sub_G = G.subgraph(list(G.nodes)[:500])
# Fixed seed so the figure is reproducible between runs.
pos = nx.spring_layout(sub_G, seed=42)

# Category code -> (display name, colour). Nodes with no label, or a label
# outside these codes, get a neutral colour instead of being drawn as
# 'Companies'.
category_styles = {
    0: ('Politicians', 'red'),
    1: ('Governmental', 'blue'),
    2: ('TV Shows', 'green'),
    3: ('Companies', 'yellow'),
}
unlabeled_color = 'lightgray'
colors = [
    category_styles[label][1] if label in category_styles else unlabeled_color
    for _, label in sub_G.nodes(data='label')
]

plt.figure(figsize=(6, 4))
nx.draw(sub_G, pos, node_color=colors, node_size=20, edge_color='gray')
plt.title("Subsample of Facebook Page-Page Network (colored by category)")

# nx.draw puts all nodes in a single collection, so passing bare label strings
# to plt.legend cannot associate names with colours. Build one proxy marker
# per category instead.
legend_handles = [
    Line2D([], [], marker='o', linestyle='', color=color, label=name)
    for name, color in category_styles.values()
]
if unlabeled_color in colors:
    legend_handles.append(
        Line2D([], [], marker='o', linestyle='', color=unlabeled_color, label='Unlabeled')
    )
plt.legend(handles=legend_handles)
plt.show()


Working directory: C:\Users\PC\Documents\DS MS\DATA 620\Data 620 Github\DATA620
Checking C:\Users\PC\Documents\DS MS\DATA 620\Data 620 Github\DATA620\musae_facebook_edges: False
Checking C:\Users\PC\Documents\DS MS\DATA 620\Data 620 Github\DATA620\musae_facebook_target: False


FileNotFoundError: Files not found at C:\Users\PC\Documents\DS MS\DATA 620\Data 620 Github\DATA620\musae_facebook_edges or C:\Users\PC\Documents\DS MS\DATA 620\Data 620 Github\DATA620\musae_facebook_target. Please ensure musae_facebook_edges and musae_facebook_target are at C:\Users\PC\Documents\DS MS\DATA 620\Data 620 Github\, or update paths above.