In [2]:
import pandas as pd
import networkx as nx

# Read the data
data = pd.read_csv('soc-redditHyperlinks-body.tsv', delimiter='\t')

# Create a directed multigraph using NetworkX
G = nx.MultiDiGraph()
community_to_id = {}

# Add edges to the graph
for _, row in data.iterrows():
    source_community = row['SOURCE_SUBREDDIT']
    target_community = row['TARGET_SUBREDDIT']
    sentiment = int(row['LINK_SENTIMENT'])  # Assuming POST_LABEL contains -1 or 1
    
    # Add nodes if they don't exist
    if source_community not in community_to_id:
        source_community_id = len(community_to_id)  # Assign a unique numeric ID
        community_to_id[source_community] = source_community_id
        G.add_node(source_community_id, name=source_community)

    if target_community not in community_to_id:
        target_community_id = len(community_to_id)  # Assign a unique numeric ID
        community_to_id[target_community] = target_community_id
        G.add_node(target_community_id, name=target_community)
    
    # Add edge and set sentiment attribute
    G.add_edge(source_community_id, target_community_id, sentiment=sentiment)

# Print the total number of nodes and edges
print("Total Nodes:", G.number_of_nodes())
print("Total Edges:", G.number_of_edges())


Total Nodes: 35776
Total Edges: 286561


In [13]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForetClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline


# Extract positive and negative edges
positive_edges = [(source, target) for source, target, sentiment in G.edges(data='sentiment') if sentiment == 1]
negative_edges = [(source, target) for source, target, sentiment in G.edges(data='sentiment') if sentiment == -1]

# Split the data into training and testing sets
train_pos_edges, test_pos_edges = train_test_split(positive_edges, test_size=0.2, random_state=42)
train_neg_edges, test_neg_edges = train_test_split(negative_edges, test_size=0.2, random_state=42)

# Create a simple graph for Jaccard coefficient calculation
G_simple = nx.Graph(G)

# Calculate Jaccard coefficients for training set
jaccard_train_pos = [list(nx.jaccard_coefficient(G_simple, [(u, v)]))[0] for u, v in train_pos_edges]
jaccard_train_neg = [list(nx.jaccard_coefficient(G_simple, [(u, v)]))[0] for u, v in train_neg_edges]

# Create features and labels for training
X_train = jaccard_train_pos + jaccard_train_neg
y_train = [1] * len(jaccard_train_pos) + [0] * len(jaccard_train_neg)

# Calculate class weights
class_weights = {0: 4, 1: 1}

# Create and train the classifier using a pipeline with class weights
classifier = make_pipeline(
    StandardScaler(),
    RandomOverSampler(random_state=42),
    RandomForestClassifier(class_weight=dict(zip(np.unique(y_train), class_weights)), random_state=42)
)

classifier.fit(X_train, y_train)

# Calculate Jaccard coefficients for testing set
jaccard_test_pos = [list(nx.jaccard_coefficient(G_simple, [(u, v)]))[0] for u, v in test_pos_edges]
jaccard_test_neg = [list(nx.jaccard_coefficient(G_simple, [(u, v)]))[0] for u, v in test_neg_edges]

# Create features and labels for testing
X_test = jaccard_test_pos + jaccard_test_neg
y_test = [1] * len(jaccard_test_pos) + [0] * len(jaccard_test_neg)

# Predict and evaluate the model
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9264739238916128
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      4214
           1       0.93      1.00      0.96     53099

    accuracy                           0.93     57313
   macro avg       0.46      0.50      0.48     57313
weighted avg       0.86      0.93      0.89     57313



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
pip install imbalanced-learn


In [10]:
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.pipeline import make_pipeline

# Extract positive and negative edges
positive_edges = [(source, target) for source, target, sentiment in G.edges(data='sentiment') if sentiment == 1]
negative_edges = [(source, target) for source, target, sentiment in G.edges(data='sentiment') if sentiment == -1]

# Split the data into training and testing sets
train_pos_edges, test_pos_edges = train_test_split(positive_edges, test_size=0.2, random_state=42)
train_neg_edges, test_neg_edges = train_test_split(negative_edges, test_size=0.2, random_state=42)

# Create a simple graph for Adamic/Adar similarity calculation
G_simple = nx.Graph(G)

# Calculate Adamic/Adar coefficients for training set
adamic_train_pos = [list(nx.adamic_adar_index(G_simple, [(u, v)]))[0][2] for u, v in train_pos_edges]
adamic_train_neg = [list(nx.adamic_adar_index(G_simple, [(u, v)]))[0][2] for u, v in train_neg_edges]

# Create features and labels for training
X_train = adamic_train_pos + adamic_train_neg
y_train = [1] * len(adamic_train_pos) + [0] * len(adamic_train_neg)

# Reshape X_train to make it 2D
X_train = np.array(X_train).reshape(-1, 1)

# Calculate class weights
class_weights = {0: 6, 1: 1}  # Assign higher weight to the minority class (class zero)

# Create and train the classifier using a pipeline with class weights
classifier = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(class_weight=class_weights, random_state=42)
)

classifier.fit(X_train, y_train)

# Calculate Adamic/Adar coefficients for testing set
adamic_test_pos = [list(nx.adamic_adar_index(G_simple, [(u, v)]))[0][2] for u, v in test_pos_edges]
adamic_test_neg = [list(nx.adamic_adar_index(G_simple, [(u, v)]))[0][2] for u, v in test_neg_edges]

# Create features and labels for testing
X_test = adamic_test_pos + adamic_test_neg
y_test = [1] * len(adamic_test_pos) + [0] * len(adamic_test_neg)

# Reshape X_test to make it 2D
X_test = np.array(X_test).reshape(-1, 1)

# Predict and evaluate the model
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9264739238916128
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      4214
           1       0.93      1.00      0.96     53099

    accuracy                           0.93     57313
   macro avg       0.46      0.50      0.48     57313
weighted avg       0.86      0.93      0.89     57313



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
