In [2]:
import snap
import re

# ENCRYPT COZ POST ID CAN ONLY BE INT

def convert_to_integer(post_id):
    base = 36  # 10 digits + 26 letters

    # Remove leading and trailing single quotes if present
    post_id = post_id.strip("'")

    # Convert each character to its corresponding integer value
    int_values = [int(c, base) for c in post_id]

    # Combine the integer values to get a unique integer for the whole string
    result = 0
    for value in int_values:
        result = result * base + value

    return result



# Create a bipartite graph
G = snap.TNEANet.New()

# Load post crosslinks info
with open('formatted_data_file.txt', 'r') as file:
    data = file.read()

# Split the data into lines
lines = data.strip().split('\n')

# Mapping between community and user names and their node IDs
community_to_id = {}
user_to_id = {}

# Add nodes and edges to the bipartite graph
for line in lines:
    parts = re.split(r'\t|\s', line)

    # Extract values into variables
    source_community = parts[0]
    target_community = parts[1]
    post_id_source = parts[2]
    timestamp_source = parts[3] + " " + parts[4]  # Combine date and time
    user = parts[5]
    post_id_target = parts[6]
    timestamp_target = parts[7] + " " + parts[8]  # Combine date and time
    
    # Add nodes if they don't exist
    if source_community not in community_to_id:
        source_community_id = G.AddNode()
        community_to_id[source_community] = source_community_id
    else:
        source_community_id = community_to_id[source_community]

    if user not in user_to_id:
        user_id = G.AddNode()
        user_to_id[user] = user_id
    else:
        user_id = user_to_id[user]

    if target_community not in community_to_id:
        target_community_id = G.AddNode()
        community_to_id[target_community] = target_community_id
    else:
        target_community_id = community_to_id[target_community]

    # Add edges
    # G.AddEdge(source_community_id, user_id)
    G.AddEdge(user_id, target_community_id)

# Load label info
with open('label_info.tsv', 'r') as file:
    label_info = file.read()

# Split label info into lines
label_lines = label_info.strip().split('\n')

# Add sentiment attribute to edges
for label_line in label_lines:
    parts = label_line.split()
    if len(parts) != 3:
        print(f"Skipping label line: {label_line}")
        continue

    post_id_from, post_id_to, sentiment = parts[0][1:-1], parts[1][:-1], parts[2]
    # Convert alphanumeric post IDs to integers
    post_id_from_int = convert_to_integer(post_id_from)
    post_id_to_int = convert_to_integer(post_id_to)
     
    # Find edge IDs based on source and target node IDs
    edge_id = G.GetEId(user_id, target_community_id)

    # Map sentiment values to 1 and 0
    sentiment_value = 1 if sentiment == 'burst' else 0 

    # Add sentiment attribute to edges
    G.AddIntAttrDatE(edge_id, sentiment_value, "sentiment")

# Print the total number of nodes and edges
print("Total Nodes:", G.GetNodes())
print("Total Edges:", G.GetEdges())


Total Nodes: 180283
Total Edges: 286561


In [4]:
import snap
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Your existing code for creating the bipartite graph (up to the sentiment attribute)

# Feature extraction
user_out_degree = {}
user_in_degree = {}
user_negative_sentiments = {}

for user_id in user_to_id.values():
    # Out-degree of the user node
    out_degree = G.GetNI(user_id).GetOutDeg()
    user_out_degree[user_id] = out_degree

    # In-degree of the user node
    in_degree = G.GetNI(user_id).GetInDeg()
    user_in_degree[user_id] = in_degree

    # Sentiment of neighboring edges
    neighboring_sentiments = [G.GetIntAttrDatE(eid, "sentiment") for eid in G.GetNI(user_id).GetOutEdges()]
    user_negative_sentiments[user_id] = sum(1 for sentiment in neighboring_sentiments if sentiment == 1)

# Convert features to a numpy array
features = np.array([
    list(user_out_degree.values()),
    list(user_in_degree.values()),
    list(user_negative_sentiments.values())
]).T

# Load label info
with open('label_info.tsv', 'r') as file:
    label_info = file.read()

# Split label info into lines
label_lines = label_info.strip().split('\n')

# Labeling nodes based on sentiment
labels = [G.GetIntAttrDatE(int(parts[0]), "sentiment") for parts in (line.split() for line in label_lines)]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Initialize and train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Plotting the graph
plt.figure(figsize=(10, 6))
plt.scatter(user_out_degree.values(), user_negative_sentiments.values(), alpha=0.5)
plt.title('User Out-Degree vs. Number of Negative Sentiments (Neighboring Edges)')
plt.xlabel('User Out-Degree')
plt.ylabel('Number of Negative Sentiments (Neighboring Edges)')
plt.show()


ValueError: invalid literal for int() with base 10: "('2vjbm2',"