In [None]:
# Import necessary libraries
import pandas as pd
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv("/content/merged_file.csv")  # Replace with the actual path to your dataset

# Display the first few rows of the dataset
print(df.head())

df.fillna(0, inplace=True)

# Step 2: Check the data types of the engagement features
engagement_features = ['favorite_count', 'quote_count', 'reply_count', 'retweet_count']

# Print data types for debugging
print("Data types before scaling:")
print(df[engagement_features].dtypes)

# Ensure engagement features are of numeric type
for feature in engagement_features:
    df[feature] = pd.to_numeric(df[feature], errors='coerce')  # Convert to numeric, forcing non-numeric to NaN

# After conversion, check for null values
print("Checking for null values after conversion:")
print(df[engagement_features].isnull().sum())

# Drop any rows with NaN values in engagement features
df.dropna(subset=engagement_features, inplace=True)

# Scale only the numeric engagement features
scaler = StandardScaler()
df[engagement_features] = scaler.fit_transform(df[engagement_features])

# Step 3: Create a graph using NetworkX
G = nx.Graph()

# Step 4: Add nodes to the graph using tweet_id
for index, row in df.iterrows():
    G.add_node(row['tweet_id'],  # Use tweet_id as the node
               text=row['TWEETS'],
               community_notes=row['COMMUNITY NOTES'],
               engagement={
                   'favorite_count': row['favorite_count'],
                   'quote_count': row['quote_count'],
                   'reply_count': row['reply_count'],
                   'retweet_count': row['retweet_count']
               },
               misleading_features={
                   'misleadingFactualError': row['misleadingFactualError'],
                   'misleadingManipulatedMedia': row['misleadingManipulatedMedia'],
                   'misleadingOutdatedInformation': row['misleadingOutdatedInformation'],
                   'misleadingMissingImportantContext': row['misleadingMissingImportantContext'],
                   'misleadingUnverifiedClaimAsFact': row['misleadingUnverifiedClaimAsFact'],
                   'misleadingSatire': row['misleadingSatire']
               },
               not_misleading_features={
                   'notMisleadingOther': row['notMisleadingOther'],
                   'notMisleadingFactuallyCorrect': row['notMisleadingFactuallyCorrect'],
               },
               classification=row['classification'])  # Classification as the target variable

# Step 5: Add edges based on retweets and replies
# Assuming your dataset includes columns for retweet and reply IDs (modify as necessary)
# for index, row in df.iterrows():
    # Placeholder for retweet and reply logic
    # Uncomment and modify based on your dataset structure
    # if 'retweet_id' in row and row['retweet_id']:
    #     G.add_edge(row['tweet_id'], row['retweet_id'], relation='retweet')

    # if 'reply_to_id' in row and row['reply_to_id']:
    #     G.add_edge(row['tweet_id'], row['reply_to_id'], relation='reply')

# Print the graph summary
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

# Step 6: Prepare the data for training/testing
# Extract features and labels
X = pd.DataFrame.from_records([G.nodes[node] for node in G.nodes()])
y = X['classification']  # Assuming classification is the label for fake/misleading news
X.drop('classification', axis=1, inplace=True)  # Remove the label from features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: You can now use X_train, y_train for training your GNN model
# Note: Implement the GNN model here, based on your requirements

print("Training and testing sets prepared.")


              tweet_id                                             TWEETS  \
0  1783159712986382830  TV/Digital home of Bill O‚ÄôReilly, Jesse Kell...   
1  1783171851818021181  WFLA News Channel 8 is a news leader in Tampa ...   
2  1783154445682979015  TV/Digital home of Bill O‚ÄôReilly, Jesse Kell...   
3  1377030478167937024  I am a United States Senator from the great st...   
4  1536848327979016193  We bring you the power to compare offers and r...   

                                     COMMUNITY NOTES favorite_count favorited  \
0  WHOA JOE!\n"I had a nurse named Pearl... She'd...          11057     FALSE   
1  DRAMATIC VIDEO: Florida cop treated for overdo...            379     FALSE   
2  WATCH - Bill Maher Trashes Woke Ideology\n"Fiv...             52     FALSE   
3  My personal Twitter account ‚Äì @BasedMikeLee ...          13328     FALSE   
4  Don't borrow from the bank.  Borrow from yours...            540     FALSE   

  is_quote_status lang possibly_sensitive possibly

In [None]:
# Step 1: Define the logic for community notes agreement/disagreement
def check_agreement(row):
    # Simple keyword-based logic for agreement/disagreement
    notes = row['COMMUNITY NOTES'].lower() if isinstance(row['COMMUNITY NOTES'], str) else ''
    if "misleading" in notes or "disagree" in notes or "false" in notes:
        return -1  # Disagreement
    elif "agree" in notes or "correct" in notes or "factually correct" in notes:
        return 1  # Agreement
    else:
        return 0  # Neutral/No relevant information

# Step 2: Apply the logic to create a new column in the DataFrame
df['community_agreement'] = df.apply(check_agreement, axis=1)

# Step 3: Ensure that attributes are added to the graph correctly, including the new feature
for index, row in df.iterrows():
    G.add_node(row['tweet_id'],
               favorite_count=row['favorite_count'],
               quote_count=row['quote_count'],
               reply_count=row['reply_count'],
               retweet_count=row['retweet_count'],
               classification=row['classification'],
               community_agreement=row['community_agreement'])  # Add new feature

# Step 4: Create Node Features Matrix including community_agreement
node_features = []
node_labels = []

for node in G.nodes(data=True):
    node_id = node[0]
    features = node[1]  # Node data (attributes)

    # Extract features safely
    favorite_count = features.get('favorite_count', 0)
    quote_count = features.get('quote_count', 0)
    reply_count = features.get('reply_count', 0)
    retweet_count = features.get('retweet_count', 0)
    community_agreement = features.get('community_agreement', 0)  # New feature

    # Append all features to node_features
    node_features.append([
        favorite_count,
        quote_count,
        reply_count,
        retweet_count,
        community_agreement  # Add community agreement/disagreement feature
    ])

    node_labels.append(features.get('classification', 0))  # Target label

# Convert to NumPy arrays
node_features = np.array(node_features)
node_labels = np.array(node_labels)

# Continue with the rest of the code...


In [None]:
import pandas as pd
import numpy as np
from itertools import combinations

# Assuming df is your DataFrame after all necessary preprocessing

# Step 1: Define the logic for community notes agreement/disagreement
def check_agreement(row):
    # Simple keyword-based logic for agreement/disagreement
    notes = row['COMMUNITY NOTES'].lower() if isinstance(row['COMMUNITY NOTES'], str) else ''
    if "misleading" in notes or "disagree" in notes or "false" in notes:
        return -1  # Disagreement
    elif "agree" in notes or "correct" in notes or "factually correct" in notes:
        return 1  # Agreement
    else:
        return 0  # Neutral/No relevant information

# Step 2: Apply the logic to create a new column in the DataFrame
df['community_agreement'] = df.apply(check_agreement, axis=1)

# Step 3: Extract Node Features
# Select relevant features for node representation, including the new community_agreement
engagement_features = ['favorite_count', 'quote_count', 'reply_count', 'retweet_count', 'community_agreement']
# Convert to numeric (handle any potential conversion issues)
df[engagement_features] = df[engagement_features].apply(pd.to_numeric, errors='coerce')

# Fill missing values with 0 for scaling
df[engagement_features] = df[engagement_features].fillna(0)

# Create a node feature matrix
node_features = df[engagement_features].values

# Step 4: Extract Labels
# Assuming the 'classification' column contains the label information
labels = df['classification'].map({
    'MISINFORMED_OR_POTENTIALLY_MISLEADING': 1,
    'NOT_MISLEADING': 0
})
labels = labels.fillna(0).values  # Fill any missing labels with 0

# Step 5: Extract Edges
edges = []
# Group by 'COMMUNITY NOTES' to find tweet IDs in the same note
for community_note in df['COMMUNITY NOTES'].dropna().unique():
    tweet_ids = df[df['COMMUNITY NOTES'] == community_note]['tweet_id'].values
    if len(tweet_ids) > 1:
        # Create edges for all unique pairs of tweet IDs in the same community note
        for i in range(len(tweet_ids)):
            for j in range(i + 1, len(tweet_ids)):
                # Ensure no self-loops
                if tweet_ids[i] != tweet_ids[j]:
                    edges.append((tweet_ids[i], tweet_ids[j]))  # Only add non-self-loops

# Convert edges to a NumPy array
edge_index = np.array(edges).T  # Transpose for edge_index format

# Now, you have updated edge_index
print("Edge index shape after correction:", edge_index.shape)
# Display sample edges to verify
print("Sample edges after correction:", edges[:5])

# Now, you have node_features, labels, and edge_index prepared

# Display the shapes of the created data components
print("Node features shape:", node_features.shape)
print("Labels shape:", labels.shape)
print("Edge index shape:", edge_index.shape)

# To verify edges created, you can print a sample
print("Sample edges:", edges[:5])


Edge index shape after correction: (2, 2336)
Sample edges after correction: [(1742206236420927754, 1816439860388848082), (1742206236420927754, 1638007824830849027), (1742206236420927754, 1376258278636855305), (1816439860388848082, 1638007824830849027), (1816439860388848082, 1376258278636855305)]
Node features shape: (1534, 5)
Labels shape: (1534,)
Edge index shape: (2, 2336)
Sample edges: [(1742206236420927754, 1816439860388848082), (1742206236420927754, 1638007824830849027), (1742206236420927754, 1376258278636855305), (1816439860388848082, 1638007824830849027), (1816439860388848082, 1376258278636855305)]


In [None]:
import numpy as np

# Create edge index based on train indices
train_mask = np.isin(edge_index[0], train_indices) & np.isin(edge_index[1], train_indices)
test_mask = np.isin(edge_index[0], test_indices) & np.isin(edge_index[1], test_indices)

# Filter the edge index based on the masks
train_edge_index = edge_index[:, train_mask]
test_edge_index = edge_index[:, test_mask]


In [None]:
!pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

# Ensure node_features and labels are still DataFrames/Series
node_features = pd.DataFrame(node_features)  # Ensure it's a DataFrame
labels = pd.Series(labels)  # Ensure labels are a Series

# Perform the train-test split while preserving indices
X_train, X_test, y_train, y_test = train_test_split(
    node_features,
    labels,
    test_size=0.2,  # 20% for testing
    random_state=42,
    shuffle=True,
)

# Get the indices for training and testing sets
train_indices = X_train.index.tolist()
test_indices = X_test.index.tolist()

# Create edge index based on train indices
train_mask = np.isin(edge_index[0], train_indices) & np.isin(edge_index[1], train_indices)
test_mask = np.isin(edge_index[0], test_indices) & np.isin(edge_index[1], test_indices)

# Filter the edge index based on the masks
train_edge_index = edge_index[:, train_mask]
test_edge_index = edge_index[:, test_mask]

# Convert the edge index to torch tensors
train_edge_index = torch.tensor(train_edge_index, dtype=torch.long)
test_edge_index = torch.tensor(test_edge_index, dtype=torch.long)

# Convert node features and labels for training to tensors
x_train = torch.tensor(X_train.values, dtype=torch.float)
y_train = torch.tensor(y_train.values, dtype=torch.float).view(-1, 1)  # Ensure y has the right shape for regression

# Create Data object for training
train_data = Data(x=x_train, edge_index=train_edge_index, y=y_train)

# Convert node features and labels for testing to tensors
x_test = torch.tensor(X_test.values, dtype=torch.float)
y_test = torch.tensor(y_test.values, dtype=torch.float).view(-1, 1)  # Ensure y has the right shape for regression

# Create Data object for testing
test_data = Data(x=x_test, edge_index=test_edge_index, y=y_test)

# Print the final Data objects
print("Train Data:", train_data)
print("Test Data:", test_data)

# Create DataLoaders
train_loader = DataLoader([train_data], batch_size=32, shuffle=True)
test_loader = DataLoader([test_data], batch_size=32, shuffle=False)

# Checking the DataLoader
for batch in train_loader:
    print("Train Batch:", batch)

for batch in test_loader:
    print("Test Batch:", batch)


Train Data: Data(x=[1227, 5], edge_index=[2, 0], y=[1227, 1])
Test Data: Data(x=[307, 5], edge_index=[2, 0], y=[307, 1])
Train Batch: DataBatch(x=[1227, 5], edge_index=[2, 0], y=[1227, 1], batch=[1227], ptr=[2])
Test Batch: DataBatch(x=[307, 5], edge_index=[2, 0], y=[307, 1], batch=[307], ptr=[2])


In [None]:
print("Original edge index shape:", edge_index.shape)
print("Number of unique nodes in edge_index:", np.unique(edge_index).shape[0])
print("Train indices unique nodes:", np.unique(train_indices).shape[0])
print("Test indices unique nodes:", np.unique(test_indices).shape[0])


Original edge index shape: (2, 2336)
Number of unique nodes in edge_index: 223
Train indices unique nodes: 1227
Test indices unique nodes: 307


In [50]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader
from sklearn.metrics import f1_score

# Step 1: Define the GNN model
class GNNModel(torch.nn.Module):
    def __init__(self, num_features):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(num_features, 16)
        self.conv2 = GCNConv(16, 2)  # Assuming binary classification (0 or 1)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# Step 2: Initialize model, optimizer, and loss function
num_features = node_features.shape[1]  # Update based on your node feature shape
model = GNNModel(num_features)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = F.cross_entropy  # Assuming binary classification

# Step 3: Convert edge_index to tensor
edge_index_tensor = torch.tensor(edge_index, dtype=torch.long)  # Directly convert to tensor

# Step 4: Create mapping from original node IDs to their indices
original_node_ids = torch.unique(edge_index_tensor)
node_id_to_index = {node_id.item(): index for index, node_id in enumerate(original_node_ids)}

# Step 5: Update edge_index to use the new indices
updated_edge_index = []
for edge in edge_index_tensor.t().tolist():
    updated_edge_index.append([node_id_to_index[edge[0]], node_id_to_index[edge[1]]])
updated_edge_index = torch.tensor(updated_edge_index, dtype=torch.long).t().contiguous()

# Step 6: Create data loaders
train_data = train_data.__class__(x=train_data.x, edge_index=updated_edge_index, y=train_data.y.long())
test_data = test_data.__class__(x=test_data.x, edge_index=updated_edge_index, y=test_data.y.long())

train_loader = DataLoader([train_data], batch_size=32, shuffle=True)
test_loader = DataLoader([test_data], batch_size=32, shuffle=False)

from sklearn.metrics import accuracy_score

# Step 7: Train the model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    train_loss = 0  # Initialize training loss

    for data in train_loader:
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)

        # Calculate loss, squeeze data.y to get rid of the singleton dimension
        loss = loss_fn(out, data.y.squeeze().long())
        loss.backward()
        optimizer.step()

        train_loss += loss.item()  # Accumulate training loss

    # Average training loss for the epoch
    train_loss /= len(train_loader)

    # Step 8: Evaluate the model on the test set
    model.eval()
    test_loss = 0  # Initialize test loss
    y_true_test = []
    y_pred_test = []

    with torch.no_grad():
        for data in test_loader:
            out = model(data.x, data.edge_index)
            # Calculate test loss
            loss = loss_fn(out, data.y.squeeze().long())
            test_loss += loss.item()  # Accumulate test loss

            # Gather predictions and true labels
            y_true_test.extend(data.y.squeeze().tolist())
            y_pred_test.extend(out.argmax(dim=1).tolist())

    # Average test loss for the epoch
    test_loss /= len(test_loader)

    # Calculate metrics
    test_acc = accuracy_score(y_true_test, y_pred_test)
    test_f1 = f1_score(y_true_test, y_pred_test, average='weighted')

    # Display the metrics
    print(f'Epoch: {epoch + 1} | TrainLoss: {train_loss:.2f} | TestLoss: {test_loss:.2f} | TestAcc: {test_acc:.2f} | TestF1: {test_f1:.2f}')




# Step 8: Evaluate the model on the test set
y_true_test = []
y_pred_test = []

with torch.no_grad():
    for data in test_loader:
        out = model(data.x, data.edge_index)
        y_true_test.extend(data.y.tolist())
        y_pred_test.extend(out.argmax(dim=1).tolist())

# Final F1 score on the test set
f1_test = f1_score(y_true_test, y_pred_test, average='weighted')
print(f'Final Test F1 Score: {f1_test:.4f}')


Epoch: 1 | TrainLoss: 0.75 | TestLoss: 0.62 | TestAcc: 0.54 | TestF1: 0.59
Epoch: 2 | TrainLoss: 0.70 | TestLoss: 0.59 | TestAcc: 0.80 | TestF1: 0.73
Epoch: 3 | TrainLoss: 0.67 | TestLoss: 0.58 | TestAcc: 0.81 | TestF1: 0.73
Epoch: 4 | TrainLoss: 0.65 | TestLoss: 0.56 | TestAcc: 0.81 | TestF1: 0.72
Epoch: 5 | TrainLoss: 0.62 | TestLoss: 0.55 | TestAcc: 0.81 | TestF1: 0.72
Epoch: 6 | TrainLoss: 0.60 | TestLoss: 0.54 | TestAcc: 0.81 | TestF1: 0.72
Epoch: 7 | TrainLoss: 0.58 | TestLoss: 0.53 | TestAcc: 0.81 | TestF1: 0.72
Epoch: 8 | TrainLoss: 0.57 | TestLoss: 0.52 | TestAcc: 0.81 | TestF1: 0.72
Epoch: 9 | TrainLoss: 0.56 | TestLoss: 0.51 | TestAcc: 0.81 | TestF1: 0.72
Epoch: 10 | TrainLoss: 0.54 | TestLoss: 0.51 | TestAcc: 0.81 | TestF1: 0.72
Epoch: 11 | TrainLoss: 0.55 | TestLoss: 0.51 | TestAcc: 0.81 | TestF1: 0.72
Epoch: 12 | TrainLoss: 0.54 | TestLoss: 0.51 | TestAcc: 0.81 | TestF1: 0.72
Epoch: 13 | TrainLoss: 0.53 | TestLoss: 0.51 | TestAcc: 0.81 | TestF1: 0.72
Epoch: 14 | TrainLoss