<a href="https://colab.research.google.com/github/steliosg23/Data_Challenge_2025/blob/main/2_Node_Processing_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Cell 1: Import Libraries



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, classification_report, confusion_matrix
from tqdm import tqdm
import networkx as nx
import pickle


### Cell 2: Mount Google Drive


In [None]:
# Attach Google Drive so the challenge data stored under MyDrive is reachable.
drive.mount('/content/drive')

# Root folder of the challenge files (adjust to your own Drive layout).
# The trailing slash matters: the paths below are built by plain concatenation.
base_dir = '/content/drive/MyDrive/Data Science AUEB/Data Challenge/data/'

# Locations of the raw input files.
edgelist_path = base_dir + 'edgelist.txt'
test_path = base_dir + 'test.txt'

# Read the augmented training table, then the header-less, comma-separated
# edge list whose two columns are named 'source' and 'target'.
augmented_train_df = pd.read_csv(base_dir + 'augmented_train_df.csv')
edgelist = pd.read_csv(
    edgelist_path,
    sep=',',
    header=None,
    names=['source', 'target'],
)

# Quick sanity check: print the first rows of both frames.
print(augmented_train_df.head(), edgelist.head(), sep='\n')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   product_id  class_label                              title_and_description  \
0       56356            5  reelight sl flashing extended bicycle headligh...   
1      169687            2    troy sraidltcxbt troy deltacx black sraidltcxbt   
2      226442            5  nashbar torque wrench take agony anguish tryin...   
3      197572            1                                 casio men watch gv   
4      126461            2  five pack alabama style umbrella arm quot rig ...   

    price  
0   87.99  
1  217.07  
2   64.99  
3   85.00  
4   21.99  
   source  target
0  251528  237411
1  100805   74791
2   38634   97747
3  247470   77089
4  267060  250490


In [None]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from scipy.stats import zscore

# Assemble an undirected graph from the (source, target) pairs held in `edgelist`.
G = nx.Graph()
G.add_edges_from(edgelist.values)

# Report the size of the graph prior to any pruning.
before_nodes = G.number_of_nodes()
before_edges = G.number_of_edges()
print("Before filtering:")
print(f"Number of nodes: {before_nodes}")
print(f"Number of edges: {before_edges}")

# Degree centrality per node, and the same scores flattened into an array
# (array order follows the dict's iteration order).
degree_centrality = nx.degree_centrality(G)
degree_values = np.array(list(degree_centrality.values()))

# Quartile cut-offs of the centrality distribution.
# Only the 25th percentile is used for filtering below.
degree_25th = np.percentile(degree_values, q=25)
degree_75th = np.percentile(degree_values, q=75)

# Percentile rule: keep nodes whose centrality is strictly above the 25th percentile.
degree_filtered_nodes = [
    node for node in degree_centrality if degree_centrality[node] > degree_25th
]

# Alternative rule (computed but not used for the final selection):
# keep nodes whose centrality z-score lies strictly within one standard
# deviation of the mean, i.e. drop the extremes with |z| >= 1.
degree_zscore = zscore(degree_values)
degree_zscore_filtered_nodes = [
    node for node, z in zip(degree_centrality, degree_zscore) if -1 < z < 1
]

# The final node set comes from the percentile rule alone.
important_nodes = degree_filtered_nodes

# Induce an independent subgraph on the retained nodes.
G_preprocessed = G.subgraph(important_nodes).copy()

# Report the size of the graph after pruning.
after_nodes = len(important_nodes)
after_edges = G_preprocessed.number_of_edges()
print("After filtering:")
print(f"Number of nodes: {after_nodes}")
print(f"Number of edges: {after_edges}")


Before filtering:
Number of nodes: 276453
Number of edges: 1811087
After filtering:
Number of nodes: 202332
Number of edges: 1655182


In [None]:
import networkx as nx
import pandas as pd

# Step 1: Load the preprocessed graph (if not already loaded)
# Assuming G_preprocessed is already available as per the context provided

# Step 2: Create the get_neighbors function for G_preprocessed
def get_neighbors(product_id, graph):
    """Return the neighbors of ``product_id`` in ``graph`` as a list.

    Args:
        product_id: Node identifier to look up.
        graph: Graph-like object that supports ``in`` membership tests and a
            ``neighbors(node)`` method (e.g. a ``networkx.Graph``).

    Returns:
        list: The nodes adjacent to ``product_id``. An empty list is returned
        when ``product_id`` is not a node of ``graph`` (for example because it
        was removed by an earlier filtering step) rather than letting
        ``graph.neighbors`` raise for the unknown node.
    """
    # A pruned graph may no longer contain every product; treat those as isolated.
    if product_id not in graph:
        return []
    return list(graph.neighbors(product_id))

# Step 3: Expand the DataFrame such that each product_id has a row for each of its neighbors.
# The neighbor list of every product is looked up once, in row order. The frame is
# then expanded with a single positional gather instead of copying one Series per
# (product, neighbor) pair, which is prohibitively slow for large graphs.
neighbor_lists = [
    get_neighbors(product_id, G_preprocessed)
    for product_id in augmented_train_df['product_id']
]

# Explicit integer dtype so np.repeat also accepts an empty count array.
neighbor_counts = np.array(
    [len(neighbors) for neighbors in neighbor_lists], dtype=np.intp
)

# Positional index of each source row, repeated once per neighbor. Rows whose
# product has no neighbors get a count of 0 and therefore do not appear.
row_positions = np.repeat(np.arange(len(augmented_train_df)), neighbor_counts)

# Step 4: Create the new expanded DataFrame
# `iloc` keeps the original index labels (each repeated once per neighbor) and the
# original column order; `neighbor_product_id` is appended as the last column, with
# neighbors flattened in the same row-major order as `row_positions`.
expanded_df = augmented_train_df.iloc[row_positions].assign(
    neighbor_product_id=[
        neighbor for neighbors in neighbor_lists for neighbor in neighbors
    ]
)



In [None]:
# Preview the expanded frame: one row per (product, neighbor) pair.
expanded_df

Unnamed: 0,product_id,class_label,title_and_description,price,neighbor_product_id
0,56356,5,reelight sl flashing extended bicycle headligh...,87.99,191931
0,56356,5,reelight sl flashing extended bicycle headligh...,87.99,70844
0,56356,5,reelight sl flashing extended bicycle headligh...,87.99,53239
0,56356,5,reelight sl flashing extended bicycle headligh...,87.99,120356
0,56356,5,reelight sl flashing extended bicycle headligh...,87.99,197800
...,...,...,...,...,...
124327,159573,15,bull style ft bull ft australian whip style mu...,19.79,182614
124327,159573,15,bull style ft bull ft australian whip style mu...,19.79,125879
124327,159573,15,bull style ft bull ft australian whip style mu...,19.79,215765
124327,159573,15,bull style ft bull ft australian whip style mu...,19.79,81130


In [None]:
# Save the expanded DataFrame as expanded_df.csv inside base_dir.
# index=False leaves the (repeated) row index out of the file.
expanded_df.to_csv(f'{base_dir}expanded_df.csv', index=False)