In [116]:
import pandas as pd
import numpy as np
import stellargraph as sg
from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.layer import GraphSAGE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models, optimizers, losses, metrics


In [117]:
import os
import pandas as pd

# Path to the folders containing the data
dyslexia_folder = "Data/Dyslexic"
control_folder = "Data/Control"

# Load dyslexia samples
dyslexia_data = []
for filename in os.listdir(dyslexia_folder):
    if filename.endswith(".csv"):
        data = pd.read_csv(os.path.join(dyslexia_folder, filename))
        data["label"] = "dyslexia"  # Add label column
        dyslexia_data.append(data)

# Load control samples
control_data = []
for filename in os.listdir(control_folder):
    if filename.endswith(".csv"):
        sample_data = pd.read_csv(os.path.join(control_folder, filename))
        sample_data["label"] = "control"  # Add label column
        control_data.append(sample_data)

# Display dyslexia data after adding the label column
print("Dyslexia Data:")
for idx, sample_data in enumerate(dyslexia_data):
    print(f"Sample {idx + 1}:")
    print(sample_data.head())  # Use sample_data instead of data
    print()

# Display control data after adding the label column
print("Control Data:")
for idx, sample_data in enumerate(control_data):
    print(f"Sample {idx + 1}:")
    print(sample_data.head())
    print()


Dyslexia Data:
Sample 1:
   Unnamed: 0     T       LX       LY       RX       RY     label
0           0   0.0  0.00000  0.00000  0.00000  0.00000  dyslexia
1           1  20.0 -0.00001  0.00000  0.65535 -0.65536  dyslexia
2           2  40.0 -0.00001 -0.65537  0.65536 -1.31073  dyslexia
3           3  60.0  0.65536 -1.96609  0.65536 -1.96609  dyslexia
4           4  80.0  0.65536 -1.96609  1.31072 -1.96609  dyslexia

Sample 2:
   Unnamed: 0     T       LX       LY       RX       RY     label
0           0   0.0  0.00000  0.00000  0.00000  0.00000  dyslexia
1           1  20.0  0.00000  0.00000  0.00000 -0.65536  dyslexia
2           2  40.0  0.00000 -0.00001  0.65536 -0.00001  dyslexia
3           3  60.0  0.65536 -0.00001  0.65537 -0.00001  dyslexia
4           4  80.0  0.65536  0.65535  0.65537 -0.00001  dyslexia

Sample 3:
   Unnamed: 0     T       LX       LY       RX       RY     label
0           0   0.0  0.00000  0.00000  0.00000  0.00000  dyslexia
1           1  20.0  0.00000 

In [118]:
print(sample_data.columns)


Index(['Unnamed: 0', 'T', 'LX', 'LY', 'RX', 'RY', 'label'], dtype='object')


In [119]:
print(data.columns)

Index(['Unnamed: 0', 'T', 'LX', 'LY', 'RX', 'RY', 'label'], dtype='object')


In [120]:
# Drop the "Unnamed: 0" column from each DataFrame in the dyslexia data list
for i in range(len(dyslexia_data)):
    dyslexia_data[i] = dyslexia_data[i].drop(columns=["Unnamed: 0"])

# Print each sample of dyslexia data separately
print("Dyslexia Data:")
for i, sample in enumerate(dyslexia_data):
    print(f"\nSample {i+1}:")
    print(sample.head())

# Extract features X and labels y from the last dyslexia sample
X = dyslexia_data[-1][["LX", "LY", "RX", "RY", "T"]]  # Assuming the last sample is the one you want
y = dyslexia_data[-1]["label"]  # Assuming the last sample is the one you want

# Encode labels (if necessary)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)


Dyslexia Data:

Sample 1:
      T       LX       LY       RX       RY     label
0   0.0  0.00000  0.00000  0.00000  0.00000  dyslexia
1  20.0 -0.00001  0.00000  0.65535 -0.65536  dyslexia
2  40.0 -0.00001 -0.65537  0.65536 -1.31073  dyslexia
3  60.0  0.65536 -1.96609  0.65536 -1.96609  dyslexia
4  80.0  0.65536 -1.96609  1.31072 -1.96609  dyslexia

Sample 2:
      T       LX       LY       RX       RY     label
0   0.0  0.00000  0.00000  0.00000  0.00000  dyslexia
1  20.0  0.00000  0.00000  0.00000 -0.65536  dyslexia
2  40.0  0.00000 -0.00001  0.65536 -0.00001  dyslexia
3  60.0  0.65536 -0.00001  0.65537 -0.00001  dyslexia
4  80.0  0.65536  0.65535  0.65537 -0.00001  dyslexia

Sample 3:
      T       LX       LY       RX       RY     label
0   0.0  0.00000  0.00000  0.00000  0.00000  dyslexia
1  20.0  0.00000 -0.65536 -0.00001  0.00000  dyslexia
2  40.0 -0.00001 -0.65537  0.65534 -1.96609  dyslexia
3  60.0  0.65534 -2.62145  1.31070 -2.62146  dyslexia
4  80.0  1.31069 -2.62145  1.96605

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [122]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [123]:
# Define preprocessing steps for numerical features
numeric_features = ["LX", "LY", "RX", "RY", "T"]
numeric_transformer = StandardScaler()

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply preprocessing to training set
X_train_scaled = X_train.copy()
X_train_scaled[numeric_features] = numeric_transformer.fit_transform(X_train_scaled[numeric_features])

# Apply preprocessing to testing set
X_test_scaled = X_test.copy()
X_test_scaled[numeric_features] = numeric_transformer.transform(X_test_scaled[numeric_features])


In [124]:
# Create the StellarGraph object with the training data and edges
graph = sg.StellarGraph(X_train_scaled, edges=None)

In [125]:
# Define the GraphSAGE node generator
generator = GraphSAGENodeGenerator(graph, batch_size=50, num_samples=[5, 5])



In [126]:
# Define the GraphSAGE model architecture
graphsage_model = GraphSAGE(
    layer_sizes=[32, 32], generator=generator, bias=True, dropout=0.5
)
# Build the model
x_inp, x_out = graphsage_model.in_out_tensors()
prediction = layers.Dense(units=1, activation="sigmoid")(x_out)
model = models.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.01),
    loss=losses.binary_crossentropy,
    metrics=["acc"],
)



In [127]:
model = models.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.01),
    loss=losses.binary_crossentropy,
    metrics=["acc"],
)




In [128]:
history = model.fit(generator.flow(X_train.index, y_train), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [129]:
print("Unique node IDs in the graph:", len(graph.nodes()))


Unique node IDs in the graph: 1599


In [130]:
print("Unique node IDs in the test data:", len(X_test.index.unique()))


Unique node IDs in the test data: 400


In [131]:
missing_nodes = set(X_test.index.unique()) - set(graph.nodes())
print("Missing nodes in the graph:", missing_nodes)


Missing nodes in the graph: {23, 29, 30, 32, 44, 45, 49, 56, 59, 63, 65, 67, 69, 70, 73, 76, 78, 99, 100, 109, 111, 115, 120, 123, 124, 128, 135, 162, 163, 168, 173, 175, 185, 188, 194, 196, 203, 210, 211, 212, 218, 220, 231, 233, 237, 239, 247, 251, 254, 256, 261, 266, 270, 275, 281, 289, 297, 298, 303, 305, 306, 307, 316, 322, 324, 331, 342, 344, 350, 351, 352, 353, 354, 361, 366, 367, 368, 374, 382, 383, 393, 394, 411, 414, 416, 420, 422, 427, 429, 432, 433, 438, 450, 453, 462, 464, 471, 478, 479, 480, 482, 483, 485, 486, 495, 507, 514, 519, 526, 527, 529, 532, 534, 535, 538, 543, 548, 552, 555, 561, 570, 576, 581, 582, 583, 584, 590, 593, 596, 599, 607, 611, 618, 620, 628, 637, 650, 651, 670, 674, 679, 680, 694, 704, 705, 706, 707, 710, 715, 730, 733, 741, 744, 745, 746, 755, 757, 759, 771, 774, 777, 780, 781, 785, 792, 802, 806, 807, 808, 818, 819, 824, 832, 838, 845, 849, 855, 861, 865, 873, 886, 887, 892, 904, 905, 906, 907, 914, 923, 925, 926, 936, 939, 942, 943, 948, 949, 952,

In [132]:
import pandas as pd

# Assuming 'test_data' is your original test data DataFrame
# 'graph' is your graph object containing nodes

# Calculate the unique node IDs in the graph and test data
print("Unique node IDs in the graph:", len(graph.nodes()))
print("Unique node IDs in the test data:", len(X_test.index.unique()))

# Identify missing nodes in the graph compared to the test data
missing_nodes = list(set(X_test.index.unique()) - set(graph.nodes()))
#print("Missing nodes in the graph:", missing_nodes)

# Create new rows for missing nodes
missing_nodes_data = pd.DataFrame(index=missing_nodes, columns=X_test.columns)

# Assign default label values (if applicable)
missing_nodes_data['label'] = 'default_label'

# Concatenate missing nodes data with original test data
adjusted_test_data = pd.concat([X_test, missing_nodes_data])

# Now 'adjusted_test_data' contains all nodes present in the graph
#print(adjusted_test_data)


Unique node IDs in the graph: 1599
Unique node IDs in the test data: 400


In [133]:
# Create new rows for missing nodes
missing_nodes_data = pd.DataFrame(index=missing_nodes, columns=X_test.columns)

# Assign default label values (if applicable)
missing_nodes_data['label'] = 'default_label'

# Concatenate missing nodes data with original test data
adjusted_test_data = pd.concat([X_test, missing_nodes_data])

# Now 'adjusted_test_data' contains all nodes present in the graph


In [134]:
print("Length of adjusted_test_data.index:", len(adjusted_test_data.index))

Length of adjusted_test_data.index: 800


In [135]:
problematic_nodes = adjusted_test_data.index[~adjusted_test_data.index.isin(graph.nodes())]
print("Problematic node IDs:", problematic_nodes)


Problematic node IDs: Index([ 256,  352,  298,  581, 1288, 1765,  420, 1587,   65, 1611,
       ...
       1964, 1973, 1976, 1977, 1985, 1986, 1989, 1990, 1991, 1997],
      dtype='int64', length=800)


In [136]:
print("Length of adjusted_test_data.index:", len(adjusted_test_data.index))
print("Length of y_test:", len(y_test))


Length of adjusted_test_data.index: 800
Length of y_test: 400


In [137]:
# Check node indices in the graph
graph_node_indices = list(graph.nodes())

# Compare with test data indices
print("Graph Node Indices:", graph_node_indices)
print("Test Data Indices:", X_test.index)

# Reindex test data if necessary
X_test.reset_index(drop=True, inplace=True)

# Verify data alignment and consistency of preprocessing steps

# Debug data pipeline and graph creation if needed


Graph Node Indices: [240, 812, 1824, 1244, 1084, 579, 365, 1746, 1239, 1646, 1344, 1580, 985, 71, 1022, 259, 51, 198, 1189, 1760, 332, 1358, 591, 554, 1596, 1736, 1441, 1811, 1872, 426, 1744, 1503, 588, 979, 1975, 1392, 721, 398, 567, 436, 192, 1761, 787, 1918, 619, 1861, 1614, 1938, 141, 720, 2, 1852, 1868, 1157, 339, 630, 494, 572, 265, 376, 1949, 1538, 439, 271, 184, 937, 244, 544, 445, 1036, 1593, 752, 909, 1797, 678, 415, 273, 1720, 1813, 1453, 250, 413, 614, 1604, 610, 310, 1510, 381, 1788, 425, 530, 1317, 1237, 1355, 408, 1613, 692, 1553, 931, 1682, 1061, 1010, 1323, 654, 182, 1444, 1173, 170, 292, 1463, 585, 300, 1875, 1650, 1114, 272, 1548, 963, 900, 371, 1658, 1883, 978, 1922, 518, 334, 693, 1165, 1783, 602, 15, 1696, 1381, 1684, 1389, 964, 465, 767, 199, 1290, 1054, 1994, 1299, 1281, 1622, 1399, 1412, 1179, 1557, 43, 274, 1333, 101, 1361, 727, 1226, 107, 1554, 497, 1979, 879, 1083, 617, 1105, 1359, 613, 701, 72, 1220, 1509, 1309, 1144, 481, 1067, 506, 1164, 788, 1988, 1617, 

In [138]:
import numpy as np

# Reindexing the graph nodes to be a contiguous range starting from 0
graph_node_indices = np.arange(len(graph_node_indices))

# Now, graph_node_indices should contain a contiguous range of indices starting from 0
print("Reindexed Graph Node Indices:", graph_node_indices)


Reindexed Graph Node Indices: [   0    1    2 ... 1596 1597 1598]


In [139]:
test_data_indices = np.arange(len(test_data_indices))

# Now, graph_node_indices should contain a contiguous range of indices starting from 0
print("Reindexed test data Node Indices:", test_data_indices)


Reindexed test data Node Indices: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 

In [140]:
import numpy as np

# Assuming reindexed_graph_node_indices is the array of reindexed graph node indices
# and test_data_indices is the range index of your test data

reindexed_graph_node_indices = np.array([0, 1, 2, ..., 1596, 1597, 1598])
test_data_indices = np.arange(0, 400)  # Assuming your test data has 400 entries

print("Length of reindexed graph node indices:", len(reindexed_graph_node_indices))
print("Length of test data indices:", len(test_data_indices))


Length of reindexed graph node indices: 7
Length of test data indices: 400


In [141]:
import numpy as np

reindexed_graph_node_indices = np.array([0, 1, 2, ..., 1596, 1597, 1598])
test_data_indices = np.arange(0, 400)

# Check if the lengths are the same
if len(reindexed_graph_node_indices) == len(test_data_indices):
    # Check if all elements match
    if np.all(reindexed_graph_node_indices == test_data_indices):
        print("Indices match between reindexed graph node indices and test data indices.")
    else:
        print("Indices do not match between reindexed graph node indices and test data indices.")
else:
    print("Lengths of reindexed graph node indices and test data indices do not match.")


Lengths of reindexed graph node indices and test data indices do not match.


In [142]:
# Extract the node IDs from the graph
graph_node_ids = graph.nodes()
graph_node_ids = np.array(graph_node_ids)

# Get the node indices corresponding to the indices in X_test
test_node_indices = X_test.index

# Find the node IDs corresponding to the node indices in X_test
test_node_ids = graph_node_ids[test_node_indices]

# Update X_test indices to match the node IDs
X_test.index = test_node_ids

# Initialize the generator using the updated X_test indices
generator = GraphSAGENodeGenerator(graph, batch_size=50, num_samples=[5, 5])

# Step 3: Evaluate the model using the adjusted X_test indices
eval_results = model.evaluate(generator.flow(X_test.index, y_test))

# Print test set metrics
print("\nTest Set Metrics:")
print("Loss: {:.4f}".format(eval_results[0]))
print("Accuracy: {:.4f}".format(eval_results[1]))


Test Set Metrics:
Loss: 0.0429
Accuracy: 1.0000


In [150]:
# Find intersection of valid_node_ids and index of y_test_series
valid_node_ids_in_y_test = np.intersect1d(valid_node_ids, y_test_series.index)

# Filter out y_test values corresponding to valid node IDs
y_test_valid = y_test_series.loc[valid_node_ids_in_y_test]

# Re-evaluate the flow using the adjusted X_test indices and filtered y_test values
eval_results = model.evaluate(generator.flow(valid_node_ids_in_y_test, y_test_valid))

# Print test set metrics
print("\nTest Set Metrics:")
for name, val in zip(model.metrics_names, eval_results):
    print(f"{name}: {val}")



Test Set Metrics:
loss: 0.04371925815939903
acc: 1.0


In [154]:
# Filter out y_test values corresponding to valid node IDs
y_test_valid = y_test_series.loc[y_test_series.index.intersection(valid_node_ids)]

# Filter the valid node IDs to only include those for which we have target values
valid_node_ids_with_targets = y_test_valid.index.intersection(valid_node_ids)

# Re-evaluate the flow using the adjusted X_test indices and filtered y_test values
eval_results = model.evaluate(generator.flow(valid_node_ids_with_targets, y_test_valid))

# Print test set metrics
print("\nTest Set Metrics:")
for name, val in zip(model.metrics_names, eval_results):
    print(f"{name}: {val}")



Test Set Metrics:
loss: 0.04371925815939903
acc: 1.0


In [156]:
model.summary()


Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_14 (InputLayer)       [(None, 5, 5)]               0         []                            
                                                                                                  
 input_15 (InputLayer)       [(None, 25, 5)]              0         []                            
                                                                                                  
 input_13 (InputLayer)       [(None, 1, 5)]               0         []                            
                                                                                                  
 reshape_16 (Reshape)        (None, 1, 5, 5)              0         ['input_14[0][0]']            
                                                                                            

In [171]:
import numpy as np  
import pandas as pd
import dgl

In [172]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt