### Record Serialization

In [1]:
import pandas as pd
import os 

In [2]:
os.listdir('../data/wdc-block/l_train_l/')

['tableA.csv', 'tableB.csv', 'test.csv', 'train.csv', 'valid.csv']

In [3]:
origin = '../data/wdc-block'
# Map each dataset variant (one sub-folder of `origin`) to the combined size of
# the files directly inside it, in megabytes (10^6 bytes) rounded to 2 decimals.
subfolders_mb = {}
for subfolder in os.listdir(origin):
    folder = f'{origin}/{subfolder}'
    # Stray files next to the dataset folders (e.g. OS metadata files) cannot
    # be listed with os.listdir, so only directories are measured.
    if not os.path.isdir(folder):
        continue
    size = 0
    for file in os.listdir(folder):
        size += os.path.getsize(f'{folder}/{file}')
    subfolders_mb[subfolder] = round(size/1000000,2)

In [4]:
# Rebuild the size table ordered from the smallest to the largest folder
# (dicts keep insertion order), then display it.
subfolders_mb = {
    name: megabytes
    for name, megabytes in sorted(subfolders_mb.items(), key=lambda pair: pair[1])
}
subfolders_mb

{'s_train_s': 5.66,
 's_train_m': 5.82,
 's_train_l': 6.59,
 'm_train_s': 112.15,
 'm_train_m': 112.3,
 'm_train_l': 113.07,
 'l_train_s': 1128.89,
 'l_train_m': 1129.05,
 'l_train_l': 1129.81}

In [5]:
origin_s = f'{origin}/s_train_s'
# The unpacking at the bottom of this cell is positional, and os.listdir()
# returns names in arbitrary order, so the files are named explicitly in the
# order the unpacking expects instead of being discovered from the directory.
list_of_files = ['tableA.csv', 'tableB.csv', 'test.csv', 'train.csv', 'valid.csv']
list_of_dfs = []

for file in list_of_files:
    print(file)
    df = pd.read_csv(os.path.join(origin_s, file))
    list_of_dfs.append(df)

# valid.csv is bound to the name `dev`.
tableA, tableB, test, train, dev = list_of_dfs
del list_of_dfs

tableA.csv
tableB.csv
test.csv
train.csv
valid.csv


In [6]:
# Number of distinct values in every column of the training pairs.
for i in train.columns:
    print(i,train[i].nunique())

# Absolute sizes of the three splits, followed by their share of all pairs.
# The total is deliberately not called `sum`: that would shadow the builtin
# sum() for every cell executed afterwards in the same kernel.
total = len(train) + len(dev) + len(test)
print(len(train),len(dev),len(test))
print(len(train)/total,len(dev)/total,len(test)/total)

ltable_id 313
rtable_id 370
label 2
cluster_id_left 231
cluster_id_right 269
pair_id 674
674 336 4500
0.12232304900181488 0.060980036297640657 0.8166969147005445


In [7]:
train['label'].value_counts()

0    408
1    266
Name: label, dtype: int64

In [8]:
dev['label'].value_counts()

0    203
1    133
Name: label, dtype: int64

In [9]:
test['label'].value_counts()

0    4000
1     500
Name: label, dtype: int64

In [10]:
# Show tableA's columns together with every column name that appears in only
# one of the two tables. The symmetric difference also catches columns that
# exist only in tableB, which a one-sided `A - B` would miss; an empty set
# means both tables have the same column names.
tableA.columns,set(tableA.columns)^set(tableB.columns)

(Index(['id', 'brand', 'title', 'description', 'price', 'cluster_id',
        'pricecurrency', 'page_url'],
       dtype='object'),
 set())

#### Implementing $t = \mathrm{Serial}(r)$.

In [36]:
def serialize_record(record):
    """Serialize one record into a single ``[Col] name [Val] value`` string.

    Parameters
    ----------
    record : pandas.Series
        One row of a table; the index holds the attribute names and the
        values hold the attribute values.

    Returns
    -------
    str
        ``"[Col] <name> [Val] <value>"`` for every attribute whose value is
        not null (as judged by ``pd.notna``), in index order, joined by single
        spaces. An empty string when the record has no non-null attributes.
    """
    # Iterating (label, value) pairs avoids a second label lookup per attribute
    # and keeps working when a label occurs more than once; `record[label]`
    # would then return a Series, whose truth value is ambiguous.
    return " ".join(
        f"[Col] {column} [Val] {value}"
        for column, value in record.items()
        if pd.notna(value)  # Skip null values
    )

# Apply the record serialization function to each row of the DataFrame.
# A 'serialized_record' column left over from an earlier run of this cell is
# dropped before serializing, so re-running the cell does not embed the previous
# serialization inside the new one.
tableA['serialized_record'] =  tableA.drop(columns='serialized_record', errors='ignore').apply(serialize_record, axis=1)
tableB['serialized_record'] =  tableB.drop(columns='serialized_record', errors='ignore').apply(serialize_record, axis=1)

#### Create my own graph (My own dataset will not have cluster_ids)

In [80]:
# Stack tableA on top of tableB with a fresh 0..n-1 row index, and discard the
# per-table 'id' column.
combined_tables = pd.concat([tableA, tableB], ignore_index=True).drop(columns='id')

In [81]:
# Number of distinct cluster ids in the combined table.
len(set(combined_tables['cluster_id']))

3591

In [82]:
# Same count taken from the two source tables, as a cross-check of the
# combined table.
len(set(tableA['cluster_id']) | set(tableB['cluster_id']))

3591

In [133]:
import networkx as nx

# One node per row of combined_tables, identified by the row's index label.
# Iterating the dict yields only its keys, so the serialized text itself is
# not stored on the nodes.
graph = nx.Graph()
graph.add_nodes_from(combined_tables['serialized_record'].to_dict())

# Row labels grouped by cluster_id; every group is wired up as a clique so that
# rows sharing a cluster_id end up pairwise connected.
node_list = combined_tables.reset_index().groupby('cluster_id')['index'].agg(lambda x : x.tolist())
graph.add_edges_from(
    edge
    for members in node_list
    for edge in nx.complete_graph(members).edges()
)

#### Never mind, I'll do that later.

In [213]:
train

Unnamed: 0,ltable_id,rtable_id,label,cluster_id_left,cluster_id_right,pair_id
0,1124,545,0,918,501,5355657#45550541
1,2424,643,0,569,580,60508722#37530368
2,1178,303,0,191,286,98077329#36811471
3,2200,414,0,1466,387,46246978#29319775
4,214,186,0,209,178,69863476#69927371
...,...,...,...,...,...,...
669,3382,3343,1,2094,2094,25524192#96857288
670,3409,3370,1,2121,2121,95988#17111826
671,3417,3378,1,2129,2129,20702772#26727944
672,3430,3391,1,2142,2142,16072741#29209713


In [198]:
train.sort_values(['label','ltable_id','rtable_id'],ascending=[False,True,True])

Unnamed: 0,ltable_id,rtable_id,label,cluster_id_left,cluster_id_right,pair_id
247,13,13,1,13,13,35431801#45291729
272,13,1167,1,13,13,35431801#48307629
401,13,1177,1,13,948,35431801#96975145
287,13,1481,1,13,13,35431801#89896876
428,20,238,1,20,228,85241565#49536402
...,...,...,...,...,...,...
594,3840,2959,0,2432,1710,80461165#32345346
581,3843,3034,0,2432,1785,94452187#91965997
552,3857,3254,0,2432,2005,18824446#4518159
644,3880,3171,0,2432,1922,69731456#78541132


In [217]:
train[train.duplicated(['cluster_id_left','cluster_id_right'])].sort_values(['cluster_id_left','cluster_id_right'])

Unnamed: 0,ltable_id,rtable_id,label,cluster_id_left,cluster_id_right,pair_id
272,13,1167,1,13,13,35431801#48307629
287,13,1481,1,13,13,35431801#89896876
323,1930,1481,1,13,13,56230100#89896876
422,1930,1167,1,13,13,56230100#48307629
356,1930,163,1,13,157,56230100#64773733
...,...,...,...,...,...,...
581,3843,3034,0,2432,1785,94452187#91965997
595,3570,3034,0,2432,1785,94490280#91965997
572,3478,3057,0,2432,1808,66582202#72984007
552,3857,3254,0,2432,2005,18824446#4518159


In [169]:
combined_tables.sort_values('cluster_id')

Unnamed: 0,brand,title,description,price,cluster_id,pricecurrency,page_url,serialized_record
0,,Jabra Evolve 75 incl. charging stand UC Stereo,,21990,0,,,[Col] id [Val] 0 [Col] title [Val] Jabra Evolv...
2307,JABRA,JABRA EVOLVE 75+ STEREO UC HEADSET - ON-EAR,JABRA EVOLVE 75+ STEREO UC HEADSET - ON-EAR,204.12,0,,,[Col] id [Val] 2307 [Col] brand [Val] JABRA [C...
5000,,Jabra Evolve 75 UC Stereo Headset | 7599-838-199,Jabra Evolve 75 UC Stereo Headset: MSRP $346.0...,346.00,0,,,[Col] id [Val] 0 [Col] title [Val] Jabra Evolv...
7780,,Jabra Evolve 75 incl. charging stand UC Stereo,,21990,0,,,[Col] id [Val] 2780 [Col] title [Val] Jabra Ev...
7190,,Sony Wireless Digital Noise Cancelling Headpho...,Wireless Noise Cancelling Headphones,388.0000,1,,,[Col] id [Val] 2190 [Col] title [Val] Sony Wir...
...,...,...,...,...,...,...,...,...
9594,,"Glamorous Decorative Resin Vase, Brown",,138.43,79808975,USD,wdc_products_additional_rows,[Col] id [Val] 4594 [Col] title [Val] Glamorou...
9730,Second Nature,Good Luck You Can Do It Greeting Card,You are viewing:A pretty hand-finished greetin...,2.79,80071877,GBP,wdc_products_additional_rows,[Col] id [Val] 4730 [Col] brand [Val] Second N...
9766,Be My Bear,"Jet Bear 16\"" Baby Heartbeat Bear",,17.5,80187698,GBP,wdc_products_additional_rows,[Col] id [Val] 4766 [Col] brand [Val] Be My Be...
9655,Streets Ahead,Pastel Alphabety Wallpaper 420 x 297mm for Dol...,Suitable for 12th scale dolls houses Approxima...,1.99,80366037,GBP,wdc_products_additional_rows,[Col] id [Val] 4655 [Col] brand [Val] Streets ...


In [137]:
# Total number of edges in the graph.
graph.number_of_edges()

630509

In [165]:
graph.has_edge(0,1)

False

In [126]:
# Materialise the connected components (one set of node labels per component).
list(nx.connected_components(graph))


[{0, 2307, 5000, 7780},
 {1, 806, 808, 2694, 5001, 5280, 7190, 7212},
 {2, 1653, 2179, 2685, 5002, 7063, 7144, 7736},
 {3, 1194, 1707, 2497, 5003, 5936, 6636, 7604},
 {4, 5004, 6995},
 {5, 716, 771, 5005, 5641, 7655},
 {6, 570, 991, 5006, 7094},
 {7, 1936, 5007, 7632},
 {8, 5008},
 {9, 5009},
 {10, 2695, 2698, 5010, 6841, 7597},
 {11, 631, 2734, 5011, 6041, 6335, 7463},
 {12, 2476, 5012},
 {13, 1930, 5013, 6167, 6481},
 {14, 2774, 5014, 6398},
 {15, 2294, 5015, 6001, 6139},
 {16, 2147, 5016, 6599, 7229, 7449},
 {17, 1880, 2575, 5017, 6716, 6911, 7370},
 {18, 5018},
 {19, 1747, 2718, 5019, 7112, 7616},
 {20, 1908, 5020, 7424},
 {21, 800, 5021, 5713},
 {22, 1528, 2383, 5022, 6587, 7012, 7672},
 {23, 870, 1378, 2001, 2604, 5023, 5640, 6792, 7249, 7478},
 {24, 1318, 2256, 5024},
 {25, 748, 1949, 5025},
 {26, 1600, 5026},
 {27, 5027, 5950},
 {28, 2667, 5028, 6770},
 {29, 376, 5029, 6879, 7341},
 {30, 1324, 2384, 5030, 6430},
 {31, 1067, 5031},
 {32, 5032},
 {33, 198, 2076, 2438, 5033, 5962,

In [118]:
# One row per graph edge (columns 'source' and 'target'), plus a constant
# 'duplicate' column set to 0.
# NOTE(review): the edges were built between rows sharing a cluster_id, so a
# label of 0 looks like a placeholder rather than the final value - confirm.
training_data_replica = nx.to_pandas_edgelist(graph).assign(duplicate=0)

In [155]:
train.sort_values('ltable_id')

Unnamed: 0,ltable_id,rtable_id,label,cluster_id_left,cluster_id_right,pair_id
204,1,1029,0,1,864,22819422#54181907
25,6,833,0,6,476,36288245#478870
247,13,13,1,13,13,35431801#45291729
287,13,1481,1,13,13,35431801#89896876
230,13,1961,0,13,1341,35431801#39883470
...,...,...,...,...,...,...
594,3840,2959,0,2432,1710,80461165#32345346
581,3843,3034,0,2432,1785,94452187#91965997
552,3857,3254,0,2432,2005,18824446#4518159
644,3880,3171,0,2432,1922,69731456#78541132


In [119]:
training_data_replica

Unnamed: 0,source,target,duplicate
0,0,2307,0
1,0,5000,0
2,0,7780,0
3,1,806,0
4,1,808,0
...,...,...,...
630504,9305,9347,0
630505,9306,9322,0
630506,9324,9337,0
630507,9333,9353,0


In [108]:
# Each connected component of the graph is treated as one cluster.
clusters = [component for component in nx.connected_components(graph)]

# Report how many clusters there are, then list the members of each one.
print(f'Number of clusters = {len(clusters)}')
for members in clusters:
    print(members)


Number of clusters = 3591
{0, 2307, 7780, 5000}
{5280, 1, 2694, 806, 808, 5001, 7212, 7190}
{2, 2179, 7144, 5002, 1653, 7063, 7736, 2685}
{2497, 3, 1194, 5003, 1707, 6636, 5936, 7604}
{5004, 6995, 4}
{771, 5, 7655, 5641, 716, 5005}
{6, 5006, 7094, 570, 991}
{1936, 7632, 5007, 7}
{8, 5008}
{9, 5009}
{2695, 10, 2698, 7597, 5010, 6841}
{7463, 11, 2734, 5011, 631, 6041, 6335}
{5012, 2476, 12}
{1930, 13, 6481, 5013, 6167}
{5014, 2774, 6398, 14}
{15, 6001, 2294, 5015, 6139}
{2147, 6599, 16, 5016, 7449, 7229}
{7370, 2575, 17, 1880, 5017, 6716, 6911}
{18, 5018}
{7616, 7112, 19, 1747, 5019, 2718}
{7424, 5020, 1908, 20}
{800, 5713, 21, 5021}
{7012, 1528, 2383, 22, 7672, 6587, 5022}
{1378, 870, 5640, 6792, 2604, 2001, 7249, 7478, 23, 5023}
{24, 1318, 5024, 2256}
{25, 5025, 748, 1949}
{1600, 26, 5026}
{27, 5027, 5950}
{6770, 2667, 28, 5028}
{5029, 7341, 376, 29, 6879}
{5030, 1324, 2384, 30, 6430}
{5031, 1067, 31}
{32, 5032}
{33, 198, 2438, 7303, 5033, 5962, 2076, 7292}
{5034, 34, 7717, 2554}
{35, 

### Training Data Preparation (Preprocessing)

### Source-aware sampling