In [9]:
# Per-modality feature directories, relative to the notebook's working directory.
# Each path keeps its trailing "/" so it can be used directly as a file-name prefix.
face_path = "../face/features/"
iris_path = "../iris/features/"
finger_path = "../fingerprint/features/"

In [2]:
import os
import pandas as pd
import numpy as np

In [4]:
def load_features(path):
    """
    Load every saved feature array found directly inside a directory.

    Parameters:
        path (str): Directory containing the feature files. A trailing path
            separator is optional.

    Returns:
        tuple: ``(embs, labels)``. ``embs`` is a list with the object returned by
        ``np.load`` for each file; ``labels`` is a list of strings built from the
        part of each file name before the first "_" with "sub" removed
        (e.g. "sub12_a.npy" -> "12"). Both lists follow sorted file-name order.
    """
    embs = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting makes the row order
    # of everything built from these lists reproducible across runs/machines.
    for file in sorted(os.listdir(path)):
        # os.path.join works whether or not `path` ends with a separator.
        full_path = os.path.join(path, file)

        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which np.load cannot read.
        if not os.path.isfile(full_path):
            continue

        emb = np.load(full_path)
        label = file.split("_")[0].replace("sub", "")

        embs.append(emb)
        labels.append(label)
    return embs, labels

In [10]:
# Face embeddings are read from a CSV, while iris and fingerprint features are
# loaded from their feature directories as (embeddings, labels) tuples.
face_embs = pd.read_csv("../face/face_embeddings.csv")
iris_embs = load_features(iris_path)
finger_embs = load_features(finger_path)

In [11]:
def embeddings_to_dataframe(embeddings, labels):
    """
    Build a DataFrame holding one embedding per row plus its label.

    Parameters:
        embeddings (array-like): 2D data of shape (n_samples, n_dimensions).
        labels (array-like): 1D data of shape (n_samples,), one label per embedding.

    Returns:
        pandas.DataFrame: Columns 'dim_0' ... 'dim_{n_dimensions - 1}' followed by 'label'.

    Raises:
        ValueError: If the number of embeddings differs from the number of labels.
    """
    emb_matrix = np.array(embeddings)
    label_vector = np.array(labels)

    # Row counts must agree before anything is assembled.
    n_rows = emb_matrix.shape[0]
    if n_rows != label_vector.shape[0]:
        raise ValueError("Number of embeddings and labels must match.")

    # One named column per embedding dimension.
    n_dims = emb_matrix.shape[1]
    column_names = ["dim_" + str(idx) for idx in range(n_dims)]
    frame = pd.DataFrame(emb_matrix, columns=column_names)

    # The label goes last.
    frame['label'] = label_vector
    return frame


In [12]:
# Each variable currently holds an (embeddings, labels) tuple; unpack it straight
# into the converter and rebind the name to the resulting DataFrame.
iris_embs = embeddings_to_dataframe(*iris_embs)
finger_embs = embeddings_to_dataframe(*finger_embs)

In [15]:
# Row counts per modality, in the order (iris, fingerprint, face).
tuple(len(frame) for frame in (iris_embs, finger_embs, face_embs))

(5000, 10000, 11995)

In [36]:
# Convert every label to a Python int so labels compare equal across modalities.
iris_embs["label"] = iris_embs["label"].apply(int)
face_embs["label"] = face_embs["label"].apply(int)
finger_embs["label"] = finger_embs["label"].apply(int)

In [37]:
# Write the per-modality frames to CSV files in the current working directory,
# without the DataFrame index.
iris_embs.to_csv("iris_embeddings.csv", index=False)    
finger_embs.to_csv("finger_embeddings.csv", index=False)
face_embs.to_csv("face_embeddings.csv", index=False)

Oversample within each subject so that paired modalities have the same number of samples, giving roughly 10k rows per pair.

In [51]:
def remove_duplicate_columns(df):
    """
    Drop columns whose name repeats an earlier column, keeping the first occurrence.

    Returns the input object itself when no column name is duplicated, otherwise
    a copy restricted to the first occurrence of each name.
    """
    repeated = df.columns.duplicated()
    # Nothing to drop: hand back the very same frame.
    if not repeated.any():
        return df
    return df.loc[:, ~repeated].copy()

In [None]:
# Get pairs
def get_pairs(df1, df2, random_state=None):
    """
    Build same-subject pairs from two embedding DataFrames.

    For every label found in ``df1['label']`` the matching rows of both frames are
    brought to the same length and placed side by side (``df1`` columns first,
    then ``df2`` columns). Column names are kept as they are, so names shared by
    both inputs (e.g. 'label') appear twice in the result.

    Only the shorter side is oversampled: every real row of both sides is kept
    and the shortfall is drawn from the shorter side with replacement. Rows of
    each side are shuffled before pairing.

    Parameters:
        df1, df2 (pandas.DataFrame): Frames that each contain a 'label' column.
        random_state (int or None): Seed for reproducible sampling. ``None``
            (default) uses NumPy's global random state.

    Returns:
        pandas.DataFrame: All per-label blocks stacked vertically. The index
        restarts at 0 for every label. An empty DataFrame is returned when no
        label can be paired.

    Notes:
        Labels that have no rows in ``df2`` cannot be paired; they are skipped and
        reported through a ``UserWarning`` instead of raising from ``sample``.
    """
    import warnings

    # A single shared generator keeps successive draws different from each other
    # while making the whole call reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    def _resize(group, target):
        # Keep every real row; draw only the missing ones with replacement.
        shortfall = target - len(group)
        if shortfall > 0:
            extra = group.sample(shortfall, replace=True, random_state=rng)
            group = pd.concat([group, extra], axis=0)
        # Shuffle so which rows get paired together is random.
        return group.sample(frac=1, random_state=rng).reset_index(drop=True)

    pieces = []
    skipped = []
    for label in df1['label'].unique():
        sub_df1 = df1[df1['label'] == label]
        sub_df2 = df2[df2['label'] == label]

        # Sampling from an empty group is impossible, so such labels are left out.
        if len(sub_df1) == 0 or len(sub_df2) == 0:
            skipped.append(label)
            continue

        num_samples = max(len(sub_df1), len(sub_df2))
        pieces.append(
            pd.concat([_resize(sub_df1, num_samples), _resize(sub_df2, num_samples)], axis=1)
        )

    if skipped:
        warnings.warn(f"get_pairs: skipped labels without samples in both frames: {skipped}")

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, axis=0)
            

In [43]:
# Number of face samples recorded for subject 0.
len(face_embs.loc[face_embs['label'].eq(0)])

30

In [44]:
# Pair iris and face embeddings label by label (labels are taken from the iris frame).
concat1 = get_pairs(iris_embs, face_embs)

0
30 10 30
100
30 10 30
101
30 10 30
102
30 10 30
103
30 10 30
104
30 10 30
105
20 10 20
106
30 10 30
107
30 10 30
108
13 10 13
109
25 10 25
10
25 10 25
110
22 10 22
111
30 10 30
112
30 10 30
113
31 10 31
114
26 10 26
115
23 10 23
116
18 10 18
117
10 10 10
118
30 10 30
119
29 10 29
11
22 10 22
120
19 10 19
121
30 10 30
122
30 10 30
123
29 10 29
124
17 10 17
125
10 10 8
126
30 10 30
127
25 10 25
128
30 10 30
129
30 10 30
12
20 10 20
130
30 10 30
131
30 10 30
132
30 10 30
133
20 10 20
134
24 10 24
135
12 10 12
136
21 10 21
137
30 10 30
138
19 10 19
139
29 10 29
13
30 10 30
140
30 10 30
141
21 10 21
142
30 10 30
143
20 10 20
144
30 10 30
145
25 10 25
146
23 10 23
147
31 10 31
148
21 10 21
149
25 10 25
14
30 10 30
150
30 10 30
151
30 10 30
152
30 10 30
153
22 10 22
154
30 10 30
155
20 10 20
156
24 10 24
157
30 10 30
158
30 10 30
159
30 10 30
15
30 10 30
160
29 10 29
161
20 10 20
162
29 10 29
163
30 10 30
164
28 10 28
165
30 10 30
166
23 10 23
167
23 10 23
168
30 10 30
169
30 10 30
16
20 10

In [46]:
# Save the iris/face pairs to CSV without the DataFrame index.
concat1.to_csv("iris_face_pairs.csv", index=False)

In [47]:
# Pair iris and fingerprint embeddings label by label, then save them to CSV.
concat2 = get_pairs(iris_embs, finger_embs)
concat2.to_csv("iris_finger_pairs.csv", index=False)

0
20 10 20
100
20 10 20
101
20 10 20
102
20 10 20
103
20 10 20
104
20 10 20
105
20 10 20
106
20 10 20
107
20 10 20
108
20 10 20
109
20 10 20
10
20 10 20
110
20 10 20
111
20 10 20
112
20 10 20
113
20 10 20
114
20 10 20
115
20 10 20
116
20 10 20
117
20 10 20
118
20 10 20
119
20 10 20
11
20 10 20
120
20 10 20
121
20 10 20
122
20 10 20
123
20 10 20
124
20 10 20
125
20 10 20
126
20 10 20
127
20 10 20
128
20 10 20
129
20 10 20
12
20 10 20
130
20 10 20
131
20 10 20
132
20 10 20
133
20 10 20
134
20 10 20
135
20 10 20
136
20 10 20
137
20 10 20
138
20 10 20
139
20 10 20
13
20 10 20
140
20 10 20
141
20 10 20
142
20 10 20
143
20 10 20
144
20 10 20
145
20 10 20
146
20 10 20
147
20 10 20
148
20 10 20
149
20 10 20
14
20 10 20
150
20 10 20
151
20 10 20
152
20 10 20
153
20 10 20
154
20 10 20
155
20 10 20
156
20 10 20
157
20 10 20
158
20 10 20
159
20 10 20
15
20 10 20
160
20 10 20
161
20 10 20
162
20 10 20
163
20 10 20
164
20 10 20
165
20 10 20
166
20 10 20
167
20 10 20
168
20 10 20
169
20 10 20
16
20 1

In [50]:
# Pair face and fingerprint embeddings label by label (labels are taken from the
# face frame), then save them to CSV.
concat3 = get_pairs(face_embs, finger_embs)
concat3.to_csv("face_finger_pairs.csv", index=False)

0
30 30 20
100
30 30 20
101
30 30 20
102
30 30 20
103
30 30 20
104
30 30 20
105
20 20 20
106
30 30 20
107
30 30 20
108
20 13 20
109
25 25 20
10
25 25 20
110
22 22 20
111
30 30 20
112
30 30 20
113
31 31 20
114
26 26 20
115
23 23 20
116
20 18 20
117
20 10 20
118
30 30 20
119
29 29 20
11
22 22 20
120
20 19 20
121
30 30 20
122
30 30 20
123
29 29 20
124
20 17 20
125
20 8 20
126
30 30 20
127
25 25 20
128
30 30 20
129
30 30 20
12
20 20 20
130
30 30 20
131
30 30 20
132
30 30 20
133
20 20 20
134
24 24 20
135
20 12 20
136
21 21 20
137
30 30 20
138
20 19 20
139
29 29 20
13
30 30 20
140
30 30 20
141
21 21 20
142
30 30 20
143
20 20 20
144
30 30 20
145
25 25 20
146
23 23 20
147
31 31 20
148
21 21 20
149
25 25 20
14
30 30 20
150
30 30 20
151
30 30 20
152
30 30 20
153
22 22 20
154
30 30 20
155
20 20 20
156
24 24 20
157
30 30 20
158
30 30 20
159
30 30 20
15
30 30 20
160
29 29 20
161
20 20 20
162
29 29 20
163
30 30 20
164
28 28 20
165
30 30 20
166
23 23 20
167
23 23 20
168
30 30 20
169
30 30 20
16
20 20

In [53]:
def get_fusion(df1, df2, df3, random_state=None):
    """
    Build same-subject triplets from three embedding DataFrames.

    For every label found in ``df1['label']`` the matching rows of the three
    frames are brought to the same length and placed side by side in the order
    ``df1``, ``df2``, ``df3``. Column names are kept as they are, so names shared
    by the inputs (e.g. 'label') appear several times in the result.

    Only the shorter groups are oversampled: every real row of each group is
    kept and the shortfall is drawn from that group with replacement. Rows of
    each group are shuffled before being combined.

    Parameters:
        df1, df2, df3 (pd.DataFrame): DataFrames that each contain a 'label' column.
        random_state (int or None): Seed for reproducible sampling. ``None``
            (default) uses NumPy's global random state.

    Returns:
        pd.DataFrame: All per-label blocks stacked vertically. The index restarts
        at 0 for every label. An empty DataFrame is returned when no label is
        present in all three inputs.

    Notes:
        Labels that have no rows in one of the frames cannot be fused; they are
        skipped and reported through a ``UserWarning`` instead of raising from
        ``sample``.
    """
    import warnings

    # A single shared generator keeps successive draws different from each other
    # while making the whole call reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    def _resize(group, target):
        # Keep every real row; draw only the missing ones with replacement.
        shortfall = target - len(group)
        if shortfall > 0:
            extra = group.sample(shortfall, replace=True, random_state=rng)
            group = pd.concat([group, extra], axis=0)
        # Shuffle so which rows get combined is random.
        return group.sample(frac=1, random_state=rng).reset_index(drop=True)

    sources = (df1, df2, df3)
    pieces = []
    skipped = []

    # Iterate over unique labels of the first frame
    for label in df1['label'].unique():
        # Filter each DataFrame for the current label
        groups = [frame[frame['label'] == label] for frame in sources]

        # Sampling from an empty group is impossible, so such labels are left out.
        if any(len(group) == 0 for group in groups):
            skipped.append(label)
            continue

        # Every group is grown to the size of the largest one
        num_samples = max(len(group) for group in groups)

        # Concatenate the resized groups horizontally
        pieces.append(pd.concat([_resize(group, num_samples) for group in groups], axis=1))

    if skipped:
        warnings.warn(f"get_fusion: skipped labels without samples in all frames: {skipped}")

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, axis=0)


In [54]:
# Combine iris, face and fingerprint embeddings label by label (labels are taken
# from the iris frame), then save the result to CSV without the DataFrame index.
fused = get_fusion(iris_embs, face_embs, finger_embs)
fused.to_csv("fused_embeddings_all.csv", index=False)

In [55]:
# Preview the first rows of the fused frame.
fused.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_1399,dim_1400,dim_1401,dim_1402,dim_1403,dim_1404,dim_1405,dim_1406,dim_1407,label
0,2.104017,-0.170356,0.116948,0.863928,0.068629,0.349573,-0.002297,-0.173247,0.11598,-0.103387,...,-0.167258,-0.167281,0.8023,-0.051888,1.410862,0.137343,-0.041643,-0.183347,0.393928,0
1,1.196912,-0.163061,0.042709,1.435415,0.10221,0.11139,0.031204,-0.09085,0.132027,-0.141371,...,0.331119,-0.173812,0.547666,-0.083266,1.676898,0.135032,-0.047371,-0.115656,0.039445,0
2,0.69637,0.075619,-0.153844,0.613134,-0.154019,1.124316,0.294839,-0.175519,-0.158292,-0.156962,...,0.274028,0.290986,1.133154,-0.025173,0.70066,0.451025,-0.068381,-0.098868,0.075244,0
3,0.484032,1.097426,-0.004922,0.596132,-0.152265,2.167861,0.04013,-0.125078,-0.046076,0.273736,...,0.331119,-0.173812,0.547666,-0.083266,1.676898,0.135032,-0.047371,-0.115656,0.039445,0
4,0.408721,-0.015454,0.016189,-0.111631,0.614916,0.924748,0.155281,-0.095092,-0.025299,-0.170658,...,0.274028,0.290986,1.133154,-0.025173,0.70066,0.451025,-0.068381,-0.098868,0.075244,0
