In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, concatenate, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import euclidean_distances
import tensorflow as tf
import matplotlib.pyplot as plt

# =========================================================
# 1. Data loading (every row is read; nothing is filtered here)
# =========================================================
# Both CSVs are parsed with header=None, so column names are assigned
# positionally: two label columns, the anchor id, the TOA value, and then
# CSI_1 .. CSI_128.
train_path = '/content/drive/MyDrive/Toa_sort.csv'
test_path = '/content/drive/MyDrive/Toa_test_sort.csv'

csi_cols = [f'CSI_{i}' for i in range(1, 129)]
columns = ['re_x', 're_y', 'anchor_id', 'TOA', *csi_cols]

print("Loading Data...")
# Train is read first, then test; missing cells become 0 right after parsing.
df_train, df_test = (
    pd.read_csv(csv_path, header=None, names=columns).fillna(0)
    for csv_path in (train_path, test_path)
)

# [Check] Report the total number of rows that were loaded.
print(f"Total Train Rows: {len(df_train)}")
print(f"Total Test Rows:  {len(df_test)}")

# =========================================================
# 2. Data structuring (reshape only -- no rows are dropped)
# =========================================================
def create_dataset_no_drop(df, *, group_size=8, csi_columns=None):
    """Group consecutive rows into fixed-size samples without discarding any.

    Every run of ``group_size`` consecutive rows of ``df`` becomes one sample
    (per the original author's notes, one row per anchor). Rows are grouped
    purely by position; ``anchor_id`` is not inspected, so the frame must
    already be ordered so that each group is contiguous.

    If the row count is not a multiple of ``group_size``, the final row is
    repeated until it is (edge padding, not zero padding) so that the trailing
    partial group is kept instead of being thrown away.

    Args:
        df: Frame containing ``'re_x'``, ``'re_y'``, ``'TOA'`` and the CSI
            columns.
        group_size: Number of consecutive rows that form one sample.
        csi_columns: Names of the CSI feature columns. Defaults to the
            module-level ``csi_cols`` list.

    Returns:
        A tuple ``(X_csi, X_toa, y_pos)`` where

        * ``X_csi`` has shape ``(n_groups, group_size, n_csi, 1)``,
        * ``X_toa`` has shape ``(n_groups, group_size)`` and holds each TOA
          minus the mean TOA of its own group,
        * ``y_pos`` has shape ``(n_groups, 2)`` and holds the
          ``(re_x, re_y)`` of the last row of each group.

        An empty ``df`` yields arrays with ``n_groups == 0``.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    group_size = int(group_size)
    if group_size < 1:
        raise ValueError(f"group_size must be >= 1, got {group_size}")

    if csi_columns is None:
        csi_columns = csi_cols  # module-level default column list
    csi_columns = list(csi_columns)

    Xv_csi = df[csi_columns].values
    Xv_toa = df['TOA'].values
    yv = df[['re_x', 're_y']].values

    # Pad by repeating the last row so that the trailing partial group is
    # preserved rather than dropped.
    remainder = len(df) % group_size
    if remainder != 0:
        pad_len = group_size - remainder
        Xv_csi = np.pad(Xv_csi, ((0, pad_len), (0, 0)), mode='edge')
        Xv_toa = np.pad(Xv_toa, (0, pad_len), mode='edge')
        yv = np.pad(yv, ((0, pad_len), (0, 0)), mode='edge')
        print(f"Warning: Data length not divisible by {group_size}. Padded {pad_len} rows to use ALL data.")

    # An explicit group count (instead of -1) keeps the reshapes well defined
    # even when there are zero rows.
    n_groups = len(Xv_toa) // group_size

    # 1. CSI: trailing singleton channel axis so a 2D CNN can treat each
    #    group as a single-channel image.
    X_csi = Xv_csi.reshape(n_groups, group_size, len(csi_columns), 1)

    # 2. TOA: one vector per group, converted to relative values by
    #    subtracting the group's own mean. A plain mean is used on the
    #    author's assumption that TOA contains no zero placeholders.
    X_toa = Xv_toa.reshape(n_groups, group_size)
    X_toa = X_toa - np.mean(X_toa, axis=1, keepdims=True)

    # 3. Label: the rows of a group are treated as sharing one position, so
    #    the last row's coordinates are taken as the group's target.
    y_pos = yv.reshape(n_groups, group_size, 2)[:, -1, :]

    return X_csi, X_toa, y_pos

# Build the grouped arrays for both splits (train first, then test) with the
# same routine so they share identical preprocessing.
(X_train_csi, X_train_toa, y_train), (X_test_csi, X_test_toa, y_test) = (
    create_dataset_no_drop(frame) for frame in (df_train, df_test)
)

# Summary: number of groups per split and the row count that corresponds to.
print(f"\n[Processing Complete]")
print(f"Train Samples (Groups): {len(X_train_csi)} (= {len(X_train_csi)*8} rows)")
print(f"Test Samples (Groups):  {len(X_test_csi)} (= {len(X_test_csi)*8} rows)")

# =========================================================
# 3. Normalization
# =========================================================
# Author's note: the CSI values are extremely spread out (max around 2.6
# trillion versus a median around 7 million). Dividing by the max alone would
# squash almost everything to ~0 and stall training, so the values are
# log-compressed first and only then scaled.
def _signed_log1p(values):
    """Return ``sign(v) * log1p(|v|)`` element-wise.

    Identical to ``np.log1p(v)`` for ``v >= 0``, but stays finite for negative
    inputs, where plain ``log1p`` yields NaN (v < -1) or -inf (v == -1).
    """
    return np.sign(values) * np.log1p(np.abs(values))


X_train_csi = _signed_log1p(X_train_csi)
X_test_csi = _signed_log1p(X_test_csi)

# CSI uses one global scale factor taken from the training split only, so the
# relative magnitudes between anchors are preserved and the test split is
# scaled with training statistics.
max_val = np.max(np.abs(X_train_csi))
if max_val == 0:
    max_val = 1.0  # all-zero training data: avoid dividing by zero

X_train_csi_s = X_train_csi / max_val
X_test_csi_s = X_test_csi / max_val

# [TOA scaling]
# StandardScaler standardises each column independently. The author wants the
# magnitude differences between anchors to survive scaling, so all TOA values
# are flattened into a single column, scaled with one shared mean/variance,
# and reshaped back to their original layout.
scaler_toa = StandardScaler()
X_train_toa_s = scaler_toa.fit_transform(X_train_toa.reshape(-1, 1)).reshape(X_train_toa.shape)
X_test_toa_s = scaler_toa.transform(X_test_toa.reshape(-1, 1)).reshape(X_test_toa.shape)

# Only the training labels are standardised here.
scaler_y = StandardScaler()
y_train_s = scaler_y.fit_transform(y_train)

# =========================================================
# 4. Model definition (2D CNN + late fusion + FSL embedding)
# =========================================================
# Branch A: CSI, consumed by a 2D CNN as an (8, 128, 1) single-channel image.
input_csi = Input(shape=(8, 128, 1), name='input_csi')
csi_branch = input_csi
# Two conv stages with the same layout (conv -> batch norm -> 2x2 max pool ->
# dropout) that differ only in their filter count.
for n_filters in (32, 64):
    csi_branch = Conv2D(n_filters, (3, 3), padding='same', activation='relu')(csi_branch)
    csi_branch = BatchNormalization()(csi_branch)
    csi_branch = MaxPooling2D((2, 2))(csi_branch)
    csi_branch = Dropout(0.2)(csi_branch)

# Last conv stage: no pooling or dropout, then flatten into a dense summary.
csi_branch = Conv2D(128, (3, 3), padding='same', activation='relu')(csi_branch)
csi_branch = BatchNormalization()(csi_branch)
csi_branch = Flatten()(csi_branch)
csi_branch = Dense(128, activation='relu')(csi_branch)

# Branch B: TOA, consumed by a small dense stack as an (8,) vector.
input_toa = Input(shape=(8,), name='input_toa')
toa_branch = Dense(32, activation='relu')(input_toa)
toa_branch = BatchNormalization()(toa_branch)
toa_branch = Dense(32, activation='relu')(toa_branch)

# Late fusion: the two branches only meet after each has been encoded.
combined = concatenate([csi_branch, toa_branch])

# FSL embedding: a 128-d vector that is L2-normalised along the feature axis.
# The 'embedding_norm' layer is looked up by name at inference time.
fc = Dense(128, activation='relu')(combined)
embedding_raw = Dense(128, name='embedding_raw')(fc)
embedding_norm = tf.keras.layers.Lambda(
    lambda emb: tf.math.l2_normalize(emb, axis=1),
    name='embedding_norm',
)(embedding_raw)

# Training head: regress the two target values from the normalised embedding.
output = Dense(2, name='output')(embedding_norm)

model = Model(inputs=[input_csi, input_toa], outputs=output)
model.compile(optimizer=Adam(0.001), loss='mse', metrics=['mae'])

# =========================================================
# 5. Model training
# =========================================================
# Hold out a random 20% of the training groups for validation. All three
# arrays go through a single call so their rows stay aligned.
(
    X_tr_csi, X_val_csi,
    X_tr_toa, X_val_toa,
    y_tr, y_val,
) = train_test_split(
    X_train_csi_s,
    X_train_toa_s,
    y_train_s,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

print("\n[Training Feature Extractor...]")
# Inputs are passed in the same order as the model's inputs: CSI, then TOA.
history = model.fit(
    [X_tr_csi, X_tr_toa],
    y_tr,
    epochs=30,
    batch_size=32,
    validation_data=([X_val_csi, X_val_toa], y_val),
    verbose=1,
)

# =========================================================
# 6. FSL inference (evaluated on the whole test set)
# =========================================================
print("\n[Running FSL Inference on ALL Test Data]")

# Reuse the trained network up to the L2-normalised embedding layer.
emb_model = Model(inputs=model.input, outputs=model.get_layer('embedding_norm').output)

# Support set: 100 randomly chosen test groups whose true coordinates are
# treated as known. Each support entry is a (CSI, TOA) pair.
n_support = 100
total_samples = len(X_test_csi_s)
if total_samples <= n_support:
    # Fail early with a clear message instead of an opaque error from
    # np.random.choice or from predicting/scoring an empty query set.
    raise ValueError(
        f"FSL evaluation needs more than {n_support} test groups "
        f"(got {total_samples}): {n_support} are reserved as the support set "
        "and at least one must remain as a query."
    )
all_indices = np.arange(total_samples)

np.random.seed(42)  # fixed seed so the support selection is reproducible
support_indices = np.random.choice(all_indices, n_support, replace=False)

# Everything that is not in the support set becomes the query set.
query_indices = np.setdiff1d(all_indices, support_indices)

print(f"Support Set Used: {len(support_indices)}")
print(f"Query Set Used:   {len(query_indices)} (Using ALL remaining data)")

# Split the scaled inputs; coordinates stay in their original (unscaled)
# units, so predictions come out directly in those units.
X_sup_csi = X_test_csi_s[support_indices]
X_sup_toa = X_test_toa_s[support_indices]
y_sup_real = y_test[support_indices]

X_qry_csi = X_test_csi_s[query_indices]
X_qry_toa = X_test_toa_s[query_indices]
y_qry_real = y_test[query_indices]

# Embed the support and query sets with the same network.
print("Extracting embeddings...")
sup_emb = emb_model.predict([X_sup_csi, X_sup_toa], verbose=0)
qry_emb = emb_model.predict([X_qry_csi, X_qry_toa], verbose=1)

# Pairwise distances: one row per query, one column per support entry.
dists = euclidean_distances(qry_emb, sup_emb)

# Softmax over negative distances. Shifting each row by its own maximum
# leaves the weights mathematically unchanged but guarantees that at least
# one exponent is exp(0) = 1, so the normaliser can never underflow to zero
# (which would give 0/0 = NaN at low temperatures).
temperature = 0.2
logits = -dists / temperature
logits -= np.max(logits, axis=1, keepdims=True)
weights = np.exp(logits)
weights /= np.sum(weights, axis=1, keepdims=True)

# Prediction: weighted average of the support coordinates.
pred_fsl = np.dot(weights, y_sup_real)

# Evaluation.
# NOTE(review): mean_squared_error averages over both coordinate columns by
# default, so this is a per-coordinate RMSE, not the RMS of the Euclidean
# position error (which would be sqrt(2) times larger) -- confirm which one
# is meant to be reported.
rmse = np.sqrt(mean_squared_error(y_qry_real, pred_fsl))
print(f"\n========================================================")
print(f"Final RMSE: {rmse:.4f} meters")
print(f"========================================================")

# Visualisation: overlay the true query positions and the FSL predictions.
plt.figure(figsize=(10, 8))
for coords, colour, series_name in (
    (y_qry_real, 'blue', 'Actual'),
    (pred_fsl, 'orange', 'Predicted'),
):
    # Column 0 is plotted on the horizontal axis, column 1 on the vertical.
    plt.scatter(coords[:, 0], coords[:, 1], c=colour, label=series_name, alpha=0.5)
plt.xlabel('X')
plt.ylabel('Y')
plt.title(f'FSL Result (All Rows Used)')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Scatter the actual and predicted positions on one figure for comparison.
# NOTE(review): `actual_original` and `predictions_original` are not defined
# in any visible cell; presumably they come from an earlier notebook session.
# Confirm, otherwise this cell raises NameError on a fresh run.
plt.figure(figsize=(10, 8))
for coords, series_name in (
    (actual_original, 'Actual'),
    (predictions_original, 'Predicted'),
):
    # Column 0 is plotted on the horizontal axis, column 1 on the vertical.
    plt.scatter(coords[:, 0], coords[:, 1], label=series_name, alpha=0.5)
plt.xlabel('X Position')
plt.ylabel('Y Position')
plt.title('Actual vs Predicted Positions')
plt.legend()
plt.show()