<a href="https://colab.research.google.com/github/shashi3876/kaggle/blob/main/BeyondVisibleSpectrum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import glob
import os

In [None]:
folder = 'ot/ot/'
npy_data = {}

# Load every .npy file found directly inside `folder`, keyed by its file
# name without the extension.
npy_pattern = os.path.join(folder, '*.npy')
for npy_path in glob.glob(npy_pattern):
    key, _ext = os.path.splitext(os.path.basename(npy_path))
    try:
        loaded = np.load(npy_path)
    except Exception as e:
        # Best-effort load: report the file that failed and keep going.
        print(f"Skipping {key} due to error: {e}")
    else:
        npy_data[key] = loaded

# Optional: show which keys were loaded
print(npy_data.keys())


In [None]:
expected_shape = (128, 128, 125)

# Report every loaded array whose shape differs from the expected one.
for key, arr in npy_data.items():
    if arr.shape == expected_shape:
        continue
    print(f"File: {key}.npy --> Shape: {arr.shape}")

# PCA

In [None]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Only arrays with exactly this shape are used downstream
expected_shape = (128, 128, 125)

# Step 1: Keep the loaded entries that are ndarrays of the expected shape
valid_npy_data = {}
for name, volume in npy_data.items():
    if isinstance(volume, np.ndarray) and volume.shape == expected_shape:
        valid_npy_data[name] = volume

# Step 2: Derive the lookup key for each row by dropping '.npy' from its id
for frame in (train, test):
    frame['file_key'] = frame['id'].str.replace('.npy', '', regex=False)

# Step 3: Drop rows whose key has no valid array
train_has_volume = train['file_key'].isin(valid_npy_data)
test_has_volume = test['file_key'].isin(valid_npy_data)
train_clean = train.loc[train_has_volume].copy()
test_clean = test.loc[test_has_volume].copy()

# Step 4: Stack the training volumes (in row order) and collect their labels
train_volumes = [valid_npy_data[fid] for fid in train_clean['file_key']]
X_train = np.stack(train_volumes)
y_train = train_clean['label'].values

# Step 5: Stack the test volumes (in row order) and remember their ids
test_volumes = [valid_npy_data[fid] for fid in test_clean['file_key']]
X_test = np.stack(test_volumes)
test_ids = test_clean['id'].values

# Step 6: One-column frame holding the ids of the test rows that were kept
test_df = pd.DataFrame({'id': test_ids})

from skimage.transform import resize

# Shape every volume is resampled to before modelling
target_shape = (64, 64, 64)


def resize_volume(volume, target_shape):
    """Return ``volume`` resampled to ``target_shape``.

    Delegates to ``skimage.transform.resize`` with ``mode='constant'`` and
    ``preserve_range=True``.
    """
    resized = resize(volume, target_shape, mode='constant', preserve_range=True)
    return resized


# Resize each training and test volume, one at a time
X_train_resized = np.array([resize_volume(volume, target_shape) for volume in X_train])
X_test_resized = np.array([resize_volume(volume, target_shape) for volume in X_test])

# Append a trailing channel axis and divide the values by 255
X_train_cnn = np.expand_dims(X_train_resized, axis=-1) / 255.0
X_test_cnn = np.expand_dims(X_test_resized, axis=-1) / 255.0

from sklearn.decomposition import PCA

# One row per training sample, all remaining axes collapsed into features
X_train_flat = X_train_cnn.reshape(len(X_train_cnn), -1)

# Fit an unrestricted PCA so the explained-variance ratios can be inspected
pca = PCA().fit(X_train_flat)

# Running total of the explained-variance ratios, component by component
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# Index of the first component at which the running total reaches each
# threshold, converted to a 1-based component count
n_components_90 = (cumulative_variance >= 0.90).argmax() + 1
n_components_95 = (cumulative_variance >= 0.95).argmax() + 1
n_components_99 = (cumulative_variance >= 0.99).argmax() + 1

print(f"Components for 90% variance: {n_components_90}")
print(f"Components for 95% variance: {n_components_95}")
print(f"Components for 99% variance: {n_components_99}")

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd

# Baseline model: flatten every sample, compress it with PCA, fit a Ridge
# regressor on the training labels, and write the test predictions to CSV.

# Step 1: Flatten each sample into a single feature row
X_train_flat = X_train_cnn.reshape(X_train_cnn.shape[0], -1)
X_test_flat = X_test_cnn.reshape(X_test_cnn.shape[0], -1)

# Step 2: Apply PCA with up to 337 components.
# PCA raises ValueError when n_components exceeds min(n_samples, n_features),
# which happens if fewer than 337 training rows survive the shape filter, so
# the requested count is clamped to what the data can support.
n_pca_components = min(337, *X_train_flat.shape)
pca = PCA(n_components=n_pca_components)
X_train_pca = pca.fit_transform(X_train_flat)
# The test set is projected with the components learned from the training set
X_test_pca = pca.transform(X_test_flat)

# Step 3: Fit Ridge regression (default regularisation) on the PCA features
reg = Ridge()
reg.fit(X_train_pca, y_train)

# Step 4: Predict on test set
y_pred_pca = reg.predict(X_test_pca)

# Step 5: Format submission; predictions are clipped into the range [0, 100]
submission_pca = pd.DataFrame({'id': test_df['id'], 'label': y_pred_pca})
submission_pca['label'] = submission_pca['label'].clip(0, 100)
submission_pca.to_csv('submission_pca.csv', index=False)

# Compare ids as strings so the membership test is type-safe
submission_ids = submission_pca['id'].astype(str)
test_ids = test['id'].astype(str)

# Test ids that received no prediction (absent from the submission)
already_predicted = test_ids.isin(submission_ids)
missing_ids = test_ids[~already_predicted]

# Give every missing id the constant fallback label 49
new_rows = pd.DataFrame({'id': missing_ids, 'label': 49})

# Append the fallback rows to submission_pca, order the result by id, and
# rename the id column to 'ID'
submission_pca_extended = (
    pd.concat([submission_pca, new_rows], ignore_index=True)
    .sort_values('id')
    .reset_index(drop=True)
    .rename(columns={'id': 'ID'})
)

# Written to the same path as the earlier partial submission, replacing it
submission_pca_extended.to_csv('submission_pca.csv', index=False)