# About

While trying to reproduce magic #2 of the 1st place solution, I came up with a naive algorithm that cancels background noise.
This algorithm does not need to assume that a duplicated image exists in the dataset.

The process is like below:

1. For each image, apply the steps below.
1. Normalize each column.
1. For each column, find the most similar column in the same image by kNN search.
1. Replace the column with the difference between the original column and its nearest matched column (negative values are clipped to zero).

It does not perfectly eliminate the background noise, and it sometimes eliminates the signal as well, but it shows that even if no duplicated image exists in the dataset, the 1st place approach -- cleaning the background noise -- can be effective for the general task.

In [None]:
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors

from tqdm import tqdm

In [None]:
# Dataset locations, all built relative to the /kaggle root directory.
ROOT = Path('/kaggle')
INPUT = ROOT / 'input'
DATA = INPUT / 'seti-breakthrough-listen'
TRAIN = DATA / 'train'  # root used when building paths for the train labels
TEST = DATA / 'test'  # root used when building paths for the sample submission

# Load Dataset

In [None]:
def add_image_path(df: pd.DataFrame, root) -> pd.DataFrame:
    """Return a copy of ``df`` with an ``image_path`` column added.

    Each path has the form ``root / <first character of id> / <id>.npy``.

    Args:
        df: Frame with a string ``id`` column. It is left unmodified.
        root: Directory the paths are built under. A plain ``str`` or any
            path-like object is accepted.

    Returns:
        A copy of ``df`` carrying the extra ``image_path`` column.
    """
    # Coerce once so that string roots also work with the ``/`` operator.
    root = Path(root)
    out_df = df.copy()
    out_df['image_path'] = df['id'].apply(
        lambda image_id: root / image_id[0] / f'{image_id}.npy'
    )
    return out_df

In [None]:
# Read the train labels and the sample submission from the dataset directory.
train_df = pd.read_csv(DATA / 'train_labels.csv')
test_df = pd.read_csv(DATA / 'sample_submission.csv')
# Stack train rows first, then test rows, under a fresh 0..N-1 index so that a
# row position identifies a sample across both sets.
all_df = pd.concat([train_df, test_df]).reset_index(drop=True)

In [None]:
# Attach the .npy file path of every sample to both frames.
train_df = add_image_path(train_df, TRAIN)
test_df = add_image_path(test_df, TEST)

# Flat array of all paths, train first and test second -- the same order in
# which `all_df` was concatenated, so a row position in `all_df` indexes it.
image_paths = np.concatenate([
    train_df['image_path'].values,
    test_df['image_path'].values,
])

# Utility Functions

In [None]:
def normalize(x: np.ndarray) -> np.ndarray:
    """Standardize ``x`` to zero mean and unit variance over all elements.

    A constant input has zero standard deviation; dividing by it would turn
    every element into NaN. In that case the divisor is replaced by 1, so the
    result is all zeros instead.

    Args:
        x: Array to standardize.

    Returns:
        ``(x - mean) / std`` computed over the whole array.
    """
    std = x.std(keepdims=True)
    # ``std == 0`` is 1 exactly where the deviation vanishes, so adding it
    # swaps a zero divisor for 1 without changing the dtype of ``std``.
    std = std + (std == 0)
    return (x - x.mean(keepdims=True)) / std

In [None]:
def normalize_columnwise(x: np.ndarray) -> np.ndarray:
    """Standardize every column of ``x`` to zero mean and unit variance.

    Statistics are taken along axis 0. A constant column has zero standard
    deviation; dividing by it would yield a NaN column, so its divisor is
    replaced by 1 and the column becomes all zeros.

    Args:
        x: Array whose columns (axis 0 statistics) are standardized.

    Returns:
        Array of the same shape as ``x`` with each column standardized.
    """
    std = x.std(axis=0, keepdims=True)
    # ``std == 0`` is 1 exactly for constant columns, so adding it swaps a
    # zero divisor for 1 without changing the dtype of ``std``.
    std = std + (std == 0)
    return (x - x.mean(axis=0, keepdims=True)) / std

In [None]:
def clean_image(x: np.ndarray) -> np.ndarray:
    """Subtract from every column of ``x`` its best-matching column.

    The image is standardized per column, each column is looked up against
    the columns of the same image with a brute-force Euclidean 2-nearest
    neighbour search, the second-ranked match is subtracted, and negative
    residuals are clipped to zero.

    Args:
        x: 2-D image array; columns are the units being matched.

    Returns:
        The clipped residual image, same shape as ``x``.
    """
    standardized = normalize_columnwise(x)
    columns = standardized.T  # one sample per image column

    knn = NearestNeighbors(n_neighbors=2, algorithm="brute", metric="minkowski", p=2)
    knn.fit(columns)
    _, ranked = knn.kneighbors(columns)

    # Every column is part of the fitted set, so rank 0 is normally the query
    # column itself (distance 0); rank 1 is taken as the match.
    match = ranked[:, 1]
    residual = standardized - standardized[:, match]
    return np.clip(residual, 0.0, None)

In [None]:
def plot_cleaned_image(image_path):
    """Display the first slice of a saved array beside its cleaned version.

    Args:
        image_path: Location of the ``.npy`` file to load.
    """
    # Only the entry at index 0 of the loaded array is shown, cast to float32.
    raw = np.load(image_path)[0].astype('f')
    cleaned = clean_image(raw)

    fig, (left, right) = plt.subplots(1, 2, figsize=(12, 6))
    left.imshow(raw)
    right.imshow(cleaned)

    plt.show()
    plt.close()

In [None]:
def get_index_by_id_starts_with(df: pd.DataFrame, image_id: str) -> int:
    """Return the index label of the single row whose ``id`` starts with ``image_id``.

    Args:
        df: Frame with a string ``id`` column.
        image_id: Prefix to look for.

    Returns:
        The index label of the one matching row.

    Raises:
        ValueError: If the prefix matches no row or more than one row.
    """
    has_prefix = df['id'].str.startswith(image_id)
    matches = df.loc[has_prefix]
    # ``item()`` only succeeds when exactly one row matched.
    return matches.index.item()

# Positive Example

In [None]:
plot_cleaned_image(image_paths[594])

In [None]:
plot_cleaned_image(image_paths[19798])

In [None]:
plot_cleaned_image(image_paths[23742])

In [None]:
plot_cleaned_image(image_paths[59038])

In [None]:
plot_cleaned_image(image_paths[44])

In [None]:
plot_cleaned_image(image_paths[36])

In [None]:
plot_cleaned_image(image_paths[61])

In [None]:
plot_cleaned_image(image_paths[98])

In [None]:
plot_cleaned_image(image_paths[109])

In [None]:
plot_cleaned_image(image_paths[195])

In [None]:
plot_cleaned_image(image_paths[33930])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, '38e4f')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, '9928b0')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, 'd74b3a')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, '04d80')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, '53f6a20')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, '896a08')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, '779062')])

# Negative Example

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, 'b49b6')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, 'd8664')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, '00865')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, '81b958')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, '41a48')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, 'd8ca4')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, '66c7f97')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, 'a325f3')])

# Test Data

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, 'bd3bb')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, 'afffc46a')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, 'bf9667')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, 'e1b81')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, 'b9e2c2')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, 'a9520d')])

In [None]:
plot_cleaned_image(image_paths[get_index_by_id_starts_with(all_df, '7a943b44')])