In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master/')

import numpy as np
import pandas as pd
import random
import math
import itertools

import cv2
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
import albumentations as A
from albumentations.pytorch import ToTensorV2

from sklearn import metrics
from tqdm.notebook import tqdm

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA
from sklearn.neighbors import NearestNeighbors

import timm
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms

In [None]:
def getpath(x, mode='train'):
    """Return the path of the ``.npy`` file that stores sample ``x``.

    Files are grouped in sub-folders named after the first character of the
    sample id. Only ``mode == 'train'`` selects the train folder; every other
    value of ``mode`` resolves to the test folder.
    """
    if mode != 'train':
        return f'../input/seti-breakthrough-listen/test/{x[0]}/{x}.npy'
    return f'../input/seti-breakthrough-listen/train/{x[0]}/{x}.npy'

# Label table for the training split and the submission template for the test split.
train_df = pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')
test_df = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')

# Resolve every sample id to the location of its .npy file on disk.
train_df['filepath'] = train_df['id'].map(lambda sample_id: getpath(sample_id, 'train'))
test_df['filepath'] = test_df['id'].map(lambda sample_id: getpath(sample_id, 'test'))

In [None]:
class setiDataset(Dataset):
    """Dataset over the ``.npy`` samples listed in a DataFrame.

    Parameters
    ----------
    df : DataFrame with a ``filepath`` column (and a ``target`` column when
        ``isVisual`` is true).
    pesudo_targets : optional sequence (list, tuple or NumPy array) holding one
        integer label per row of ``df``. When it is ``None`` or empty, every
        sample gets the label ``0``.
    isVisual : when true, ``__getitem__`` returns the un-resized NumPy image
        together with ``df.target`` instead of a tensor and a pseudo label.
    """
    def __init__(self, df, pesudo_targets=None, isVisual = False):
        self.df = df
        # An explicit None/empty test is required here: `not pesudo_targets`
        # raises ValueError for a NumPy array with more than one element
        # (e.g. the raw output of `kmeans.fit_predict`).
        if pesudo_targets is None or len(pesudo_targets) == 0:
            pesudo_targets = [0]*len(self.df)
        self.pesudo_targets = pesudo_targets

        self.isVisual = isVisual

    def fileinfo(self, idx):
        """Return the file path stored at positional row ``idx``."""
        return self.df.filepath.iloc[idx]

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns ``(image, target)`` with a 2-D NumPy image in visual mode,
        otherwise ``(tensor, label)`` where ``label`` is an int64 tensor.
        """
        filepath = self.fileinfo(idx)
        image = np.load(filepath).astype('float')
        # Keep entries 0, 2 and 4 of the loaded array, stack them vertically and
        # transpose the result into a single 2-D image.
        # NOTE(review): presumably these are the on-target observations of the
        # cadence - confirm against the dataset description.
        image = np.vstack([image[0], image[2], image[4]]).transpose()

        if self.isVisual:
            target = self.df.target.iloc[idx]
            return image, target

        # Shrink to a fixed 64x64 input (bicubic) and convert to a torch tensor.
        image = cv2.resize(image, (64, 64), interpolation=cv2.INTER_CUBIC)
        image = ToTensorV2()(image=image)['image']
        pesudo_target = self.pesudo_targets[idx]

        return image, torch.tensor(pesudo_target, dtype=torch.int64)

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return len(self.df)

In [None]:
class models(nn.Module):
    """timm backbone followed by a linear head with one output per k-means cluster.

    The backbone is created with a single input channel and every ``nn.ReLU``
    inside it is swapped for ``nn.SiLU``. The head is ``nn.Linear(1000, kmeans_clusters)``.
    NOTE(review): the 1000 presumably matches the backbone's default classifier
    width - confirm when using a different ``model_name``.
    """
    def __init__(self,model_name = 'mobilenetv3_large_100', pretrained = False, kmeans_clusters = 100):
        super().__init__()
        print(f'Model: {model_name}')
        backbone = timm.create_model(model_name, pretrained=pretrained, in_chans=1)
        self.replace_relu_to_silu(backbone)
        self.extract = backbone
        self.myfc = nn.Linear(1000, kmeans_clusters)

    def replace_relu_to_silu(self, model):
        """Recursively replace each ``nn.ReLU`` child of ``model`` with an in-place ``nn.SiLU``."""
        for name, module in list(model.named_children()):
            if not isinstance(module, nn.ReLU):
                # Not an activation we care about: look inside it instead.
                self.replace_relu_to_silu(module)
                continue
            setattr(model, name, nn.SiLU(inplace=True))

    def forward(self, x):
        """Backbone output -> tanh -> linear cluster head."""
        squashed = torch.tanh(self.extract(x))
        return self.myfc(squashed)

In [None]:
class LabelSmoothing(nn.Module):
    """Negative log-likelihood on logits, blended with a uniform smoothing term.
    """
    def __init__(self, smoothing=0.0):
        """Store the smoothing factor and its complement.
        :param smoothing: label smoothing factor
        """
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, x, target):
        """Return the batch-mean smoothed loss.
        :param x: logits, classes on the last dimension
        :param target: integer class index per sample
        """
        log_probs = F.log_softmax(x, dim=-1)
        # Log-probability of the labelled class for every sample.
        picked = log_probs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
        per_sample = self.confidence * (-picked) + self.smoothing * (-log_probs.mean(dim=-1))
        return per_sample.mean()

In [None]:
def feature_evaluate(data_loader, model, device):
    """Run ``model`` in eval mode over ``data_loader`` and collect its outputs.

    The labels yielded by the loader are ignored. Returns a flat Python list
    with one entry (the model output converted via ``tolist``) per sample, in
    loader order.
    """
    model.eval()

    collected = []
    progress = tqdm(data_loader, position=0, leave=True, desc='Pesudo Targeting')
    with torch.no_grad():
        for images, _ in progress:
            batch_out = model(images.to(device, dtype=torch.float))
            collected += batch_out.detach().cpu().numpy().tolist()

    return collected

In [None]:
def train(data_loader, model, optimizer, criterion, device):
    """Run one optimisation pass over ``data_loader``.

    Puts ``model`` in train mode, performs a zero_grad / forward / backward /
    step cycle per batch, and shows the running mean loss in the progress bar.
    Nothing is returned.
    """
    model.train()

    running_loss = 0
    progress = tqdm(data_loader, position=0, leave=True, desc='Training')
    for step, (images, targets) in enumerate(progress, start=1):
        images = images.to(device, dtype=torch.float)
        targets = targets.to(device, dtype=torch.int64)

        optimizer.zero_grad()
        loss = criterion(model(images), targets)
        loss.backward()

        # Mean of the batch losses seen so far in this pass.
        running_loss += loss.item()
        progress.set_description(f'Training, loss: {running_loss/step}')
        optimizer.step()

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print("GPU is available" if device.type == "cuda" else "GPU not available, CPU used")

In [None]:
# --- Configuration ----------------------------------------------------------
Batch_Size = 64
# Number of k-means clusters. The model head must have exactly this many
# outputs because cluster ids are used as classification targets.
N_CLUSTERS = 100

criterion = LabelSmoothing(smoothing = 0.1)
pca = IncrementalPCA(n_components=50, whiten=True)
kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, batch_size=4096, init_size=300)

# Train and test rows are clustered together; labels are never used for training.
df = pd.concat([train_df,test_df]).reset_index(drop=True)

model = models(pretrained = True, kmeans_clusters=N_CLUSTERS)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Split row *positions* rather than the DataFrame itself: `np.array_split` on a
# DataFrame goes through `DataFrame.swapaxes`, which is deprecated in recent
# pandas. Slicing with `.iloc` yields the same five contiguous chunks.
chunk_positions = np.array_split(np.arange(len(df)), 5)

for epoch in range(5):
    for positions in tqdm(chunk_positions):
        trn_df = df.iloc[positions]

        # 1) Embed the chunk with the current model (no shuffling, so the
        #    feature order matches the row order of `trn_df`).
        cluster_dataset = setiDataset(trn_df)
        cluster_loader = DataLoader(cluster_dataset, batch_size=Batch_Size, shuffle=False)
        features = feature_evaluate(cluster_loader, model, device)

        # 2) Reduce with PCA and cluster; the cluster ids become pseudo labels.
        #    Both estimators are re-fit from scratch on every chunk.
        # NOTE(review): cluster ids from a fresh k-means fit are arbitrary, yet
        # the classification head is not reset between fits - confirm this is
        # intended.
        reduced = pca.fit_transform(features)
        pesudo_targets = list(kmeans.fit_predict(reduced))

        # 3) Train the model to predict those pseudo labels.
        train_dataset = setiDataset(trn_df, pesudo_targets=pesudo_targets)
        train_loader = DataLoader(train_dataset, batch_size=Batch_Size, shuffle=True)
        for sub_epoch in range(3):
            train(train_loader, model, optimizer, criterion, device)

In [None]:
# Embed every row of `df` with the trained model, then re-fit PCA and k-means on
# the full table to obtain one final cluster id per sample.
# (seaborn is imported with the other dependencies in the first cell.)
cluster_dataset = setiDataset(df)
cluster_loader= torch.utils.data.DataLoader(cluster_dataset, batch_size=Batch_Size, shuffle=False)

features = feature_evaluate(cluster_loader, model, device)
reduced = pca.fit_transform(features)
pesudo_targets = list(kmeans.fit_predict(reduced))

# The labels are written back into `df` row by row, so the lengths must agree.
assert len(pesudo_targets) == len(df), "expected one cluster id per row of df"


In [None]:
# Attach the cluster id computed above to every row (adds a column to `df` in place).
df['cluster'] = pesudo_targets

# Stacked histogram of cluster sizes, coloured by the `target` column.
# NOTE(review): rows that come from sample_submission.csv carry whatever `target`
# that file holds - presumably a placeholder, verify before interpreting the hues.
sns.histplot(data=df, x='cluster', hue='target', multiple="stack")
df.head()

In [None]:
def show_cluster(cluster, df, limit=16):
    """Plot up to ``limit`` samples of ``df`` whose ``cluster`` column equals ``cluster``.

    Parameters
    ----------
    cluster : cluster id to display.
    df : DataFrame with ``cluster``, ``filepath`` and ``target`` columns.
    limit : maximum number of samples to draw (default 16, a 4x4 grid).

    Clusters with fewer than ``limit`` members show only the samples they
    have; an empty cluster prints a short notice and draws nothing.
    """
    tmp_df = df[df.cluster == cluster].reset_index(drop=True)
    dataset = setiDataset(tmp_df, isVisual=True)

    # Never index past the end of a small cluster.
    n_show = min(limit, len(dataset))
    if n_show <= 0:
        print(f'cluster: {cluster} has no samples to show')
        return

    # Four columns; at least four rows so the default layout stays 4x4, and
    # more rows when `limit` exceeds 16.
    n_cols = 4
    n_rows = max(4, (n_show + n_cols - 1) // n_cols)

    fig = plt.figure(figsize=(15, 10))
    for i in range(n_show):
        image, _ = dataset[i]
        ax = fig.add_subplot(n_rows, n_cols, i+1)
        ax.imshow(image, aspect='auto')
        # Hide ticks on every panel (a single trailing plt.axis('off') would
        # only affect the last subplot).
        ax.axis('off')
    fig.suptitle(f'cluster: {cluster}',fontsize=20)
    plt.show()

In [None]:
# Draw a sample grid for every cluster id. The count is read from the fitted
# k-means estimator so it cannot drift from the clustering configuration.
for i in range(kmeans.n_clusters):
    show_cluster(i, df)