In [1]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
[0m

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.neighbors import KNeighborsRegressor

import faiss

import os
import gc
import torch
from tqdm import tqdm

In [3]:
# Select the CUDA device(s) this process may use, then display how many GPUs
# faiss reports (the cell's output).
# NOTE(review): the environment variable presumably has to be set before the
# first CUDA call in this process to have any effect — confirm when reordering cells.
gpu_ids = "0"  # can be e.g. "3,4" for multiple GPUs 
os.environ['CUDA_VISIBLE_DEVICES'] = gpu_ids
faiss.get_num_gpus()

1

In [4]:
class FaissKNeighbors:
    """Distance-weighted k-nearest-neighbour regressor on top of a faiss flat L2 index.

    ``fit`` adds every training vector to a ``faiss.IndexFlatL2`` that is
    copied to all visible GPUs; ``predict`` returns, for each query row, the
    weighted mean of the targets of its ``k`` nearest training vectors.

    Attributes:
        k: number of neighbours retrieved per query.
        y: training targets as a NumPy array (``None`` until ``fit`` is called).
        cpu_index: the CPU-side faiss index created by ``fit``.
        gpu_index: the GPU copy of ``cpu_index`` that is actually searched.
        index: alias of ``gpu_index`` (``None`` until ``fit`` is called).
    """

    def __init__(self, k=5):
        self.index = None
        self.cpu_index = None
        self.gpu_index = None
        self.y = None
        self.k = k

    def fit(self, X, y):
        """Index the training vectors and remember their targets.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: array-like whose first dimension has length n_samples.

        Raises:
            ValueError: if ``X`` is not 2-D or ``y`` does not have one entry per row of ``X``.
        """
        # No copy is made when X is already C-contiguous float32 (astype always copies).
        X = np.ascontiguousarray(X, dtype=np.float32)
        # Store targets as an ndarray so predict() can fancy-index them with a
        # 2-D id matrix even when a list or pandas Series was passed in.
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
        if y.ndim < 1 or y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X and y disagree on the number of samples: {X.shape[0]} vs {y.shape[:1]}"
            )

        # Alternative metric that was tried: faiss.IndexFlat(X.shape[1], faiss.METRIC_L1)
        self.cpu_index = faiss.IndexFlatL2(X.shape[1])
        self.gpu_index = faiss.index_cpu_to_all_gpus(self.cpu_index)
        self.gpu_index.add(X)
        self.index = self.gpu_index

        self.y = y

    def predict(self, X):
        """Return the weighted mean target of the ``k`` nearest neighbours of each query.

        Args:
            X: 2-D array-like of query vectors, shape (n_queries, n_features).

        Returns:
            Array with one prediction per query row (shape (n_queries,) for 1-D
            targets). A row for which the index returned no valid neighbour at
            all evaluates to NaN.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if getattr(self, "gpu_index", None) is None or self.y is None:
            raise RuntimeError("FaissKNeighbors.predict() called before fit()")

        queries = np.ascontiguousarray(X, dtype=np.float32)
        distances, indices = self.gpu_index.search(queries, k=self.k)

        # Do the weight arithmetic in float64: squaring large float32 distances
        # (e.g. the sentinel used to pad missing results) overflows to inf.
        distances = np.asarray(distances, dtype=np.float64)
        indices = np.asarray(indices)

        # faiss pads the result with id -1 when fewer than k vectors are
        # available; such slots must not contribute (and must not index y[-1]).
        valid = indices >= 0
        neighbour_targets = np.asarray(self.y)[np.where(valid, indices, 0)]

        # NOTE(review): IndexFlatL2 is documented to return *squared* L2
        # distances, so squaring again makes the weights fall off as 1/d^4.
        # Kept unchanged to preserve existing predictions — confirm it is intended.
        # The 1e-8 term keeps an exact match (distance 0) from dividing by zero.
        weights = np.where(valid, 1.0 / (np.square(distances) + 1e-8), 0.0)

        return np.sum(weights * neighbour_targets, axis=1) / np.sum(weights, axis=1)

In [5]:
# Training table; the PRODUCT_LENGTH column read from it later is the regression target.
train_df = pd.read_csv(filepath_or_buffer="/kaggle/input/aml-dataset/train.csv")

In [6]:
# Regression target and the precomputed title embeddings used as features.
train_Y = train_df['PRODUCT_LENGTH'].values
train_X = np.load("/kaggle/input/amlc-title-embeddings-vanilla/title_embeddings.npy")

# The KNN model pairs embedding rows with targets purely by position, so a
# row-count mismatch would attach vectors to the wrong targets; fail fast instead.
if len(train_X) != len(train_Y):
    raise ValueError(
        f"title embeddings ({len(train_X)} rows) and train.csv targets "
        f"({len(train_Y)} rows) are not aligned"
    )

# Model log(1 + length); log1p is the numerically accurate form of log(y + 1).
train_Y_log = np.log1p(train_Y)

In [7]:
# X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y_log, test_size=0.20, random_state=0)
# X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(train_X, train_Y_log, test_size=0.20, random_state=0)

In [8]:
# 10-nearest-neighbour regressor over all training embeddings; the targets
# passed in are the log-transformed lengths.
model = FaissKNeighbors(k=10)
model.fit(X=train_X, y=train_Y_log)

In [9]:
# Housekeeping between fitting and loading the test embeddings: run Python's
# garbage collector, then call PyTorch's CUDA cache release.
# NOTE(review): presumably meant to free memory for the prediction step — confirm it has a measurable effect.
gc.collect()
torch.cuda.empty_cache()

In [10]:
# Precomputed title embeddings for the test set (the query vectors for the KNN model).
test_X = np.load(file="/kaggle/input/amlc-test-title-embeddings/title_embeddings_test.npy")

In [11]:
# Predict the test set in 16 batches and collect the per-batch outputs
# (still in log space) for concatenation afterwards.
predictions = []

# np.array_split, unlike np.split, does not raise when the number of test rows
# is not an exact multiple of 16; for divisible sizes the batches are identical.
for test_x in tqdm(np.array_split(test_X, 16)):
    y_preds = model.predict(test_x)
    predictions.append(y_preds)
    # Release memory between batches.
    gc.collect()
    torch.cuda.empty_cache()

100%|██████████| 16/16 [03:30<00:00, 13.15s/it]


In [12]:
# Join the per-batch outputs into one vector and undo the log(y + 1) target
# transform to get predictions back on the original length scale.
y_preds = np.exp(np.concatenate(predictions)) - 1

In [13]:
# Start from the sample submission so the columns match the expected layout.
# `test_df` remains an alias of the same frame, as before.
# NOTE(review): assumes the sample-submission rows are in the same order as the
# test embeddings — confirm.
submission = test_df = pd.read_csv("/kaggle/input/aml-dataset/sample_submission.csv")
submission['PRODUCT_LENGTH'] = y_preds
# index=False: without it pandas writes the RangeIndex as an extra, unnamed
# first column that is not part of the sample-submission format.
submission.to_csv("KNN_submission.csv", index=False)