In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input mount.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv
/kaggle/input/learning-equality-curriculum-recommendations/topics.csv
/kaggle/input/learning-equality-curriculum-recommendations/correlations.csv
/kaggle/input/learning-equality-curriculum-recommendations/content.csv


In [2]:
!pip install cupy

Collecting cupy
  Downloading cupy-13.3.0.tar.gz (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting fastrlock>=0.5 (from cupy)
  Using cached fastrlock-0.8.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_28_x86_64.whl.metadata (9.3 kB)
Using cached fastrlock-0.8.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_28_x86_64.whl (51 kB)
Building wheels for collected packages: cupy
  Building wheel for cupy (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[63 lines of output][0m
  [31m   [0m Generating cache key from header files...
  [31m   [0m Cache key (1610 files matching /tmp/pip-install-d86cn7ef/cupy_4d13ecad8c5c4144bd2f5586

### Load Python Packages

In [3]:
#basics
import numpy as np
import pandas as pd 
import seaborn as sns
import time
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.base import clone
from sklearn import set_config
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

#preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer, quantile_transform

#statistics
from scipy.stats import randint, mode, pearsonr, norm, skew, kurtosis 
from scipy.optimize import minimize

#transformers and pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone, ClassifierMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion

#feature engineering
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import RFECV

#algorithms
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, Pool, CatBoostRegressor
import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor
from lightgbm.callback import early_stopping, log_evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingRegressor, VotingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

#model evaluation
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score, log_loss, auc, accuracy_score, balanced_accuracy_score
from sklearn.metrics import mean_squared_error, make_scorer, RocCurveDisplay, confusion_matrix
from sklearn.metrics import roc_curve, matthews_corrcoef, cohen_kappa_score

# Optuna and visualization tools
import optuna
# Feature flag, presumably meant to switch Optuna tuning on or off; none of the cells
# shown in this notebook reads it.
USE_OPTUNA = False
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

from matplotlib.colors import LinearSegmentedColormap

# One seed value exposed under both spellings, plus a split count (presumably CV folds).
# None of the cells shown in this notebook reads these values.
SEED = 42
random_state = SEED

n_splits = 5

In [4]:
import torch
from transformers import AutoTokenizer, AutoModel
import cupy as cp
from cuml.metrics import pairwise_distances

# Default to the CPU and switch to CUDA only when torch reports a usable GPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
%env TOKENIZERS_PARALLELISM=true

import os
# Print the full path of every file in the competition dataset directory.
competition_dir = '/kaggle/input/learning-equality-curriculum-recommendations'
for folder, _, names in os.walk(competition_dir):
    for name in names:
        print(os.path.join(folder, name))

ModuleNotFoundError: No module named 'cupy'

### First Look into Data

#### Configuration and data loading

In [None]:
class CFG:
    """Static settings shared by the notebook cells; accessed as `CFG.NAME`, not instantiated here."""

    # Retrieval: number of content ids kept per topic in the submission.
    SELECT_TOP_N = 5
    # Encoding: token ids beyond this position are dropped before the model call.
    MAX_LEN = 384

    # Kaggle dataset mounts: the embedding model weights and the competition CSVs.
    MODEL = '/kaggle/input/sentence-embedding-models/paraphrase-MiniLM-L12-v2'
    INPUT = '/kaggle/input/learning-equality-curriculum-recommendations'

In [None]:
# Read the four competition CSVs from CFG.INPUT, in the same order as before:
# content, correlations, topics, sample_submission.
content_df, correlations_df, topics_df, sub_df = (
    pd.read_csv(f'{CFG.INPUT}/{table_name}.csv')
    for table_name in ('content', 'correlations', 'topics', 'sample_submission')
)

### Modeling

In [None]:
# Load the tokenizer and encoder from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL)

# Inference only: move the encoder to the chosen device and put it in eval mode.
# Both `.to()` and `.eval()` return the module itself, so the calls can be chained.
model = AutoModel.from_pretrained(CFG.MODEL).to(device).eval()

### Vector Creation

In [None]:
# Embed every content row: one mean-pooled vector per row, stacked into `vecs1`
# in content_df row order (that order is what the prediction cell indexes into).
vecs = []
for _, row in tqdm(content_df.iterrows(), total=len(content_df)):
    # Use the first field that actually holds a string. Empty CSV cells come back
    # from pandas as float NaN, which the tokenizer cannot encode.
    title = next(
        (row[col] for col in ('title', 'description', 'text') if isinstance(row[col], str)),
        # All three fields empty: previously NaN was passed to the tokenizer and the
        # whole loop crashed. Fall back to the placeholder the topic loop already uses.
        "This content contains no text.",
    )

    tok = tokenizer(title)
    for k, v in tok.items():
        # Keep at most MAX_LEN token ids and add a leading batch dimension of 1.
        tok[k] = torch.tensor(v[:CFG.MAX_LEN]).to(device).unsqueeze(0)
    with torch.no_grad():
        output = model(**tok)
    # Drop the batch dimension and average over the remaining leading axis
    # (the token axis, assuming last_hidden_state is (1, tokens, hidden)).
    vec = output.last_hidden_state.squeeze(0).mean(0).cpu()
    vecs.append(vec)

vecs1 = torch.stack(vecs)

In [None]:
# Select the topics to predict for: exactly one row per submission row, in submission order.
# The previous `query('id in [...]')` returned matches in topics_df order and silently
# dropped ids that were missing or repeated, while the predictions are later written back
# to sub_df positionally - so a prediction could be attached to the wrong topic_id.
sub_topic_ids = sub_df['topic_id'].tolist()
_topics_df = (
    topics_df
    .set_index('id', drop=False)   # keep `id` as a column as well as the lookup key
    .reindex(sub_topic_ids)        # submission order; unknown ids become all-NaN rows
    .reset_index(drop=True)
)
# reindex raises if topics_df contains duplicate ids, so a 1:1 mapping is enforced.
# All-NaN rows are handled downstream by the topic loop's placeholder text.

In [None]:
# Embed every selected topic the same way as the content rows: one mean-pooled
# vector per topic, stacked into `vecs2` in _topics_df row order.
vecs = []
with torch.no_grad():
    for _, topic in tqdm(_topics_df.iterrows(), total=len(_topics_df)):
        # Text preference: title, then description, then a fixed placeholder.
        # A float here means pandas read an empty cell (NaN).
        text = topic['title']
        if type(text) is float:
            text = topic['description']
        if type(text) is float:
            text = "This content contains no text."

        # Truncate each tokenizer field to MAX_LEN ids and add a batch dimension of 1.
        encoded = {
            field: torch.tensor(ids[:CFG.MAX_LEN]).to(device).unsqueeze(0)
            for field, ids in tokenizer(text).items()
        }
        hidden = model(**encoded).last_hidden_state
        vecs.append(hidden.squeeze(0).mean(0).cpu())

vecs2 = torch.stack(vecs)

In [None]:
# Hand both embedding matrices (content, then topics) to CuPy for the distance computation.
vecs1, vecs2 = (cp.asarray(matrix) for matrix in (vecs1, vecs2))

### Predictions through Models

In [None]:
# For each topic vector, take the SELECT_TOP_N content rows with the smallest cosine
# distance and join their ids into one space-separated string.
# argsort yields row *positions* in vecs1, so ids are looked up positionally rather than
# by label (`.loc`), which was only correct while content_df kept its default RangeIndex.
content_ids = content_df['id'].to_numpy()  # hoisted: identical for every topic

predicts = []
for v2 in vecs2:
    # Distances from this single topic (shape (1, dim)) to every content vector.
    dist = pairwise_distances(v2.reshape(1, -1), vecs1, metric='cosine')
    # Ascending sort puts the nearest content first; .get() copies the indices to host memory.
    top_idx = dist.argsort(1)[0, :CFG.SELECT_TOP_N].get()
    predicts.append(" ".join(content_ids[top_idx]))

In [None]:
# Attach the predicted id strings (one per submission row, matched by position)
# and preview the first rows.
sub_df = sub_df.assign(content_ids=predicts)
sub_df.head()

In [None]:
# Write the submission without the row index. `index` is documented as a bool;
# the previous `index=None` only worked because None happens to be falsy.
sub_df.to_csv('submission.csv', index=False)