In [10]:
import os
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from scipy.stats import rankdata
from tqdm import tqdm
import scml

In [11]:
# Pick the CUDA device when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

if device.type == 'cuda':
    # Card name plus allocator statistics for GPU 0 (byte counts divided by 1024**3).
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print(
        'Allocated:',
        round(torch.cuda.memory_allocated(0) / 1024**3, 1),
        'GB',
    )
    print(
        'Cached:   ',
        round(torch.cuda.memory_reserved(0) / 1024**3, 1),
        'GB',
    )

Using device: cuda
NVIDIA GeForce GTX 1060 6GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [12]:
# Percentile grid from the 1st to the 99th; not referenced by the cells visible here.
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Presumably read by the Hugging Face tokenizers backend to disable its own
# thread pool -- TODO confirm against the installed sentence-transformers stack.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# pandas options are spelled with their fully qualified names: abbreviated
# names are resolved by pattern matching and can stop working when a later
# pandas release adds another option with a similar name.
# NOTE(review): `mode.use_inf_as_na` is deprecated in recent pandas releases;
# if the environment is upgraded, convert +/-inf to NaN explicitly instead.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Register the `progress_apply` / `progress_map` methods on pandas objects.
tqdm.pandas()

In [13]:
%%time
df = pd.read_csv("input/validation_data.csv", engine="c", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30108 entries, 0 to 30107
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   worker      30108 non-null  int64 
 1   less_toxic  30108 non-null  object
 2   more_toxic  30108 non-null  object
dtypes: int64(1), object(2)
memory usage: 705.8+ KB
Wall time: 216 ms


In [14]:
# Pool both comment columns and de-duplicate so every distinct comment gets
# exactly one row.
# The result is sorted because iterating a set of str follows string hashes,
# which Python randomises per process: without the sort, the row order of `df`
# (and of everything derived from it) changes on every kernel restart.
texts = sorted(set(df["less_toxic"]) | set(df["more_toxic"]))
# NOTE: `df` is rebound from the pair table to the text table here, so the
# load cell has to be re-run before this cell can be run a second time.
df = pd.DataFrame(data={"text": texts})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
dtypes: object(1)
memory usage: 111.5+ KB


# Preprocess Text

In [15]:
def preprocess(row) -> str:
    """Return ``row["text"]`` with whitespace normalised.

    The text is split on runs of whitespace (spaces, tabs, newlines, ...) and
    the pieces are re-joined with single spaces, which also strips leading and
    trailing whitespace.

    Args:
        row: Any mapping-like object supporting ``row["text"]`` that yields a
            ``str`` (here: a DataFrame row).

    Returns:
        The normalised text.
    """
    tokens = row["text"].split()
    return " ".join(tokens)

In [16]:
# Run `preprocess` over every row (axis=1) with a tqdm progress bar and write
# the normalised strings back into the same column.
col = "text"
normalised = df.progress_apply(preprocess, axis=1)
df[col] = normalised

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 66593.17it/s]


# Embeddings

In [17]:
# Load the sentence-embedding checkpoint from the local `pretrained/` folder
# onto the device selected earlier.
model = SentenceTransformer(
    "pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2",
    device=device,
)
# Sequence-length cap used by the model; presumably longer inputs are
# truncated to 128 tokens -- TODO confirm.
model.max_seq_length = 128
# Plain Python list of the cleaned texts, in DataFrame row order.
sentences = df["text"].tolist()

In [18]:
%%time
em = model.encode(sentences=sentences, batch_size=1000, show_progress_bar=True, convert_to_numpy=True)
print(f"em.shape={em.shape}")

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

em.shape=(14251, 384)
Wall time: 23.4 s


In [19]:
%%time
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
df[em_cols] = em
df[em_cols] = df[em_cols].astype(np.float32)
# avoid fragmented frame
df = df.copy()
del sentences

  self[col] = igetitem(value, i)


Wall time: 305 ms


In [20]:
# Column selection: the text column followed by every embedding column.
cols = ["text", *em_cols]
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 385 columns):
 #    Column  Non-Null Count  Dtype  
---   ------  --------------  -----  
 0    text    14251 non-null  object 
 1    zz0000  14251 non-null  float32
 2    zz0001  14251 non-null  float32
 3    zz0002  14251 non-null  float32
 4    zz0003  14251 non-null  float32
 5    zz0004  14251 non-null  float32
 6    zz0005  14251 non-null  float32
 7    zz0006  14251 non-null  float32
 8    zz0007  14251 non-null  float32
 9    zz0008  14251 non-null  float32
 10   zz0009  14251 non-null  float32
 11   zz0010  14251 non-null  float32
 12   zz0011  14251 non-null  float32
 13   zz0012  14251 non-null  float32
 14   zz0013  14251 non-null  float32
 15   zz0014  14251 non-null  float32
 16   zz0015  14251 non-null  float32
 17   zz0016  14251 non-null  float32
 18   zz0017  14251 non-null  float32
 19   zz0018  14251 non-null  float32
 20   zz0019  14251 non-null  float32
 21   zz0020  14

In [21]:
%%time
df[cols].to_parquet("output/val.parquet", index=False)

Wall time: 537 ms
