In [13]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
!pip install datasets
from datasets import load_dataset



In [14]:
# Select one fixed-size window of rows from the "train" split.
# Both slice bounds are derived from the same two values so they cannot drift
# apart when the window is changed (previously the start was a hand-computed
# 76998 == 3 * 25666 and the stop was written as 25666 * 4).
CHUNK_SIZE = 25666   # number of articles per window
CHUNK_INDEX = 3      # zero-based window index -> rows [76998, 102664)
chunk_start = CHUNK_INDEX * CHUNK_SIZE
chunk_stop = chunk_start + CHUNK_SIZE

# Slicing the split yields the selected rows, which are wrapped in a DataFrame.
wiki = pd.DataFrame(load_dataset("wikipedia", "20220301.simple")['train'][chunk_start:chunk_stop])
print(wiki.head())
print(wiki.columns.tolist())

       id                                                url  \
0  188301  https://simple.wikipedia.org/wiki/Western%20De...   
1  188303  https://simple.wikipedia.org/wiki/The%20Lonely...   
2  188309  https://simple.wikipedia.org/wiki/Portal%20%28...   
3  188310  https://simple.wikipedia.org/wiki/Valve%20Corp...   
4  188318  https://simple.wikipedia.org/wiki/Sistan%20and...   

                             title  \
0     Western Desert cultural bloc   
1                The Lonely Island   
2              Portal (video game)   
3                Valve Corporation   
4  Sistan and Baluchestan Province   

                                                text  
0  The Western Desert cultural bloc or just Weste...  
1  The Lonely Island is an American comedy group ...  
2  Portal is a puzzle video game made by Valve Co...  
3  Valve Corporation is an American video game de...  
4  Sistān o Balūchestān  is one of the 31 provinc...  
['id', 'url', 'title', 'text']


In [15]:
# Accumulator for the per-sentence records that the embedding loop appends to.
corpus = list()

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Sentence embedding model, placed on the device chosen above.
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device=device,
)

# Put the model in evaluation mode; as the last expression of the cell this
# also displays the model's repr.
model.eval()

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [16]:
import nltk
# Punkt tokenizer data; sent_tokenize below relies on it.
nltk.download('punkt_tab')
print("Cuda available?: ", torch.cuda.is_available())

# Start from an empty list every time this cell runs. Without this reset,
# re-running the cell would append a second copy of every sentence record to
# the `corpus` list left over from the previous run.
corpus = []

# One record per sentence: article metadata, the sentence's position within
# its article, the sentence text, and its embedding vector.
for row in tqdm(wiki.itertuples(index=False), total=len(wiki)):
  article_sentences = sent_tokenize(row.text)
  if not article_sentences:
    # Article produced no sentences (e.g. empty text): there is nothing to
    # embed, and encoding an empty list is not handled by every
    # sentence-transformers version, so skip it.
    continue
  encoded_tensor = model.encode(article_sentences, batch_size=64, device=device, normalize_embeddings=True, convert_to_tensor=True)
  # Move the embeddings to host memory as a NumPy array so that each row can
  # be stored alongside its sentence.
  encoded_sentences = encoded_tensor.cpu().numpy()
  corpus.extend({
      'article_id': row.id,
      'article_url': row.url,
      'article_title': row.title,
      'sentence_id': sentence_id,
      'sentence_text': sentence,
      'sentence_embedding': sentence_embedding
  } for sentence_id, (sentence, sentence_embedding) in enumerate(zip(article_sentences, encoded_sentences)))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Cuda available?:  True


100%|██████████| 25666/25666 [05:55<00:00, 72.12it/s]


In [17]:
# Turn the list of per-sentence dicts into a single table (one row per sentence).
df = pd.DataFrame(data=corpus)

# Show the first rows as a quick sanity check.
preview = df.head()
print(preview)

  article_id                                        article_url  \
0     188301  https://simple.wikipedia.org/wiki/Western%20De...   
1     188301  https://simple.wikipedia.org/wiki/Western%20De...   
2     188301  https://simple.wikipedia.org/wiki/Western%20De...   
3     188301  https://simple.wikipedia.org/wiki/Western%20De...   
4     188303  https://simple.wikipedia.org/wiki/The%20Lonely...   

                  article_title  sentence_id  \
0  Western Desert cultural bloc            0   
1  Western Desert cultural bloc            1   
2  Western Desert cultural bloc            2   
3  Western Desert cultural bloc            3   
4             The Lonely Island            0   

                                       sentence_text  \
0  The Western Desert cultural bloc or just Weste...   
1  The Western Desert can be said to stretch from...   
2  This term is often used by anthropologists and...   
3  Indigenous Australian culture\nRegions of Aust...   
4  The Lonely Island is an A

In [18]:
!pip install fastparquet
df.to_parquet("wikisimple_embedded_4.parquet")
from google.colab import files
files.download("wikisimple_embedded_4.parquet")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>