In [1]:
from transformers import AutoTokenizer
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import os
import re

# Tokenizer for the 'NbAiLab/nb-bert-base' checkpoint; used further down to
# measure how many tokens each text produces.
tokenizer = AutoTokenizer.from_pretrained('NbAiLab/nb-bert-base')

# The data folders are resolved relative to the parent of the current
# working directory.
base_dir = Path.cwd().parent
train_dir, dev_dir, test_dir = (
    base_dir / 'data' / split for split in ('train', 'dev', 'test')
)

def preprocess_text(text):
    """Lowercase ``text``, collapse whitespace and tidy spacing before punctuation.

    Steps, in order:
      1. lowercase the text;
      2. replace every run of whitespace with a single space;
      3. drop the whitespace in front of ``?``, ``.``, ``!`` or a double
         quote when that mark is followed by whitespace or the end of text;
      4. strip leading and trailing whitespace.

    Args:
        text: Input string.

    Returns:
        The normalised string.
    """
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    # The trailing whitespace is checked with a lookahead instead of being
    # consumed, so runs of spaced-out marks ("wow . . .") are all fixed;
    # consuming it made re.sub skip every second mark in such a run.
    text = re.sub(r'\s([?.!"])(?=\s|$)', r'\1', text)
    return text.strip()

# Read every document from the three splits. Path.iterdir() yields entries in
# arbitrary order, so each listing is sorted to keep positions in `all_texts`
# stable between runs. Entries that are not regular files (for example a
# .ipynb_checkpoints directory) are skipped instead of crashing the read.
raw_texts = [
    path.read_text(encoding='utf-8')
    for folder in (train_dir, dev_dir, test_dir)
    for path in sorted(folder.iterdir())
    if path.is_file()
]

# Keep the raw strings under their own name; `all_texts` holds the cleaned text.
all_texts = [preprocess_text(text) for text in raw_texts]

# Token count per document, with the tokenizer's special tokens included.
tokenized_lengths = [
    len(tokenizer.encode(text, add_special_tokens=True))
    for text in tqdm(all_texts, desc="Tokenizing texts")
]

df = pd.DataFrame({"text": all_texts, "token_length": tokenized_lengths})

print(df.describe())

Tokenizing texts: 100%|██████████| 43437/43437 [04:14<00:00, 170.64it/s]


       token_length
count  43437.000000
mean     706.799342
std      452.973931
min        7.000000
25%      396.000000
50%      629.000000
75%      888.000000
max     6466.000000


In [2]:
import json

# Parse the metadata JSON stored in the data folder.
metadata = json.loads(
    (base_dir / 'data' / 'metadata.json').read_text(encoding='utf-8')
)

def create_df(folder_path):
    """Build a DataFrame of preprocessed texts and ratings for one folder.

    Each regular file in ``folder_path`` whose stem is a key of the
    module-level ``metadata`` mapping contributes one row; every other
    entry is skipped.

    Args:
        folder_path: Directory to read (anything ``Path`` accepts).

    Returns:
        ``pandas.DataFrame`` with two columns: ``text`` (the file content
        passed through ``preprocess_text``) and ``score``
        (``metadata[stem]['rating']``).
    """
    folder_path = Path(folder_path)

    data = {'text': [], 'score': []}

    # iterdir() order is arbitrary; sorting makes the row order reproducible.
    for file_path in sorted(folder_path.iterdir()):
        file_id = file_path.stem
        # Skip directories and other non-file entries, and files that have
        # no metadata record.
        if not file_path.is_file() or file_id not in metadata:
            continue
        data['text'].append(preprocess_text(file_path.read_text(encoding='utf-8')))
        data['score'].append(metadata[file_id]['rating'])

    return pd.DataFrame(data)
                     
# Build the training frame and print how many texts carry each score.
train_df = create_df(folder_path=train_dir)
print(train_df.loc[:, 'score'].value_counts())

score
5    12614
4    11304
3     6004
2     2287
6     2161
1      379
Name: count, dtype: int64


In [4]:
# Spot-check: show the first 300 characters of the preprocessed text at index 240.
all_texts[240][:300]

'master of none s01 mesterlig av aziz ansari. jeg blir truffet av alt aziz ansari kaster mot meg i hans nye komiserie master of none. jeg elsker musikken, digger humoren og storkoser meg med den laidbacke stemningen. jeg liker barene de henger på, jakkene de bruker, nabolagene de rusler rundt i, venn'