# Dataset Preprocessing

Now let's transform the dataset into the desired dataset.

Tasks:
- To balance the dataset (done)
- To tokenize text
- To pad text
- To transform categories into numerical categories

In [1]:
# Import necessary libraries
import pandas as pd

In [2]:
# Specify the file path where you want to save the CSV file
file_path = 'train_data.csv'
loaded_df = pd.read_csv(file_path)

In [3]:
loaded_df

Unnamed: 0,text,category
0,Estoy en contra con la acción de fumar de luga...,B2
1,"En mi opinión , fumar en lugares públicos debe...",B2
2,! Buenos días ! Me llamo Tokareva_Ekaterina . ...,B2
3,Por ultimo yo os animo a todos para ver esa pe...,C1
4,"En el año pasado , estuve con mi madre en Raba...",B1
...,...,...
17899,"¡ Hola , mamá , hermana y papá ! Por eso , no ...",A1
17900,Buenos tardes ! Yo soy Coreana . Tengo 42 años...,A1
17901,Me llamo Federico estoy escribiendo esté a la ...,C1
17902,"Eso es por un lado , por_otro_lado debemos que...",B2


In [4]:
# Count the number of elements per category
category_counts = loaded_df['category'].value_counts()
category_counts

category
C1    3640
A2    3575
B2    3563
B1    3563
A1    3563
Name: count, dtype: int64

# Tokenization and Padding

In [5]:
from transformers import AutoTokenizer

#2847 Iis the max lenght of the texts
SEQ_LEN = 512

#i chose bert-base-cased because for me it's important to recognize the capital letters 
# in the text to identify the language level
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")



  from .autonotebook import tqdm as notebook_tqdm


In [6]:
#testing tokenizer
text = "Hugging Face Transformers library is great for NLP."
tokens = tokenizer.encode_plus(
    text, 
    max_length= SEQ_LEN, 
    truncation= True, 
    padding= "max_length",
    add_special_tokens = True,
    return_token_type_ids = False,
    return_attention_mask = True,
    return_tensors = 'tf'
    )
tokens

{'input_ids': <tf.Tensor: shape=(1, 512), dtype=int32, numpy=
array([[  101, 20164, 10932, 10289, 25267,  3340,  1110,  1632,  1111,
        21239,  2101,   119,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0, 

In [7]:
type(tokens)

transformers.tokenization_utils_base.BatchEncoding

In [8]:
tokens['input_ids']

<tf.Tensor: shape=(1, 512), dtype=int32, numpy=
array([[  101, 20164, 10932, 10289, 25267,  3340,  1110,  1632,  1111,
        21239,  2101,   119,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [9]:
tokens['attention_mask']

<tf.Tensor: shape=(1, 512), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [10]:
def getTokens(token):
    encoded_token = tokenizer.encode_plus(
    text, 
    max_length= SEQ_LEN, 
    truncation= True, 
    padding= "max_length",
    add_special_tokens = True,
    return_token_type_ids = False,
    return_attention_mask = True,
    return_tensors = 'tf'
    )
    return encoded_token['input_ids'],encoded_token['attention_mask']

In [11]:
import numpy as np

Xids= np.zeros((len(loaded_df), SEQ_LEN))
Xmask= np.zeros((len(loaded_df), SEQ_LEN))

In [12]:
Xids.shape

(17904, 512)

In [13]:
loaded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17904 entries, 0 to 17903
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      16888 non-null  object
 1   category  17904 non-null  object
dtypes: object(2)
memory usage: 279.9+ KB


In [14]:
#casting text to String
loaded_df['text'] = loaded_df['text'].astype(str)

In [15]:
type(loaded_df['text'])

pandas.core.series.Series

In [16]:
#checking type
for i, sequence in enumerate(loaded_df['text']):
    if not isinstance(sequence, str):
        print(f"Warning: 'text' at index {i} is not a string: {sequence}")

In [17]:
for i, sequence in enumerate(loaded_df['text']):
    encoded_token = tokenizer.encode_plus(
        sequence, 
        max_length= SEQ_LEN, 
        truncation= True, 
        padding= "max_length",
        add_special_tokens = True,
        return_token_type_ids = False,
        return_attention_mask = True,
        return_tensors = 'tf'
    )
    Xids[i,:], Xmask[i,:] = encoded_token['input_ids'],encoded_token['attention_mask']

In [18]:
Xids

array([[  101.,   142., 12223., ...,     0.,     0.,     0.],
       [  101., 13832.,  1940., ...,     0.,     0.,     0.],
       [  101.,   106.,  8883., ...,     0.,     0.,     0.],
       ...,
       [  101.,  2508.,  1325., ...,     0.,     0.,     0.],
       [  101.,   142.,  7301., ...,     0.,     0.,     0.],
       [  101.,   159., 16931., ...,     0.,     0.,     0.]])

In [19]:
Xmask

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [20]:
loaded_df['category'].value_counts()

category
C1    3640
A2    3575
B2    3563
B1    3563
A1    3563
Name: count, dtype: int64

In [21]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder instance
label_encoder = LabelEncoder()

# Fit the encoder to the unique category strings and transform the 'category' column
loaded_df['numerical_category'] = label_encoder.fit_transform(loaded_df['category'])

# Display the DataFrame with the new 'numerical_category' column
print(loaded_df[['category', 'numerical_category']])


      category  numerical_category
0           B2                   3
1           B2                   3
2           B2                   3
3           C1                   4
4           B1                   2
...        ...                 ...
17899       A1                   0
17900       A1                   0
17901       C1                   4
17902       B2                   3
17903       B2                   3

[17904 rows x 2 columns]


In [22]:
original_label = label_encoder.inverse_transform([4])
original_label

array(['C1'], dtype=object)

Valores: 
- A1 = 0
- A2 = 1
- B1 = 2
- B2 = 3
- C1 = 4

# transforming categories into numerical categories

In [23]:
arr = loaded_df['numerical_category'].values
arr.size

17904

In [24]:
labels = np.zeros((arr.size, arr.max()+1))
labels.shape

(17904, 5)

In [25]:
labels[0]

array([0., 0., 0., 0., 0.])

In [26]:
labels[np.arange(arr.size), arr] = 1
labels

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.]])

In [27]:
labels.shape

(17904, 5)

# Spliting

In [28]:
lenght = Xids.shape

In [29]:
lenght = lenght[0]
lenght

17904

In [30]:
split_radio = 0.8
# Calculate the split point
split_point = int(lenght* split_radio)

In [31]:
Xids_train = Xids[:split_point]
Xids_test = Xids[split_point:]

In [32]:
Xmask_train = Xmask[:split_point]
Xmask_test = Xmask[split_point:]

In [33]:
labels_train = labels[:split_point]
labels_test = labels[split_point:]

# saving Xid and Xmask arr in np arrays

In [34]:
with open('Xids.npy', 'wb') as f:
    np.save(f, Xids)
    
with open('Xmask.npy', 'wb') as f:
    np.save(f, Xmask)

with open('Labels.npy', 'wb') as f:
    np.save(f, labels)

In [35]:
with open('Xids_train.npy', 'wb') as f:
    np.save(f, Xids_train)
    
with open('Xmask_train.npy', 'wb') as f:
    np.save(f, Xmask_train)

with open('Labels_train.npy', 'wb') as f:
    np.save(f, labels_train)

In [36]:
with open('Xids_test.npy', 'wb') as f:
    np.save(f, Xids_test)
    
with open('Xmask_test.npy', 'wb') as f:
    np.save(f, Xmask_test)

with open('Labels_test.npy', 'wb') as f:
    np.save(f, labels_test)

### free up space

In [37]:
del loaded_df, Xids, Xmask, labels

In [38]:
# kill kernel

import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

: 

Tasks:
- To balance the dataset (done)
- To tokenize text (done)
- To pad text (done)
- To transform categories into numerical categories (done)