In [51]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np



In [52]:
from tensorflow.keras.layers import TextVectorization

In [53]:
data=pd.read_csv("HateSpeechDetection_cleaned.csv")
X = data['Text']
y = data[data.columns[1]].values

In [54]:
y

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

# Text Verctorization:
Vectorization is the process of converting text into numerical representations. The TextVectorization layer is designed to standardize the text data, tokenize it, and convert it into integer sequences that can be used as input for deep learning model.

In [55]:
vectorizer = TextVectorization(max_tokens=20000,
                               output_sequence_length=1800,
                               output_mode='int')
vectorizer.adapt(X.values)
vectorized_text = vectorizer(X.values)
vectorized_text

<tf.Tensor: shape=(17596, 1800), dtype=int64, numpy=
array([[ 460,    6,  293, ...,    0,    0,    0],
       [   6,   36,  190, ...,    0,    0,    0],
       [  33,   17,    4, ...,    0,    0,    0],
       ...,
       [  18,   78,   87, ...,    0,    0,    0],
       [  37, 2243, 2842, ...,    0,    0,    0],
       [  27, 2559,   57, ...,    0,    0,    0]], dtype=int64)>

Vocabulary Size (max_tokens=20000):

By setting max_tokens to 20,000, we limit the vocabulary to the 20,000 most frequent words in the dataset. This helps in reducing the computational complexity and memory usage while retaining the most important words for the task.

Sequence Length (output_sequence_length=1800):

The output_sequence_length parameter ensures that all text sequences are of equal length (1800 tokens in this case). Shorter sequences will be padded (usually with zeros), and longer sequences will be truncated. This uniformity is necessary for efficient batch processing and model training.

Integer Token Indices (output_mode='int'):

The output_mode='int' setting indicates that the output will be integer indices of tokens. This is a common approach in NLP tasks, where each unique token in the vocabulary is assigned a unique integer index.

In [56]:
vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'to',
 'a',
 'and',
 'i',
 'of',
 'is',
 'are',
 'you',
 'in',
 'that',
 'they',
 'it',
 'for',
 'be',
 'not',
 'this',
 'people',
 'have',
 'with',
 'all',
 'as',
 'my',
 'but',
 'like',
 'so',
 'on',
 'woman',
 'just',
 'if',
 'their',
 'its',
 'them',
 'we',
 'dont',
 'me',
 'do',
 'was',
 'can',
 'black',
 'or',
 'who',
 'what',
 'about',
 'there',
 'at',
 'from',
 'no',
 'your',
 'get',
 'fucking',
 'he',
 'when',
 'will',
 'one',
 'up',
 'im',
 'because',
 'think',
 'more',
 'would',
 'should',
 'by',
 'how',
 'out',
 'an',
 'white',
 'want',
 'why',
 'she',
 'being',
 'u',
 'know',
 'our',
 'these',
 'muslim',
 'country',
 'some',
 'men',
 'even',
 'make',
 'her',
 'fuck',
 'only',
 'were',
 'has',
 'hate',
 'say',
 'than',
 'need',
 'really',
 'time',
 'gay',
 'here',
 'now',
 'go',
 'see',
 'any',
 'most',
 'good',
 'those',
 'shit',
 'other',
 'then',
 'never',
 'thing',
 'look',
 'right',
 'way',
 'many',
 'been',
 'jew',
 'man',
 'life',
 'cant',
 't

In [57]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.shuffle(18000)
dataset.as_numpy_iterator().next()


(array([2887, 1121,   11, ...,    0,    0,    0], dtype=int64), 0)

In [58]:
data.iloc[0]

Text     damn i thought they had strict gun law in germany
Label                                                    0
Name: 0, dtype: object

we can observe that the text has been vectorized 

In [59]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))