# Samuel Anozie
## NLP Text Classification
### Fall 2022

This project classifies text regarding stereotypical keywords.

1. Load the data into a pandas dataframe, taking relevant feature and target data. Models will be trained on the following data:

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from nltk.corpus import stopwords


stoplist = set(stopwords.words('english'))

stereotypes = pd.read_csv('Stereotypes.csv', usecols=[2, 5, 6, 7, 8, 14, 15, 16, 17, 18], names=["phrase", "sex", "age", "race", "politics", "friendly", "trustworthy", "confident", "competent", "wealthy"], header=2)
stereotypes

Unnamed: 0,phrase,sex,age,race,politics,friendly,trustworthy,confident,competent,wealthy
0,a drain,Female,39,Black,1) Extremely Liberal,2,4,3,3,2
1,a hassle,Male,19,White,4,3,3,3,3,3
2,a pain,Female,32,White,4,3,3,2,1,1
3,a person,Female,65,White,3,2,3,2,4,1
4,a thinker,Female,65,White,3,4,3,5,5,4
...,...,...,...,...,...,...,...,...,...,...
7021,true to self,Female,47,White,6,5,3,5,5,3
7022,true to self,Female,43,White,4,2,3,5,5,5
7023,i don't know.,Male,32,Hispanic,5,3,3,3,3,3
7024,to the radical left: racists,Male,32,Hispanic,5,3,3,3,3,3


Clean up data

In [2]:
stereotypes['phrase'].replace('', np.nan, inplace=True)
stereotypes.dropna(inplace=True)
stereotypes['politics'].replace('1) Extremely Liberal', '1', inplace=True)
stereotypes['politics'].replace('7) Extremely Conservative', '7', inplace=True)
stereotypes = stereotypes.astype({'politics': 'int'})
stereotypes

Unnamed: 0,phrase,sex,age,race,politics,friendly,trustworthy,confident,competent,wealthy
0,a drain,Female,39,Black,1,2,4,3,3,2
1,a hassle,Male,19,White,4,3,3,3,3,3
2,a pain,Female,32,White,4,3,3,2,1,1
3,a person,Female,65,White,3,2,3,2,4,1
4,a thinker,Female,65,White,3,4,3,5,5,4
...,...,...,...,...,...,...,...,...,...,...
7021,true to self,Female,47,White,6,5,3,5,5,3
7022,true to self,Female,43,White,4,2,3,5,5,5
7023,i don't know.,Male,32,Hispanic,5,3,3,3,3,3
7024,to the radical left: racists,Male,32,Hispanic,5,3,3,3,3,3


In [3]:
from sklearn.model_selection import train_test_split

# Split the data into train and test with 80 train / 20 test
train,test = train_test_split(stereotypes, test_size=0.2, random_state = 14)
# train,val = train_test_split(train, test_size=0.2, random_state = 1234)
train

Unnamed: 0,phrase,sex,age,race,politics,friendly,trustworthy,confident,competent,wealthy
403,book,Female,24,Black,3,4,3,3,3,3
5861,stupid,Female,36,White,3,4,2,3,1,1
4885,rich,Male,32,Hispanic,5,4,4,5,4,5
6256,undesirable,Male,29,White,1,2,1,2,1,1
3426,liberal,Female,39,White,4,3,3,5,5,5
...,...,...,...,...,...,...,...,...,...,...
6481,untrustworthy,Male,29,White,7,2,4,3,5,2
2454,hard working,Female,42,Black,5,5,3,5,5,4
1292,difficult,Male,51,White,3,2,3,2,1,3
5474,smart,Male,37,White,2,3,3,3,3,3


Make targets/features with tf

In [4]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32, targets=[]):
    df = dataframe.copy()
    labels = pd.concat([df.pop(x) for x in targets], axis=1)
    df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

targets = ["friendly", "trustworthy", "confident", "competent", "wealthy"]
batch_size = 16
train_ds = df_to_dataset(train, batch_size=batch_size, targets=targets)
# val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size, targets=targets)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size, targets=targets)
train_ds

  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
2022-12-08 11:29:42.880502: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}


<PrefetchDataset element_spec=({'phrase': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'sex': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'age': TensorSpec(shape=(None, 1), dtype=tf.int64, name=None), 'race': TensorSpec(shape=(None, 1), dtype=tf.string, name=None), 'politics': TensorSpec(shape=(None, 1), dtype=tf.int64, name=None), 'friendly': TensorSpec(shape=(None, 1), dtype=tf.int64, name=None), 'trustworthy': TensorSpec(shape=(None, 1), dtype=tf.int64, name=None), 'confident': TensorSpec(shape=(None, 1), dtype=tf.int64, name=None), 'competent': TensorSpec(shape=(None, 1), dtype=tf.int64, name=None), 'wealthy': TensorSpec(shape=(None, 1), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 5), dtype=tf.int64, name=None))>

Do smtn else

In [5]:
import re
import string

def normalize(text):
    remove_regex = f'[{re.escape(string.punctuation)}]'
    space_regex = '...'
    result = tf.strings.lower(text)
    result = tf.strings.regex_replace(result, remove_regex, '')
    result = tf.strings.regex_replace(result, space_regex, ' ')
    return result

def get_vectorization_layer(name, dataset):
    # Create a Normalization layer for the feature.
    vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
        standardize=normalize,
        output_mode='tf_idf')

    # Prepare a Dataset that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the statistics of the data.
    vectorizer.adapt(feature_ds)

    return vectorizer

def get_normalization_layer(name, dataset):
    # Create a Normalization layer for the feature.
    normalizer = tf.keras.layers.Normalization(axis=None)

    # Prepare a Dataset that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the statistics of the data.
    normalizer.adapt(feature_ds)

    return normalizer

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a layer that turns strings into integer indices.
    if dtype == 'string':
        index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
    # Otherwise, create a layer that turns integer values into integer indices.
    else:
        index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)

    # Prepare a `tf.data.Dataset` that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Encode the integer indices.
    encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())

    # Apply multi-hot encoding to the indices. The lambda function captures the
    # layer, so you can use them, or include them in the Keras Functional model later.
    return lambda feature: encoder(index(feature))

In [6]:

# Text features.
def vector_features(headers):
    inputs = []
    encoded_features = []
    for header in headers:
        text_col = tf.keras.Input(shape=(1,), name=header, dtype=tf.string)
        vectorization_layer = get_vectorization_layer(header, train_ds)
        print(vectorization_layer)
        encoded_text_col = vectorization_layer(text_col)
        # embed_text_col = tf.keras.layers.Embedding(8000, 64, name="embedding")(encoded_text_col)
        # encoded_text_col = tf.keras.layers.GlobalAveragePooling1D()(embed_text_col)
        encoded_text_col = tf.cast(encoded_text_col, tf.float64)
        inputs.append(text_col)
        encoded_features.append(encoded_text_col)

    return inputs, encoded_features

# Text features (embeddings).
def embedding_features(headers):
    inputs = []
    encoded_features = []
    for header in headers:
        text_col = tf.keras.Input(shape=(1,), name=header, dtype=tf.string)
        vectorization_layer = get_vectorization_layer(header, train_ds)
        encoded_text_col = vectorization_layer(text_col)
        embed_text_col = tf.keras.layers.Embedding(8000, 64, name="embedding")(encoded_text_col)
        encoded_text_col = tf.keras.layers.GlobalAveragePooling1D()(embed_text_col)
        inputs.append(text_col)
        encoded_features.append(encoded_text_col)

    return inputs, encoded_features

# Numerical features.
def numerical_features(headers):
    inputs = []
    encoded_features = []
    for header in headers:
        numeric_col = tf.keras.Input(shape=(1,), name=header)
        normalization_layer = get_normalization_layer(header, train_ds)
        encoded_numeric_col = normalization_layer(numeric_col)
        inputs.append(numeric_col)
        encoded_features.append(encoded_numeric_col)

    return inputs, encoded_features


def numerical_category_features(headers):
    inputs = []
    encoded_features = []
    for header in headers:
        num_category_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
        num_category_layer = get_category_encoding_layer(name=header,
                                                     dataset=train_ds,
                                                     dtype='int64',
                                                     max_tokens=5)
        encoded_num_category_col = num_category_layer(num_category_col)
        inputs.append(num_category_col)
        encoded_features.append(encoded_num_category_col)

    return inputs, encoded_features

def category_features(headers):
    inputs = []
    encoded_features = []
    for header in headers:
        categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
        encoding_layer = get_category_encoding_layer(name=header,
                                                     dataset=train_ds,
                                                     dtype='string',
                                                     max_tokens=5)
        encoded_categorical_col = encoding_layer(categorical_col)
        inputs.append(categorical_col)
        encoded_features.append(encoded_categorical_col)

    return inputs, encoded_features

Wowowowowow

In [7]:
vector_inputs, vector_encoded = vector_features(['phrase'])
number_inputs, number_encoded = numerical_features(['age'])
number_category_inputs, number_category_encoded = numerical_category_features(['politics'])
category_inputs, category_encoded = category_features(['race', 'sex'])

all_inputs = [*vector_inputs, *number_inputs, *number_category_inputs, *category_inputs]
all_encoded_features = [*vector_encoded, *number_encoded, *number_category_encoded, *category_encoded]

# Combine all of the input layers into one
all_features = tf.keras.layers.Concatenate()(all_encoded_features)
base_layer = tf.keras.layers.Dense(units='128', activation='relu')(all_features)

outputs = []

for target in targets:
    layer = tf.keras.layers.Dense(units='64', activation='relu')(base_layer)
    layer = tf.keras.layers.Dense(units='32', activation='relu')(layer)
    outputs.append(tf.keras.layers.Dense(units='1', activation='relu', name=target)(layer))

model = tf.keras.Model(all_inputs, outputs=outputs)
model.summary()

<keras.layers.preprocessing.text_vectorization.TextVectorization object at 0x7ff438bc7730>
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 phrase (InputLayer)            [(None, 1)]          0           []                               
                                                                                                  
 politics (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 race (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 sex (InputLayer)               [(None, 1)]          0           []                               
   

In [8]:
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.MeanAbsolutePercentageError(reduction="auto"),
              metrics=tf.keras.metrics.RootMeanSquaredError())

history = model.fit(train_ds, epochs=48, validation_data=test_ds)
evaluation = model.evaluate(test_ds)
print(evaluation)

Epoch 1/48


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/48
Epoch 3/48
Epoch 4/48
Epoch 5/48
Epoch 6/48
Epoch 7/48
Epoch 8/48
Epoch 9/48
Epoch 10/48
Epoch 11/48
Epoch 12/48
Epoch 13/48
Epoch 14/48
Epoch 15/48
Epoch 16/48
Epoch 17/48
Epoch 18/48
Epoch 19/48
Epoch 20/48
Epoch 21/48
Epoch 22/48
Epoch 23/48
Epoch 24/48
Epoch 25/48
Epoch 26/48
Epoch 27/48
Epoch 28/48
Epoch 29/48
Epoch 30/48
Epoch 31/48
Epoch 32/48
Epoch 33/48
Epoch 34/48
Epoch 35/48
Epoch 36/48
Epoch 37/48
Epoch 38/48
Epoch 39/48
Epoch 40/48
Epoch 41/48
Epoch 42/48
Epoch 43/48
Epoch 44/48
Epoch 45/48
Epoch 46/48
Epoch 47/48
Epoch 48/48
[231.31642150878906, 46.128910064697266, 46.25299072265625, 46.59382629394531, 46.25303649902344, 46.08761215209961, 1.5400482416152954, 1.5268834829330444, 1.5213350057601929, 1.5387673377990723, 1.5233330726623535]


Try with embeddings

Build the model