In [1]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
# Size of the shuffle buffer passed to Dataset.shuffle further down.
BUFFER_SIZE = 50000
# Number of examples per padded batch.
BATCH_SIZE = 64
# Number of encoded examples set aside (via Dataset.take) as the held-out split.
TAKE_SIZE = 5000

In [23]:
# Load the cleaned Davidson dataset.
df = pd.read_csv('datasets/clean/davidson.csv')

# Distinct values of the 'class' column, paired with consecutive integer ids.
uniques = df['class'].unique()
indices = np.arange(len(uniques))

# Label -> id (used for training) and id -> label (used to decode predictions).
forward_map = {label: idx for label, idx in zip(uniques, indices)}
reverse_map = {idx: label for idx, label in zip(indices, uniques)}

# Replace every label in the 'class' column by its integer id.
df['class'] = df['class'].map(forward_map)

In [26]:
import re
# Capture groups, in order: an optional run of "RT" markers, an optional leading
# "@mention ... :" prefix, and finally the remaining text of the first line with
# any surrounding double quote removed.
# Written as a raw string so the regex escapes (\s, \@, \:, \n, ...) reach the
# regex engine untouched instead of being parsed as (invalid) string escapes.
# NOTE(review): the ".*" inside the mention prefix is greedy, so when a line
# starts with an @mention the prefix extends to the LAST ':' on that line.
pattern = re.compile(r'(?:\!*(\s*RT)*(?:\")?(\s*\@.*(?:\:)))?(?:\")?(.*?)(?:\")?(?:\n)')


def clean_text(df, text_field):
    """Strip the leading retweet / mention prefix from a text column.

    Each value of ``df[text_field]`` is matched against ``pattern`` and replaced
    by the last capture group, i.e. the text that follows the optional
    ``!!! RT @user:`` prefix. Only the first line of a value is kept, because
    the pattern stops at the first newline.

    Args:
        df: DataFrame holding the text column. It is modified in place.
        text_field: Name of the column to clean; its values must be strings.

    Returns:
        The same DataFrame object that was passed in, with the column replaced.
    """
    def strip_prefix(text):
        # The pattern must end on a newline, so one is appended to guarantee a
        # match even for single-line (or empty) input.
        return pattern.match(text + "\n").groups()[-1]

    df[text_field] = df[text_field].apply(strip_prefix)
    return df

# clean_text assigns the cleaned column back onto the frame it receives and
# returns that same frame, so `clean_df` and `df` refer to one object here.
clean_df = clean_text(df, 'tweet')
clean_df

Unnamed: 0,class,tweet
0,0,As a woman you shouldn't complain about clean...
1,1,boy dats cold...tyga dwn bad for cuffin dat h...
2,1,You ever fuck a bitch and she start to cry? Y...
3,1,@viva_based she look like a tranny
4,1,The shit you hear about me might be true or i...
...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,0,"you've gone and broke the wrong heart baby, an..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,1,youu got wild bitches tellin you lies


In [None]:
# Tokenizer used to collect the vocabulary.
# NOTE(review): newer tensorflow_datasets releases may expose the text helpers
# under tfds.deprecated.text — confirm against the installed version.
tokenizer = tfds.features.text.Tokenizer()

# Collect every distinct token of the tweet column. The tweets are read from
# `df` directly rather than from `all_labeled_data`: that dataset is only
# created in a later cell, so iterating it here raises NameError on a fresh
# top-to-bottom run. Both hold the same strings, so the token set is the same.
vocabulary_set = set()
for tweet in df['tweet']:
    vocabulary_set.update(tokenizer.tokenize(tweet))

vocab_size = len(vocabulary_set)
print("Vocab Size", vocab_size)

In [None]:
def to_dataset(input_df):
    """Build a tf.data.Dataset of (tweet, class) pairs from a DataFrame.

    Reads the 'tweet' and 'class' columns of ``input_df``; the tweets are cast
    to tf.string and the classes to tf.int64 before being sliced row by row.
    """
    tweets = tf.cast(input_df['tweet'].values, tf.string)
    labels = tf.cast(input_df['class'].values, tf.int64)
    return tf.data.Dataset.from_tensor_slices((tweets, labels))

    

# Shuffle once and keep that order on every pass (reshuffle_each_iteration=False),
# so the skip/take split made later sees the same ordering each time it is iterated.
all_labeled_data = to_dataset(df).shuffle(
    BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

In [5]:
# Word-level encoder built from the token set collected in `vocabulary_set`.
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)
# Disabled alternative, kept for reference: a subword encoder built from the tokenized tweets.
# vocab_size = 100000
# encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus((token for elem in df['tweet'] for token in tokenizer.tokenize(elem)), target_vocab_size=vocab_size)

In [6]:
def encode(text_tensor, label):
    """Encode one example's text with the module-level ``encoder``.

    Must run eagerly, since it calls ``.numpy()`` on the text tensor. The label
    is passed through unchanged.
    """
    return encoder.encode(text_tensor.numpy()), label

def encode_map_fn(text, label):
    """Dataset.map wrapper that runs ``encode`` through ``tf.py_function``.

    ``tf.py_function`` returns tensors without static shape information, which
    downstream batching and Keras layers may need. The shapes are therefore
    restored explicitly: the encoded text is a variable-length vector and the
    label is a scalar.

    Args:
        text: Scalar string tensor holding one tweet.
        label: Scalar int64 tensor holding the class id.

    Returns:
        A ``(encoded_text, label)`` tuple of int64 tensors.
    """
    encoded_text, label = tf.py_function(
        encode, inp=[text, label], Tout=(tf.int64, tf.int64)
    )
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label

# Encode every (text, label) pair of the shuffled dataset.
all_encoded_data = all_labeled_data.map(encode_map_fn)

In [7]:
# Training split: everything after the first TAKE_SIZE examples, shuffled again,
# then grouped into batches padded to the longest sequence in each batch.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=([-1], []))
)

# Held-out split: the first TAKE_SIZE examples, batched and padded the same way.
test_data = (
    all_encoded_data
    .take(TAKE_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=([-1], []))
)

In [8]:
# Take the embedding size from the encoder rather than incrementing the token
# count. TokenTextEncoder reserves id 0 for padding and a further id for
# out-of-vocabulary tokens, so `len(vocabulary_set) + 1` is one short as soon
# as an unseen word is encoded (e.g. at prediction time). A plain assignment
# also keeps this cell idempotent; `+= 1` grew the value on every re-run.
vocab_size = encoder.vocab_size

In [9]:
# Embedding -> bidirectional LSTM -> two ReLU dense layers -> softmax over the labels.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(vocab_size, 64),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        *[tf.keras.layers.Dense(units, activation='relu') for units in [64, 64]],
        # Output layer. The first argument is the number of labels.
        tf.keras.layers.Dense(len(uniques), activation='softmax'),
    ]
)

In [10]:
# Labels are integer class ids and the output layer is a softmax, hence the
# sparse categorical cross-entropy loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [11]:
# Train for three epochs, passing the held-out split as validation data.
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x145d3a850>

In [12]:
# Loss and accuracy on the held-out split (the same data passed as validation_data to fit).
eval_loss, eval_acc = model.evaluate(test_data)



In [13]:
def predict(input_str):
    """Classify a single raw string and return its label name.

    The string is encoded with the module-level ``encoder``, run through
    ``model`` as a batch of one, and the highest-scoring class id is mapped
    back to its original label through ``reverse_map``.

    NOTE(review): the training text went through ``clean_text``; the input
    here is encoded as given.

    Args:
        input_str: Text to classify.

    Returns:
        The label (as found in the original 'class' column) with the highest
        predicted probability.
    """
    encoded_sample = encoder.encode(input_str)
    # Explicit (1, sequence_length) integer batch, so Keras cannot mistake the
    # nested Python list for a list of separate model inputs.
    batch = np.array([encoded_sample], dtype=np.int64)
    pred = model.predict(batch)
    return reverse_map[np.argmax(pred.squeeze())]

In [17]:
# Quick manual check of the classifier on a hand-written example.
predict("You bitch")

'offensive_language'