In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('datasets/clean/davidson.csv')
forward_map = dict(zip(df['class'].unique(), np.arange(3)))
reverse_map = dict(zip(np.arange(3), df['class'].unique()))
df['class'] = df['class'].map(forward_map)
# df['class'] = (df['class'] > 0).astype(int)
df.head()

Unnamed: 0,class,tweet
0,0,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
df['class'].value_counts()

1    19190
0     4163
2     1430
Name: class, dtype: int64

In [4]:
def to_dataset(input_df):
    return tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(input_df['tweet'].values, tf.string),
            tf.cast(input_df['class'].values, tf.int32)
        )
    )
    

In [5]:

train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['class'])
train_df, validation_df = train_test_split(train_df, test_size=0.1, stratify=train_df['class'])

In [6]:
train_data, validation_data, test_data = to_dataset(train_df), to_dataset(validation_df), to_dataset(test_df)

In [7]:
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)

In [8]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(.5))
model.add(tf.keras.layers.Dense(3, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 20)                400020    
_________________________________________________________________
dense (Dense)                (None, 16)                336       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 51        
Total params: 400,407
Trainable params: 400,407
Non-trainable params: 0
_________________________________________________________________


In [9]:
model.compile(optimizer='adam',
              learning_rate=1e-5,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [10]:
history = model.fit(train_data.shuffle(10000).batch(512),
                    epochs=20,
                    validation_data=validation_data.batch(512),
                    verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [11]:
results = model.evaluate(test_data.batch(512), verbose=2)
for name, value in zip(model.metrics_names, results):
    print("%s: %.3f" % (name, value))

10/10 - 0s - loss: 82.1442 - accuracy: 4.0347e-04
loss: 82.144
accuracy: 0.000


In [12]:
pred = model.predict(test_data.batch(512))

In [13]:
unique, counts = np.unique(np.round(pred), return_counts=True)

In [14]:
def predict(input_str):
    sample = tf.data.Dataset.from_tensor_slices((tf.cast([input_str], tf.string)))
    pred = model.predict(sample.batch(1))
    print(pred)
    return reverse_map[int(np.argmax(pred.squeeze()))]

In [15]:
predict("Asshole!")

[[5.5353720e-02 2.1554046e-10 9.4464624e-01]]


'hate_speech'