# IMPORTS

In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import regularizers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# !pip install -q git+https://github.com/tensorflow/docs

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

import os

import nltk
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize   
from nltk.corpus import stopwords

import re
import emoji
import string

print(tf.__version__)

2.3.1


# CONSTANTS

In [2]:
def train_test_val_paths(path):
    # Get directory and filename
    dirname = os.path.dirname(path)
    basename = os.path.basename(path)
    
    # Create path for each dataset
    train_path = os.path.join(dirname, 'train_' + basename)
    test_path = os.path.join(dirname, 'test_' + basename)
    val_path = os.path.join(dirname, 'val_' + basename)
    
    return train_path, test_path, val_path

In [16]:
BATCH_SIZE= 1000
TEST_SIZE = 0.15 
VAL_SIZE= 0.15

trump_path = './input/hashtag_donaldtrump.csv'
biden_path = './input/hashtag_joebiden.csv'

train_trump, test_trump, val_trump = train_test_val_paths(trump_path)
train_biden, test_biden, val_biden = train_test_val_paths(biden_path)

# TRAIN - TEST SPLIT

In [27]:
def get_tweets_split(path):
    
    train_path, test_path, val_path = train_test_val_paths(path)
    
    # Create an empty csv file to append chunks
    header_df = pd.DataFrame(columns=['tweet'])
    header_df.to_csv(train_path, index=False)
    header_df.to_csv(test_path, index=False)
    header_df.to_csv(val_path, index=False)
    
    # Define the probabilities to select each path
    TRAIN_SIZE = 1 - TEST_SIZE - VAL_SIZE
    probabilities = [TRAIN_SIZE, TEST_SIZE, VAL_SIZE]
    paths = [train_path, test_path, val_path]
    
    # Create the DataFrame Reader
    df = pd.read_csv(path, 
                     lineterminator='\n', 
                     chunksize=BATCH_SIZE, 
                     usecols=['tweet'])

    # Split the dataset
    for chunk in df:
        path = np.random.choice(paths, p=probabilities)
        chunk.to_csv(path, index=False, mode='a')


In [28]:
get_tweets_split(path=trump_path)
get_tweets_split(path=biden_path)

# INPUT PIPELINE

In [29]:
def custom_standardization(text):
    # Lower case
    lower_text = tf.strings.lower(text)

    # Remove new lines
    lower_text = tf.strings.regex_replace(input=lower_text, 
                                          pattern='\n', 
                                          rewrite=' ')

    # Remove URLs
    free_url_text = tf.strings.regex_replace(input=lower_text, 
                                             pattern="http\S+", 
                                             rewrite=' ')

    # Remove emojis
    emoji_pattern = emoji.get_emoji_regexp().pattern
    emoji_pattern = emoji_pattern.replace('#','') # There is a # emoji
    free_emoji_text = tf.strings.regex_replace(input=free_url_text, 
                                               pattern='[%s]' % re.escape(emoji_pattern),
                                               rewrite=' ')

    # Remove punctuation
    # punctuation_pattern = string.punctuation.replace('#', '').replace('@','').replace("\'", '')
    punctuation_pattern = string.punctuation
    free_punctuation_text =  tf.strings.regex_replace(free_emoji_text,
                                                      '[%s]' % re.escape(punctuation_pattern),
                                                      ' ')
    return free_punctuation_text


In [30]:
#
# Original function in: https://www.kaggle.com/gatandubuc/donald-trump-vs-joe-biden
#
def is_english(text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}
    
    @param text: Text whose language want to be detected
    @type text: str
    
    @return: Dictionary with languages and unique stopwords seen in analyzed text
    @rtype: dict
    """

    languages_ratios = {}

    '''
    nltk.wordpunct_tokenize() splits all punctuations into separate tokens
    
    >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
    ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', 'be', 'there', 'in', 'ten', '.']
    '''
    
    tokens = wordpunct_tokenize(text.numpy().decode())
    words = [word.lower() for word in tokens]

    # Compute per language included in nltk number of unique stopwords appearing in analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) # language "score"
    
    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    
    if most_rated_language != 'english':
        text = 'EMPTY'
    
    return text    
    # return most_rated_language == 'english'

In [31]:
def tf_is_english(text):
    py_function = tf.py_function(is_english, [text], text.dtype)
    py_function.set_shape(text.shape)
    return py_function

def filter_by_EMPTY(text):
    return not tf.math.equal(text, 'EMPTY')

In [32]:
def create_dataset(path):
    
    # Create the dataset
    ds = tf.data.experimental.make_csv_dataset(path, 
                                              batch_size=BATCH_SIZE, 
                                              select_columns=['tweet'],
                                              num_epochs=1)
    # Get the data in the tweet column
    ds = ds.map(lambda x: x['tweet'])
    
    # Standarize the input (lower case, remove URL, remove emoticons...)
    ds = ds.map(custom_standardization)
    
    # Filter by language
    # Step 1: change non english text by the word EMPTY
    # This step must be unbatched because is_english 
    # doesn't work with batches, only single rows (text)
    ds = ds.unbatch().map(tf_is_english)
    
    # Step 2: remove all ocurrences of the word EMPTY
    ds = ds.filter(lambda x: not tf.math.equal(x, 'EMPTY'))
    
    # Batch the output
    ds = ds.batch(BATCH_SIZE)
    
    return ds

In [48]:
def add_class_column(dataset, class_value):
    return dataset.map(lambda x: (x, tf.repeat(class_value, tf.size(x))))

In [49]:
def input_pipeline(trump_path, biden_path, class_atributte=True):
    
    trump_ds = create_dataset(trump_path)
    biden_ds = create_dataset(biden_path)
    
    # Add class atributte
    if class_atributte:
        trump_ds = add_class_column(trump_ds, 0)
        biden_ds = add_class_column(biden_ds, 1)
    
    datasets = [
        trump_ds.unbatch(),
        biden_ds.unbatch()
    ]
    
    # Equally merge datasets
    trump_biden_ds = tf.data.experimental.sample_from_datasets(datasets=datasets, weights=[0.5, 0.5], seed=42)
        
    # Batch the dataset
    trump_biden_ds = trump_biden_ds.batch(BATCH_SIZE)
    
    return trump_biden_ds
    

In [68]:
train_ds = input_pipeline(train_trump, train_biden, class_atributte=True)
val_ds = input_pipeline(val_trump, val_biden, class_atributte=True)
test_ds = input_pipeline(test_trump, test_biden, class_atributte=False)

In [40]:
features, labels = next(iter(train_ds))
print(features[:2])
print(labels[:2])

tf.Tensor(
[b'i bet if you could get  joebiden to open up around a beer or two he would say he is owed the right to benefit his family thru  govt because of all his  public service  he gave up his life to help us so he can help them   swampthing'
 b' joebiden calls him a lier and then insults him for being overweight   biden shows his low iq daily   maybe joe should hold town halls in kindergarten   he\xe2\x80\x99ll be amongst equals and no one will say anything when he takes his afternoon nap  '], shape=(2,), dtype=string)
tf.Tensor([1 1], shape=(2,), dtype=int32)


In [None]:
plt.hist(list(map(len,features.numpy())))

# MODEL

## Vectorize layer

### Define the layer

In [52]:
MAX_FEATURES = 10000
SEQUENCE_LENGTH = 260

vectorize_layer = TextVectorization(
    standardize=lambda x: x,
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH
)

### Adapt the layer

In [74]:
train_text = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

### Apply the layer

In [54]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [76]:
train_ds = train_ds.map(vectorize_text)
val_ds = val_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

### Configure datasets for performance

In [78]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

## Create the model

In [84]:
EMBEDDING_DIM = 16

In [85]:
model = tf.keras.Sequential([
  layers.Embedding(MAX_FEATURES + 1, EMBEDDING_DIM),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1)])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout (Dropout)            (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


### Loss function and optimizer

In [88]:
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

### Train the model

In [None]:
def get_callbacks():
    return [
#         tfdocs.modeling.EpochDots(report_every=100),
        tf.keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', patience=3)
      ]

In [90]:
epochs = 5
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Save the model

In [91]:
model.save('./saved_model/my_model')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ./saved_model/my_model\assets


## Restore the model

In [92]:
new_model = tf.keras.models.load_model('saved_model/my_model')

## Evaluate the model

In [94]:
new_model.evaluate(val_ds)



[0.6930584907531738, 0.49561402201652527]

In [98]:
plt.figure(figsize=(12,6))
plotter = tfdocs.plots.HistoryPlotter(metric = 'binary_accuracy', smoothing_std=10)
plotter.plot(history)

AttributeError: 'History' object has no attribute 'items'

<Figure size 864x432 with 0 Axes>