## twitter_nn

In [3]:
import os

base = '/media/steven/big_boi/twitter.ai'
data_dir = os.path.join(base, 'data')
log_dir = os.path.join(base, 'logs')

os.chdir(base)
print('set directory.')

set directory.


In [4]:
import re
import logging
import logging.handlers
import pprint
import json
import math
import keras
import pickle
import datetime
import pandas as pd 
import numpy as np 
import sklearn as sk
import google.auth
import tensorflow as tf
from google.oauth2 import service_account
from google.cloud import bigquery
from google.cloud import bigquery_storage
from twitter_bq_upload import bq_read_table

pp = pprint.PrettyPrinter(indent = 1)

#init bq client
creds_fname = '/media/steven/big_boi/creds_google.json'
client = bigquery.Client.from_service_account_json(creds_fname)
bqstorageclient = bigquery_storage.BigQueryStorageClient.from_service_account_json(creds_fname)
logging.info('initialized bigquery client.')

tf_dev = 'using tf with dev: {}.'.format(tf.config.list_physical_devices('GPU'))
logging.info(tf_dev)
print(tf_dev)
print('imported modules successfully.')

Using TensorFlow backend.


imported modules successfully.
using tf with dev: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')].
imported modules successfully.


### read data from bigquery table

In [5]:
data = bq_read_table()
tweets = list(data.text)
np.random.shuffle(tweets)

### remove bad characters 
(\n, \\, RT, https://..., http://, amp;, )

In [6]:
def clean_tweets(tweet_list):
    clean_tweet_list = []
    bad_chars = ['\n', '\\', 'amp;', 'RT ']
    for t in tweet_list:
        for b in bad_chars:
            t = t.replace(b, '')
        t = re.sub(r"http\S+", '', t)
        clean_tweet_list.append(t)
    output = 'cleaned {} tweets.'.format(len(clean_tweet_list))
    print(output)
    #logging.info(output)
    return(clean_tweet_list)

In [7]:
clean_tweet = clean_tweets(tweets)

cleaned 15288 tweets.


In [8]:
with open('clean_tweets.pkl', 'wb') as f:
    pickle.dump(clean_tweet, f)

In [9]:
avg_tweet_len = math.ceil(np.mean([len(c.split(' ')) for c in clean_tweet]))
corp = " ".join(clean_tweet)
words = corp.split(' ')

### turn tweets into sequences

In [10]:
def tweet_sequences(tokens, size):
    length = size + 1
    sequences = list()
    for i in range(length, len(tokens)):
        seq = tokens[i-length:i]
        line = ' '.join(seq)
        sequences.append(line)

    return(sequences)

In [11]:
tweet_seq = tweet_sequences(words, avg_tweet_len)
output = 'turned {} tweets into {} sequences.'.format(len(clean_tweet), len(tweet_seq))
print(output)
#logging.info(output)

turned 15288 tweets into 165957 sequences.


### tokenize and split into X and y
Tokenizing turns the words into numerical indices. 
y is the last word, X is the rest of the tweet.

In [195]:
#tokenizer = nltk.tokenize.TweetTokenizer()

In [12]:
def tokenize_xy(sequence):
    tokenizer = keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(sequence)
    sequences = tokenizer.texts_to_sequences(sequence)
    padded = keras.preprocessing.sequence.pad_sequences(sequences)
    vocab_size = len(tokenizer.word_index) + 1
    
    pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))
    print('saved tokenizer')
 
    X, y = padded[:,:-1], padded[:,-1]
    y = keras.utils.to_categorical(y, num_classes=vocab_size)
    seq_length = X.shape[1]
    print('X has shape: {}'.format(X.shape))
    print('y has shape: {}'.format(y.shape))
    #logging.info('X has shape: {}'.format(X.shape))
    #logging.info('y has shape: {}'.format(y.shape))
    return(X, y, seq_length, vocab_size)

In [13]:
X, y, s, v = tokenize_xy(tweet_seq)

saved tokenizer
X has shape: (165957, 29)
y has shape: (165957, 17629)


## Recurrent Neural Network (RNN)

Here we will create the model architecture for our RNN. Inspiration comes from this blog post: https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/. TF docs also have a text generation example but I'm unsure if it has been ported to Tensorflow 2.0 (which simplifies the library A LOT). Note that text generation can be done on the *character* or the *word* level. The linked example does it on the word level, which is what we want here.

In [14]:
def fit_model(X, y, seq_length, vocab_size):
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(vocab_size, X.shape[1], input_length=seq_length))
    model.add(keras.layers.LSTM(100, return_sequences=True))
    model.add(keras.layers.LSTM(100))
    model.add(keras.layers.Dense(100, activation='relu'))
    model.add(keras.layers.Dense(vocab_size, activation='softmax'))
    print(model.summary())
    #logging.info(model.summary())

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(X, y, batch_size=128, epochs=250)
 

    model.save('model.h5')

    print('saved model.')
    #logging.info('saved model.')

In [None]:
fit_model(X, y, s, v)