# Execute all cells sequentially on each day's data

In [None]:
import tensorflow as tf
# Ask TensorFlow to grow GPU memory on demand. The setting is applied to every
# visible GPU (TensorFlow requires it to be the same on all of them), and the
# loop is simply skipped on a machine without a GPU instead of failing with
# an IndexError on physical_devices[0].
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [None]:
def get_masks(tokens, max_seq_length):
    """Build the padding mask for a token sequence.

    Returns a list of length ``max_seq_length`` holding 1 for every real
    token followed by 0 for every padding position.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    token_count = len(tokens)
    pad_count = max_seq_length - token_count
    if pad_count < 0:
        raise IndexError("Token length more than max seq length!")
    mask = [1] * token_count
    mask.extend([0] * pad_count)
    return mask


def get_segments(tokens, max_seq_length):
    """Build the segment ids for a token sequence.

    Every token up to and including the first "[SEP]" gets id 0, every token
    after it gets id 1, and the result is right-padded with 0 up to
    ``max_seq_length``.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    token_count = len(tokens)
    if token_count > max_seq_length:
        raise IndexError("Token length more than max seq length!")

    # Number of leading tokens in segment 0: everything through the first
    # "[SEP]", or the whole sequence when there is no separator.
    first_segment_len = next(
        (pos + 1 for pos, tok in enumerate(tokens) if tok == "[SEP]"),
        token_count,
    )
    second_segment_len = token_count - first_segment_len
    pad_count = max_seq_length - token_count
    return [0] * first_segment_len + [1] * second_segment_len + [0] * pad_count


def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids, right-padded with 0.

    Args:
        tokens: sequence of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: length of the returned list.

    Returns:
        A list of exactly ``max_seq_length`` ids.

    Raises:
        IndexError: if there are more ids than ``max_seq_length``. This
            matches get_masks/get_segments; without the check a negative pad
            count would silently return an over-long list.
    """
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    if len(token_ids) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return token_ids + [0] * (max_seq_length - len(token_ids))

In [None]:
#modules for bert
import tensorflow_hub as hub
import tensorflow as tf
from bert.tokenization import FullTokenizer     # Still from bert module
from tensorflow.keras.models import Model       # Keras is the new high level API for TensorFlow
import math
    
import pandas as pd
import numpy as np
    
#     physical_devices = tf.config.list_physical_devices('GPU') 
#     tf.config.experimental.set_memory_growth(physical_devices[0], True)
    
# Fixed length every token sequence is padded to before it is fed to the model.
# NOTE(review): 280 presumably mirrors the tweet character limit - confirm.
max_seq_length = 280

# Three int32 Keras inputs, each of length max_seq_length. They are passed to
# the BERT layer below in the order [word ids, mask, segment ids].
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,),dtype=tf.int32,name="input_word_ids")

input_mask = tf.keras.layers.Input(shape=(max_seq_length,),dtype=tf.int32,name="input_mask")

segment_ids = tf.keras.layers.Input(shape=(max_seq_length,),dtype=tf.int32,name="segment_ids")

# Load the TF-Hub BERT module named in the URL as a Keras layer.
# NOTE(review): the model is only used through predict() in this file, so
# trainable=True looks unnecessary - confirm before changing.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",trainable=True)

# The layer returns two outputs, unpacked here as (pooled, per-token sequence).
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

# Wrap the inputs and both BERT outputs in a Keras Model so predict() can be called.
model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_output, sequence_output])

# Expose tf.io.gfile under the name tf.gfile.
# NOTE(review): presumably needed because bert.tokenization still refers to
# tf.gfile - confirm.
tf.gfile = tf.io.gfile

# Get the vocab file and lower-casing flag from the loaded hub module and
# build the tokenizer from them.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

#function to produce embeddings
def get_bert_pool(input_ids, input_masks, input_segments):
    """Return the pooled BERT output for one encoded sequence.

    Each argument is wrapped in a one-element list before being handed to
    ``model.predict``. Only the first of the model's two outputs (the pooled
    output) is returned; the per-token sequence output is discarded.
    """
    single_item_inputs = [[input_ids], [input_masks], [input_segments]]
    pooled, _per_token = model.predict(single_item_inputs)
    return pooled

In [None]:
#function to extract BERT embeddings on a given batch of data
#batching is used to avoid problems due to limited GPU RAM
def extract_bert_embed(df):
    """Compute the pooled BERT embedding of every tweet in ``df``.

    Args:
        df: DataFrame with a ``text`` column holding the tweet strings.

    Returns:
        DataFrame with one row per row of ``df`` (same index) and one column
        per component of the pooled embedding. An empty ``df`` yields an
        empty DataFrame.

    Raises:
        IndexError: if a tweet tokenizes to more than ``max_seq_length``
            tokens (raised by the encoding helpers).
    """
    # Nothing to embed: return early so callers can still concatenate results.
    if len(df) == 0:
        return pd.DataFrame()

    # Tokenize the tweets.
    token_lists = [tokenizer.tokenize(text) for text in df.text]

    # Encode every row before running the model, so an over-long tweet fails
    # before any prediction work is done.
    ids_per_row = [
        np.array(get_ids(tokens, tokenizer, max_seq_length))
        for tokens in token_lists
    ]
    masks_per_row = [
        np.array(get_masks(tokens, max_seq_length)) for tokens in token_lists
    ]
    segments_per_row = [
        np.array(get_segments(tokens, max_seq_length)) for tokens in token_lists
    ]

    # get_bert_pool returns a nested [[...]] result per tweet; [0] strips the
    # outer level so each row is a flat embedding vector.
    # NOTE: the model is still invoked once per tweet, as before.
    pooled_rows = [
        get_bert_pool(ids, masks, segments)[0]
        for ids, masks, segments in zip(ids_per_row, masks_per_row, segments_per_row)
    ]

    # Stack the vectors into one (rows x embedding size) frame. Building it
    # directly avoids the join/dict round trip, which mishandled duplicate
    # index labels.
    return pd.DataFrame(np.vstack(pooled_rows), index=df.index)

In [None]:
# Print every device TensorFlow reports, to check whether a GPU is listed.
from tensorflow.python.client import device_lib

local_devices = device_lib.list_local_devices()
print(local_devices)

In [None]:
import datetime as dt

# Today's date; its string form is the stamp in the input and output file names.
today = dt.date.today()

# Directories: raw tweet CSVs are read from data_path, embeddings go to embed_path.
data_path = 'F:/twitter_data/twitter_users_data/'
embed_path = 'F:/twitter_data/twitter_users_data/bert_emb/'

# File names for today's tweets and today's embeddings.
tweets_file = 'tweet_' + str(today) + '.csv'
embed_file = 'bert_embed_' + str(today) + '.csv'

# df holds the tweets gathered today.
df = pd.read_csv(data_path + tweets_file)
# input_df keeps only the tweet text, as a one-column DataFrame.
input_df = df['text'].to_frame()

In [None]:
#Function to extract BERT embeddings in batches
def batchwise_bert_embed_extractor(input_df,batch_size=200):
    """Embed ``input_df`` batch by batch and export the result as a CSV.

    Args:
        input_df: tweets to embed; each batch is handed to
            ``extract_bert_embed`` as a DataFrame.
        batch_size: number of rows per batch (must be >= 1).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (it would otherwise
            loop forever).

    The concatenated embeddings are written to ``embed_path + embed_file``
    without the index. An empty ``input_df`` writes an empty frame.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # One embeddings frame per batch.
    pool = []

    # range() stops before len(input_df), so no trailing empty batch is
    # produced when the row count is an exact multiple of batch_size.
    for start in range(0, len(input_df), batch_size):
        # input batch for BERT embedding extraction
        pool_input_df = pd.DataFrame(input_df.iloc[start:start + batch_size])
        pool.append(extract_bert_embed(pool_input_df))

    # pd.concat raises on an empty list, so fall back to an empty frame.
    if pool:
        pool_frame = pd.concat(pool, axis=0, ignore_index=True)
    else:
        pool_frame = pd.DataFrame()

    # exporting pool_frame to file location as csv
    pool_frame.to_csv(embed_path + embed_file, index=False)

# Run the batched extraction on today's tweets; this writes the embeddings CSV.
batchwise_bert_embed_extractor(input_df)