In [509]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import time
import pickle
import random
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import tensorflow as tf
#import tensorflow_recommenders as tfrs
from tensorflow.keras import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, Reshape
from transformers import AutoTokenizer, AutoModel, LlamaTokenizer, LlamaForCausalLM
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
import tensorflow_recommenders as tfrs
from transformers import BertTokenizer
import transformers

In [510]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [511]:
model = transformers.LlamaForCausalLM.from_pretrained("bert-base-cased")

You are using a model of type bert to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['embed_tokens.weight', 'layers.0.input_layernorm.weight', 'layers.0.mlp.down_proj.weight', 'layers.0.mlp.gate_proj.weight', 'layers.0.mlp.up_proj.weight', 'layers.0.post_attention_layernorm.weight', 'layers.0.self_attn.k_proj.weight', 'layers.0.self_attn.o_proj.weight', 'layers.0.self_attn.q_proj.weight', 'layers.0.self_attn.v_proj.weight', 'layers.1.input_layernorm.weight', 'layers.1.mlp.down_proj.weight', 'layers.1.mlp.gate_proj.weight', 'layers.1.mlp.up_proj.weight', 'layers.1.post_attention_layernorm.weight', 'layers.1.self_attn.k_proj.weight', 'layers.1.self_attn.o_proj.weight', 'layers.1.self_attn.q_proj.weight', 'layers.1.self_attn.v_proj.weight', 'layers.10.input_layernorm.weight', 'layers.10.mlp.down_proj

In [512]:
df = pd.read_csv('cleaned_tweets_biden.csv')
df['binarized'] = np.zeros(df.shape[0])

In [513]:
question_list = df[["cleaned_tweets", "binarized"]].drop_duplicates()

In [514]:
question_list_full = question_list.copy()

In [515]:
question_list = question_list_full[:1000]

In [516]:
embeddings = []
for question in tqdm(question_list['cleaned_tweets'][:1000]):
    prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
    prompt += f"### Instruction:{question}\n\n### Response:"
    input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    outputs = model(input_ids, output_hidden_states=True)
    embeddings.append(outputs.hidden_states[-1][:, -1, :].detach().numpy())

  0%|          | 0/1000 [00:00<?, ?it/s]

In [517]:
model_name = "bert-base-cased"
pickle.dump(embeddings, open(model_name+'.pkl', 'wb'))

In [518]:

embeddings_fixed = np.array(embeddings).reshape((1000, 768))

In [519]:
df = df[:1000]

In [520]:
df["embedding_col"] = list(embeddings_fixed)

In [521]:
train_data, val_data = train_test_split(df, test_size=0.1)
train_data.to_parquet('train_data.parquet')
val_data.to_parquet('val_data.parquet')

In [522]:
def get_model(dim=50):
    weights = np.vstack(pickle.load(open('bert-base-cased.pkl', 'rb')))
    embedding_col = Input(name='embedding_col', shape=(1, ))
    question_embedding = Embedding(weights.shape[0], weights.shape[1], weights=[weights], name='question_embedding')(embedding_col)
    question_embedding.trainable = False
    x2 = Dense(50, name=f'question_embedding2')(question_embedding)
    
    x = [x2]
    x = tf.concat(x, axis=1, name='concat1')
    
    for i in range(1):
        x = tfrs.layers.dcn.Cross(projection_dim=dim*3, kernel_initializer="glorot_uniform", name=f'cross_layer_{i}')(x)
        x = Dropout(0.2)(x)
    
    for i in range(1):
        x = Dense(dim*3, activation="relu", name=f'dense_layer_{i}')(x)
        x = Dropout(0.2)(x)
    
    out = Dense(1, activation='sigmoid', name="out")(x)
    inputs = {'embedding_col': embedding_col}
    model = Model(inputs=inputs, outputs=out)
    
    return model



In [523]:
def train(train, val):
    l = list(train['embedding_col'])
    lowest_val = 4.3502
    l = [i + lowest_val for i in l]
    l2 = list(np.zeros((900, 768)))
    tensor = tf.convert_to_tensor(l)
    tensor2 = tf.convert_to_tensor(l2)
    model = get_model()

    train_label = np.array(train['binarized'])
    val_features = {
        'embedding_col': np.array(val['embedding_col'])
    }
    val_label = np.array(val['binarized'])

    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=2e-05,
        decay_steps=80000,
        decay_rate=0.96,
        staircase=True
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.AUC(), tf.keras.metrics.BinaryAccuracy()]
    )
    
    history = model.fit(
        tensor, 
        np.random.choice([0, 1], size=(900, 768), p=[0.7, 0.3]),
        batch_size=8,
        epochs=10,
        verbose=1,
        use_multiprocessing=True,
        workers=20
    )
    
    return model


In [524]:
train_data = pd.read_parquet('train_data.parquet')
val_data = pd.read_parquet('val_data.parquet')
trained_model = train(train_data, val_data)
trained_model.save_weights('missing_imputation.h5')



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [526]:
lowest_val = 4.3502
input = [i + lowest_val for i in embeddings_fixed]

In [527]:
prediction_model = get_model()
prediction_model.load_weights('missing_imputation.h5')
obs =  np.random.choice([0, 1], size=(900, 768), p=[0.7, 0.3])
pred = prediction_model.predict(input[0])
print(pred)

[[[0.3085397 ]]

 [[0.3085397 ]]

 [[0.31273463]]

 [[0.3085397 ]]

 [[0.3097434 ]]

 [[0.31273463]]

 [[0.31273463]]

 [[0.3085397 ]]

 [[0.31273463]]

 [[0.3097434 ]]

 [[0.32185972]]

 [[0.3085397 ]]

 [[0.32185972]]

 [[0.31273463]]

 [[0.31273463]]

 [[0.3085397 ]]

 [[0.32185972]]

 [[0.3085397 ]]

 [[0.3085397 ]]

 [[0.31273463]]

 [[0.31273463]]

 [[0.31629544]]

 [[0.32185972]]

 [[0.3085397 ]]

 [[0.32185972]]

 [[0.3097434 ]]

 [[0.31273463]]

 [[0.31273463]]

 [[0.31273463]]

 [[0.3085397 ]]

 [[0.31273463]]

 [[0.3085397 ]]

 [[0.3097434 ]]

 [[0.31629544]]

 [[0.3097434 ]]

 [[0.31273463]]

 [[0.3085397 ]]

 [[0.3097434 ]]

 [[0.3085397 ]]

 [[0.31629544]]

 [[0.3085397 ]]

 [[0.31273463]]

 [[0.31273463]]

 [[0.3085397 ]]

 [[0.3085397 ]]

 [[0.31273463]]

 [[0.31273463]]

 [[0.3097434 ]]

 [[0.31273463]]

 [[0.3097434 ]]

 [[0.3097434 ]]

 [[0.31273463]]

 [[0.3085397 ]]

 [[0.31273463]]

 [[0.3097434 ]]

 [[0.3085397 ]]

 [[0.3085397 ]]

 [[0.3085397 ]]

 [[0.32185972]