In [2]:
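# Dependency: tf-keras (version 2.17.0 was installed for this run).
# To install it into the kernel's environment, run: %pip install tf-keras==2.17.0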

Collecting tf-keras
  Downloading tf_keras-2.17.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.17.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
    --------------------------------------- 0.0/1.7 MB 495.5 kB/s eta 0:00:04
   ------- -------------------------------- 0.3/1.7 MB 3.0 MB/s eta 0:00:01
   ---------------------------------------  1.7/1.7 MB 12.2 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 11.0 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.17.0
Note: you may need to restart the kernel to use updated packages.


In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
import tensorflow as tf
import streamlit as st

In [28]:
# Load and preprocess the message-thread data
@st.cache_data
def load_data(csv_path='Resources/agency_mvp_text_threads.csv'):
    """Load the text-thread CSV and build one training string per message.

    Steps:
      1. Read ``csv_path`` and keep only ``sender_type``, ``receiver_type``
         and ``body``.
      2. Drop rows whose ``body`` is missing.
      3. Label-encode sender and receiver types (missing values are treated
         as the category ``'Unknown'``) into ``sender_type_encoded`` and
         ``receiver_type_encoded``.
      4. Build ``combined_text`` as
         ``"Sender: <code> Receiver: <code> Message: <body>"``.

    Args:
        csv_path: Location of the CSV file. Defaults to the path the
            notebook has always used, so ``load_data()`` behaves as before.

    Returns:
        pandas.DataFrame with the three original columns plus
        ``sender_type_encoded``, ``receiver_type_encoded`` and
        ``combined_text``.

    Raises:
        KeyError: if one of the three required columns is absent from the CSV.

    Note:
        The fitted encoders are not returned, so the integer -> label mapping
        can only be recovered by refitting on the same data.
    """
    text_df = pd.read_csv(csv_path)

    # Select once, drop unusable rows, and take an explicit copy so the column
    # assignments below never write into a view of ``text_df``.
    cleaned_text_df = (
        text_df[['sender_type', 'receiver_type', 'body']]
        .dropna(subset=['body'])
        .copy()
    )

    def encode_labels(series):
        """Return integer codes for ``series``, mapping NaN to 'Unknown'."""
        return LabelEncoder().fit_transform(series.fillna('Unknown'))

    cleaned_text_df['sender_type_encoded'] = encode_labels(cleaned_text_df['sender_type'])
    cleaned_text_df['receiver_type_encoded'] = encode_labels(cleaned_text_df['receiver_type'])

    # Vectorized string assembly: produces the same text as the former
    # row-wise ``apply`` but also yields a proper (empty) column when no rows
    # survive the ``dropna`` above.
    cleaned_text_df['combined_text'] = (
        'Sender: ' + cleaned_text_df['sender_type_encoded'].astype(str)
        + ' Receiver: ' + cleaned_text_df['receiver_type_encoded'].astype(str)
        + ' Message: ' + cleaned_text_df['body'].astype(str)
    )
    return cleaned_text_df

cleaned_text_df = load_data()

2024-10-01 21:52:53.925 No runtime found, using MemoryCacheStorageManager
2024-10-01 21:52:53.930 No runtime found, using MemoryCacheStorageManager


In [8]:
# Hold out 10% of the combined texts for validation; the fixed seed keeps the split repeatable
train_texts, val_texts = train_test_split(
    cleaned_text_df['combined_text'].to_list(),
    test_size=0.1,
    random_state=42,
)

In [29]:
# Load the pretrained GPT-2 language model and its matching tokenizer
model = TFGPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Reuse the end-of-sequence token as the padding token so padded batches can be built
tokenizer.pad_token = tokenizer.eos_token

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [30]:
# Encode both splits with identical settings: truncate to at most 128 tokens,
# pad, and return TensorFlow tensors. Training texts are encoded first.
train_encodings, val_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, max_length=128, return_tensors='tf')
    for split_texts in (train_texts, val_texts)
)

In [31]:
# Convert tokenized data into TensorFlow datasets
def _build_lm_dataset(encodings):
    """Turn tokenizer output into a ``(features, labels)`` tf.data.Dataset.

    Args:
        encodings: Tokenizer output exposing ``'input_ids'`` and
            ``'attention_mask'`` tensors.

    Returns:
        tf.data.Dataset whose elements are
        ``({'input_ids': ..., 'attention_mask': ...}, labels)``.

    The labels are the input ids with every padding position (attention mask
    of 0) replaced by -100, the value the Transformers language-modeling loss
    ignores. Without this, the padding tokens added during tokenization would
    be treated as prediction targets and dominate the loss for short messages.
    """
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']

    # Match the dtype of the ids so tf.where does not see mixed integer types.
    ignore_index = tf.constant(-100, dtype=input_ids.dtype)
    labels = tf.where(attention_mask == 1, input_ids, ignore_index)

    return tf.data.Dataset.from_tensor_slices((
        dict(input_ids=input_ids, attention_mask=attention_mask),
        labels,
    ))


train_dataset = _build_lm_dataset(train_encodings)
val_dataset = _build_lm_dataset(val_encodings)


In [32]:
# Shuffle the training examples through a 1000-element buffer, then group both
# splits into batches of 4. The names are rebound in place, so this cell must
# run exactly once after the datasets are created.
train_dataset = train_dataset.shuffle(buffer_size=1000)
train_dataset = train_dataset.batch(batch_size=4)
val_dataset = val_dataset.batch(batch_size=4)

In [33]:
# Compile the model with the Adam optimizer and the model's own built-in loss.
#
# No `loss=` argument is passed on purpose. When a Transformers TF model is
# compiled without a loss, it computes its internal language-modeling loss from
# the labels supplied by the dataset; that internal path shifts the labels by
# one position (next-token prediction) and ignores label values of -100.
# Passing `loss=model.compute_loss` bypasses that path: the loss is then applied
# to unshifted labels, and on TensorFlow >= 2.8 `compute_loss` resolves to the
# Keras method of the same name rather than the Transformers loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer)

In [34]:
# Fine-tune for three epochs, but only when the "Train Model" button is pressed
train_requested = st.button("Train Model")
if train_requested:
    model.fit(train_dataset, validation_data=val_dataset, epochs=3)
    st.success("Model training complete!")

# Prediction function
def generate_text(prompt, max_length=50):
    """Generate a continuation of ``prompt`` with the module-level GPT-2 model.

    Args:
        prompt: Text to continue.
        max_length: Upper bound on the total sequence length passed to
            ``model.generate`` (prompt tokens included). Defaults to 50, the
            value this function previously hard-coded.

    Returns:
        The decoded first generated sequence with special tokens removed, or
        an empty string when ``prompt`` is empty (there is nothing to
        condition the model on).
    """
    if not prompt:
        return ""

    # Calling the tokenizer (rather than ``encode``) also yields the attention
    # mask, which ``generate`` needs to tell real tokens from padding.
    encoded = tokenizer(prompt, return_tensors='tf')
    output = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        # Pass the padding id explicitly so generation does not have to guess it.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Streamlit user interface
st.title("GPT-2 Model Text Generation")
st.write("Enter a prompt to generate text:")

# Show the hint as a placeholder instead of as the field's default value.
# As a default value the hint text itself was submitted as the prompt whenever
# the user pressed "Generate" without typing, and the empty-prompt warning
# below only fired if the user manually cleared the box.
user_input = st.text_area("Prompt:", placeholder="Type your prompt here...")

if st.button("Generate"):
    # Treat whitespace-only input the same as no input.
    if user_input and user_input.strip():
        generated_text = generate_text(user_input)
        st.write("Generated Text:")
        st.write(generated_text)
    else:
        st.warning("Please enter a prompt.")

In [35]:
# Persist the model and then the tokenizer into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./my_text_bot_model")
st.success("Model and tokenizer saved!")

DeltaGenerator()