In [1]:
import numpy as np
import pandas as pd

In [14]:
# Load the Trump sequences CSV. The path is relative, so it resolves against
# the notebook's working directory.
df = pd.read_csv("../data/style/trump/df_trump.csv")
# Bare expression: renders the frame as the cell output.
df

Unnamed: 0,sequences
0,"Paul Gosar Paul Where's Paul? Paul, stand u..."
1,"Thank you, Paul Appreciate Great job you're..."
2,I knew that she was going She never stops S...
3,"Governor, thank you Martha, go out and win ..."
4,We love you We love you We love you We lov...
...,...
181,"They are fantastic people Thank you Look, t..."
182,"So thank you very much Thank you, darling R..."
183,"""5 7 billion, sir "" I said, ""That's a lot of ..."
184,""" You know what? I was right We won it We wo..."


In [6]:
# Peek at one randomly chosen raw sequence. The loaded frame is bound to `df`;
# no `df_trump` is defined earlier in this notebook, so the old name raised
# NameError on a fresh kernel.
df.sample()["sequences"].iloc[0]

' Think of that  That crazy Nancy  She is crazy  And shifty Schiff'

# Clean Text

In [43]:
import re
import nltk
import string
from nltk import word_tokenize

from tqdm.notebook import tqdm
# Registers `progress_apply` on pandas objects; it is used later in this
# notebook in place of plain `apply` to get a progress bar.
tqdm.pandas()

In [71]:
def clean_text(text):
    """Lower-case and tokenize one raw sequence.

    Only double-quote characters are removed. All other punctuation is kept
    and comes out of ``word_tokenize`` as separate tokens (for example ``","``
    and ``"?"``).

    Args:
        text: Raw sequence string. Non-string values (such as the float NaN
            pandas uses for an empty CSV cell) are treated as having no text.

    Returns:
        list[str]: Lower-cased tokens with no empty entries; ``[]`` when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing cells are not strings and have no .translate()/.lower();
        # report "no tokens" instead of crashing the whole apply().
        return []

    # Strip double quotes only -- every other punctuation mark is preserved.
    text = text.replace('"', "")
    # Collapse every run of whitespace into a single space.
    text = re.sub(r'\s+', ' ', text)
    tokens = word_tokenize(text.lower().strip())

    return [token.strip() for token in tokens if token.strip()]

In [72]:
# Tokenize every raw sequence into a new column; progress_apply shows a
# progress bar while applying clean_text row by row.
df["Clean_Text"] = df["sequences"].progress_apply(clean_text)

  0%|          | 0/186 [00:00<?, ?it/s]

In [73]:
# Inspect the token lists produced by clean_text.
df["Clean_Text"]

0      [paul, gosar, paul, where, 's, paul, ?, paul, ...
1      [thank, you, ,, paul, appreciate, great, job, ...
2      [i, knew, that, she, was, going, she, never, s...
3      [governor, ,, thank, you, martha, ,, go, out, ...
4      [we, love, you, we, love, you, we, love, you, ...
                             ...                        
181    [they, are, fantastic, people, thank, you, loo...
182    [so, thank, you, very, much, thank, you, ,, da...
183    [5, 7, billion, ,, sir, i, said, ,, that, 's, ...
184    [you, know, what, ?, i, was, right, we, won, i...
185    [i, mean, ,, that, was, amazing, no, ,, no, ,,...
Name: Clean_Text, Length: 186, dtype: object

In [77]:
# Show the raw text of row 184 with a single label-based (row, column) lookup
# instead of chaining [] onto the row returned by .loc.
df.loc[184, "sequences"]

'" You know what? I was right  We won it  We won it easily  We won it easily'

In [96]:
from collections import Counter


# Corpus-wide token frequencies, accumulated over every tokenized row.
counts = Counter(token for tokens in df["Clean_Text"] for token in tokens)

In [97]:
# Minimum corpus frequency a token needs to stay in the vocabulary.
MIN_COUNT = 1

print("num_words before:", len(counts))
# Collect the rare tokens first so the Counter is not mutated while iterated.
rare_words = [word for word, freq in counts.items() if freq < MIN_COUNT]
for word in rare_words:
    del counts[word]
print("num_words after:", len(counts))

num_words before: 561
num_words after: 561


In [98]:
# Index 0 is the empty-string entry (0 is also the fill value encode_sentence
# pads with) and index 1 is the "UNK" fallback; corpus tokens follow in the
# order they appear in `counts`.
words = ["", "UNK"]
words.extend(counts)
vocab2index = {word: index for index, word in enumerate(words)}

In [99]:
# Vocabulary size, including the two reserved entries ("" and "UNK").
len(vocab2index)

563

In [107]:
import json
# Persist the token -> index mapping as indented JSON (path is relative to the
# notebook's working directory).
with open('../vocab2index.json', 'w') as f:
    json.dump(vocab2index, f, indent=4)

In [100]:
def encode_sentence(text, vocab2index, max_len=50):
    """Map a token sequence to a fixed-length array of vocabulary indices.

    Tokens missing from ``vocab2index`` are encoded with the index stored
    under ``"UNK"``. Sequences longer than ``max_len`` are truncated; shorter
    ones are right-padded with zeros.

    Args:
        text: Iterable of tokens.
        vocab2index: Mapping from token to integer index; must contain "UNK".
        max_len: Length of the returned array.

    Returns:
        numpy.ndarray: Integer array of shape ``(max_len,)``.
    """
    padded = np.zeros(max_len, dtype=int)
    token_ids = [vocab2index.get(token, vocab2index["UNK"]) for token in text]
    # Keep at most max_len ids; anything beyond that is dropped.
    kept = token_ids[:max_len]
    padded[:len(kept)] = kept

    return padded

In [101]:
# Summary of tokens per row; compare the max against encode_sentence's max_len
# to see whether any sequence would be truncated.
df["Clean_Text"].apply(len).describe()

count    186.000000
mean      14.865591
std        2.803125
min        9.000000
25%       13.000000
50%       15.000000
75%       17.000000
max       25.000000
Name: Clean_Text, dtype: float64

In [102]:
# Encode every token list as a fixed-length index array (default max_len).
df["Clean_Text_Encoded"] = df["Clean_Text"].progress_apply(lambda x: encode_sentence(x, vocab2index))

  0%|          | 0/186 [00:00<?, ?it/s]

In [103]:
# Inspect the encoded index arrays.
df["Clean_Text_Encoded"]

0      [2, 3, 2, 4, 5, 2, 6, 2, 7, 8, 9, 10, 11, 6, 2...
1      [12, 11, 7, 2, 13, 14, 15, 11, 16, 17, 18, 19,...
2      [21, 22, 23, 24, 25, 26, 24, 27, 28, 24, 27, 2...
3      [32, 7, 12, 11, 33, 7, 34, 35, 36, 37, 34, 35,...
4      [40, 41, 11, 40, 41, 11, 40, 41, 11, 40, 41, 1...
                             ...                        
181    [61, 19, 554, 154, 12, 11, 105, 7, 61, 16, 555...
182    [146, 12, 11, 50, 51, 12, 11, 7, 556, 145, 140...
183    [130, 272, 131, 7, 469, 21, 188, 7, 23, 5, 46,...
184    [11, 90, 85, 6, 21, 25, 139, 40, 279, 49, 40, ...
185    [21, 541, 7, 23, 25, 561, 57, 7, 57, 7, 21, 14...
Name: Clean_Text_Encoded, Length: 186, dtype: object

In [106]:
# Write the frame (raw text, token lists, encoded arrays) without the index.
# NOTE(review): Clean_Text and Clean_Text_Encoded are object columns holding
# lists/arrays, so CSV stores them as text -- confirm that whatever reads this
# file parses them back as intended.
df.to_csv("../df_trump_encoded.csv", index=False)

# Modeling

In [81]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [84]:
# Pick the best available backend instead of hard-coding Apple's MPS, so the
# notebook also runs on CUDA or CPU-only machines. The getattr guard covers
# torch builds that do not ship the `torch.backends.mps` module.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

device

device(type='mps')

In [82]:
class StyleEmbeddingModel(nn.Module):
    """GRU sequence model conditioned on a learned per-style embedding.

    Each token embedding is concatenated with the embedding of the sequence's
    style id, the result is fed through a GRU, and every GRU output is
    projected to vocabulary-sized logits.

    Args:
        vocab_size: Number of rows in the token embedding table and width of
            the output logits.
        embed_size: Token embedding dimension.
        hidden_size: GRU hidden state dimension.
        style_size: Style embedding dimension (independent of ``embed_size``).
        pretrained_embeddings: Optional tensor copied into the token embedding
            table, which is then frozen. Must match the table's shape
            ``(vocab_size, embed_size)``.
        num_styles: Number of distinct style ids. Defaults to 2.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, style_size,
                 pretrained_embeddings=None, num_styles=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
            self.embedding.weight.requires_grad = False  # freeze pretrained vectors
        self.style_embed = nn.Embedding(num_styles, style_size)
        # The GRU consumes the token and style embeddings concatenated.
        self.rnn = nn.GRU(embed_size + style_size, hidden_size, batch_first=True)
        self.decoder = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, style):
        """Compute per-position vocabulary logits.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.
            style: Integer tensor of style ids, shape ``(batch,)``.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``.
        """
        x = self.embedding(x)
        # Repeat the style vector along the time axis only. expand_as(x) would
        # also force the feature axis to embed_size, which fails whenever
        # style_size differs from embed_size and contradicts the GRU input
        # width of embed_size + style_size.
        style_embedding = self.style_embed(style).unsqueeze(1).expand(-1, x.size(1), -1)
        x = torch.cat([x, style_embedding], dim=2)
        output, _ = self.rnn(x)
        output = self.decoder(output)
        return output

In [85]:
# Model hyperparameters passed to StyleEmbeddingModel.
vocab_size = len(vocab2index)  # includes the "" and "UNK" entries
embed_size = 256  # token embedding width
hidden_size = 512  # GRU hidden state width
style_size = 128  # style embedding width

In [108]:
# No pretrained vectors: the token embedding table keeps its default
# initialization and stays trainable.
pretrained_embeddings = None

In [109]:
# Build the model from the hyperparameters defined above.
# NOTE(review): the model is not moved to `device` in this cell -- confirm that
# a later cell does so before training.
model = StyleEmbeddingModel(vocab_size, embed_size, hidden_size, style_size, pretrained_embeddings)