In [None]:
import pandas as pd
import os
# List every file that is available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

In [None]:
# Transcripts to load: both presidential debates plus each candidate's town hall.
input_files = ['/kaggle/input/us-election-2020-presidential-debates/us_election_2020_1st_presidential_debate.csv',
               '/kaggle/input/us-election-2020-presidential-debates/us_election_2020_2nd_presidential_debate.csv',
               '/kaggle/input/us-election-2020-presidential-debates/us_election_2020_biden_town_hall.csv',
               '/kaggle/input/us-election-2020-presidential-debates/us_election_2020_trump_town_hall.csv']
# Read each transcript into its own frame, then stack them into one table.
dfs = [pd.read_csv(file) for file in input_files]
# ignore_index=True renumbers the rows 0..N-1. Without it every CSV keeps its
# own 0-based index, so the same row label would appear once per transcript.
data_frame = pd.concat(dfs, ignore_index=True)
print(data_frame.shape)
data_frame.head()

Get text from the two speakers

In [None]:
from itertools import chain

# Show which speaker labels occur, then select the rows of each candidate.
print(data_frame.speaker.unique())
is_trump = data_frame.speaker.str.contains('Trump')
is_biden = data_frame.speaker.str.contains('Biden')
trump_df = data_frame[is_trump]
biden_df = data_frame[is_biden]

# Discard the speaker and minute columns, keeping whatever columns remain.
trump_text_df = trump_df.drop(columns=['speaker', 'minute'])
biden_text_df = biden_df.drop(columns=['speaker', 'minute'])

# Cast every remaining cell to str and flatten the rows into one list per candidate.
trump_text = list(chain.from_iterable(trump_text_df.astype(str).values.tolist()))
biden_text = list(chain.from_iterable(biden_text_df.astype(str).values.tolist()))

# Print a sample: every 100th Trump entry and every 50th Biden entry.
print('DONALD TRUMP')
print(trump_text[::100])
print('-----------------------------------------------------------------------------')
print('JOE BIDEN')
print(biden_text[::50])

In [None]:
from keras.preprocessing.text import Tokenizer

# Fit one shared word index on the text of both candidates.
corpus = trump_text + biden_text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# Vocabulary size is the number of indexed words plus one.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary size : ', vocab_size)

Transform text to sequences

In [None]:
from nltk import sent_tokenize
import numpy as np
from keras.utils import to_categorical
# Number of context words per training example: get_sequences cuts windows of
# this many words plus one target word, and get_model passes it to the
# Embedding layer as input_length.
max_sequence_length = 4

def get_sequences(text):
    """Turn raw text entries into (context, next-word) training examples.

    Each entry is lower-cased and split into sentences with ``sent_tokenize``;
    every sentence is converted to word ids by the module-level ``tokenizer``.
    A window of ``max_sequence_length + 1`` ids is then slid over each sentence
    one position at a time: all ids but the last form the input, the last id
    is the target. Sentences shorter than the window produce no examples.

    Args:
        text: Iterable of strings.

    Returns:
        A tuple ``(inputs, targets)`` where ``inputs`` is ``np.array`` of the
        context windows and ``targets`` is the result of ``to_categorical``
        on the target ids with ``num_classes=vocab_size``.
    """
    window = max_sequence_length + 1
    contexts, targets = [], []
    for passage in text:
        encoded = tokenizer.texts_to_sequences(sent_tokenize(passage.lower()))
        for ids in encoded:
            for start in range(len(ids) - window + 1):
                chunk = ids[start:start + window]
                if not chunk:
                    continue
                contexts.append(chunk[:-1])
                targets.append(chunk[-1])
    return np.array(contexts), to_categorical(targets, num_classes=vocab_size)
            
        
            

In [None]:
# Build the (context windows, one-hot next word) arrays for each candidate.
trump_input, trump_output = get_sequences(text=trump_text)
biden_input, biden_output = get_sequences(text=biden_text)

In [None]:
# Number of training examples extracted for (Trump, Biden).
tuple(map(len, (trump_input, biden_input)))

Model: a next-word predictor (word embedding, two stacked LSTM layers, softmax over the vocabulary)

In [None]:
from keras import Sequential
from keras.layers import LSTM, Dense, Embedding

def get_model():
    """Build the (uncompiled) next-word prediction network.

    Layers, in order: a 50-dimensional embedding over ``vocab_size`` word ids
    with ``input_length=max_sequence_length``, two stacked 100-unit LSTM
    layers, a 100-unit ReLU dense layer, and a softmax over ``vocab_size``.

    NOTE: the following notebook cell defines ``get_model`` again; when cells
    run top to bottom that later definition is the one in effect.

    Returns:
        The ``Sequential`` model. Its summary is printed as a side effect.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length=max_sequence_length))
    # return_sequences=True so the second LSTM receives the full sequence.
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    # summary() prints the table itself and returns None; wrapping it in
    # print() would add a stray "None" line after the table.
    model.summary()
    return model

In [None]:
from keras import Sequential
from keras.layers import LSTM, Dense, Embedding

def get_model():
    """Build the (uncompiled) next-word prediction network.

    Layers, in order: a 50-dimensional embedding over ``vocab_size`` word ids
    with ``input_length=max_sequence_length``, two stacked 100-unit LSTM
    layers, a 100-unit ReLU dense layer, and a softmax over ``vocab_size``.

    NOTE: this repeats the ``get_model`` definition from the previous notebook
    cell; being the later one, it is the definition in effect when cells run
    top to bottom. Keep a single definition when tidying the notebook.

    Returns:
        The ``Sequential`` model. Its summary is printed as a side effect.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length=max_sequence_length))
    # return_sequences=True so the second LSTM receives the full sequence.
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    # summary() prints the table itself and returns None; wrapping it in
    # print() would add a stray "None" line after the table.
    model.summary()
    return model

In [None]:
def train_model(text_input, text_output, epochs):
    """Create a fresh network with ``get_model``, compile it, and fit it.

    Args:
        text_input: Training inputs passed to ``fit``.
        text_output: Training targets passed to ``fit``.
        epochs: Number of epochs to train for.

    Returns:
        The fitted model.
    """
    network = get_model()
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    network.fit(text_input, text_output, epochs=epochs, batch_size=128)
    return network

In [None]:
def predict(model, seed):
    """Print up to 10 words that the model predicts to follow ``seed``.

    The seed is split on spaces and used as a sliding context: after each
    prediction the oldest word is dropped and the predicted word appended.
    Words are printed on one line, separated by spaces.

    Args:
        model: Trained model whose ``predict`` returns one score per word id.
        seed: Space-separated starting words.

    Returns:
        None. Generation stops early if the model predicts an id that has no
        word in the tokenizer's index.
    """
    # Build the id -> word lookup once instead of scanning word_index per step.
    id_to_word = {index: token for token, index in tokenizer.word_index.items()}
    words = seed.split(' ')
    for _ in range(10):
        token_ids = tokenizer.texts_to_sequences([' '.join(words)])[0]
        # Words unknown to the tokenizer are dropped by texts_to_sequences and
        # the seed may not have exactly max_sequence_length words, so fit the
        # ids to the model's input width: keep the most recent ids and
        # left-fill with 0. The model was not trained on filled input, so
        # predictions for such seeds are best-effort only.
        token_ids = token_ids[max(len(token_ids) - max_sequence_length, 0):]
        token_ids = [0] * (max_sequence_length - len(token_ids)) + token_ids
        # predict_classes was removed from Keras; take the argmax of the
        # predicted scores instead. verbose=0 keeps the output free of
        # progress bars.
        scores = model.predict(np.array([token_ids]), verbose=0)
        next_id = int(np.argmax(scores, axis=-1)[0])
        word = id_to_word.get(next_id)
        if word is None:
            # No word is mapped to this id, so there is nothing to print or
            # feed back into the context.
            break
        print(word, end=" ")
        words = words[1:]
        words.append(word)

In [None]:
# Fit the next-word model on Trump's examples only, for 200 epochs.
trump_model = train_model(text_input=trump_input, text_output=trump_output, epochs=200)
# Disabled alternative: one model trained on both candidates' examples.
# combined_model = train_model(np.concatenate([trump_input, biden_input]),
#                              np.concatenate([trump_output, biden_output]), 200)


Make a prediction

In [None]:
# Generate a continuation of a four-word seed with the Trump model.
predict(model=trump_model, seed='believe in law and')