# News Headline Generation

## Part 1: Data Preparation

In [70]:
import numpy as np
import matplotlib.pylab as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM, Embedding
from keras.layers import Conv2D, MaxPooling2D, Bidirectional
from keras.optimizers import RMSprop
from keras.applications.densenet import preprocess_input,decode_predictions
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import keras.utils as ku
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
import random
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
import itertools


In [2]:
# Load the pre-filtered headline dataset and keep only the two columns used
# by the rest of the notebook: the headline text and its publication name.
df2 = pd.read_csv("/kaggle/input/selected-sources/selected_sources.csv")
sources = df2[['title', 'publication']]

In [None]:
def removePunc(str):
    """Return ``str`` with every character in ``string.punctuation`` removed."""
    # NOTE: the parameter name shadows the builtin ``str``; it is kept so that
    # existing callers (including keyword callers) continue to work.
    kept_chars = [ch for ch in str if ch not in string.punctuation]
    return "".join(kept_chars)

# Keep only rows whose title is a string, then drop headlines longer than
# 30 whitespace-separated words.
sources = sources[sources['title'].apply(lambda x: isinstance(x, str))]
sources = sources[sources['title'].apply(lambda x: len(x.split()) <= 30)]
# Take an independent copy before assigning a column: the frame above was
# produced by filtering another frame, and writing to it can raise pandas'
# SettingWithCopyWarning.
sources = sources.copy()
# Normalise titles: lower-case, trim surrounding whitespace, strip punctuation.
sources['title'] = sources['title'].str.lower().str.strip().apply(removePunc)
# ``head`` is a method; it has to be called to show the first rows.
print(sources.head())
print(sources.shape)

### Tokenization & Flattening

In [577]:
# Vocabulary cap handed to the tokenizer (``num_words``); the models below also
# use it as the embedding input size and the softmax width.
vocabSize = 10000


def _newTokenizer():
    """Build an unfitted tokenizer with the notebook's vocabulary settings."""
    return Tokenizer(num_words=vocabSize, oov_token="<OOV>")


tokenizer = _newTokenizer()


def textToToken(df):
    """Fit the shared tokenizer on ``df["title"]`` and build n-gram prefixes.

    Args:
        df: DataFrame with a ``"title"`` column of headline strings.

    Returns:
        A list of token-id lists. For a headline tokenized as
        ``[t0, t1, ..., tn]`` it contains ``[t0, t1]``, ``[t0, t1, t2]``, ...
        up to the full sequence, so the last id of each entry can serve as
        the label for the ids before it.

    Side effects:
        Rebinds the module-level ``tokenizer`` to a freshly fitted instance.
        ``fit_on_texts`` accumulates word counts across calls, so reusing one
        instance would mix the vocabularies of every source processed so far;
        starting from a new tokenizer keeps the vocabulary specific to ``df``.
    """
    global tokenizer
    tokenizer = _newTokenizer()
    tokenizer.fit_on_texts(df["title"])

    inputs = []
    # One batched call converts every headline to ids, e.g. [8, 9, 2, 10, 11, 3, 1].
    for tokens in tokenizer.texts_to_sequences(df["title"]):
        # Every prefix of length >= 2 becomes one training sequence.
        inputs.extend(tokens[:end] for end in range(2, len(tokens) + 1))
    return inputs

### Padding

In [578]:
def generate_oov_padded(input_sequences, total_words, oov_token_index=1):
    """Pad n-gram sequences and split them into predictors and one-hot labels.

    Args:
        input_sequences: list of token-id lists (each of length >= 2).
        total_words: number of classes for the one-hot label encoding.
        oov_token_index: token id of the out-of-vocabulary marker.

    Returns:
        ``(predictors, labels, max_sequence_length)`` where ``predictors`` is
        a 2-D array holding all but the last id of every kept sequence,
        ``labels`` is the one-hot encoding of the last ids, and
        ``max_sequence_length`` is the length every sequence was padded to.

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; there is nothing to pad")

    # Longest sequence sets the common length; shorter ones get leading zeros.
    max_sequence_length = max(len(seq) for seq in input_sequences)
    padded = np.asarray(
        pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
    )

    # Drop sequences whose last input token is OOV, i.e. cases where the OOV
    # marker would be used to predict the next word. A boolean mask does this
    # in one pass instead of a Python-level loop over every row.
    keep = padded[:, -2] != oov_token_index
    filtered_predictors = padded[keep, :-1]  # all ids except the last
    filtered_labels = padded[keep, -1]       # only the last id

    # One row per label: all zeros except a one at the label's index.
    # NOTE(review): this dense matrix has len(labels) * total_words entries and
    # can be very large; sparse labels would avoid it but change the interface.
    filtered_labels = ku.to_categorical(filtered_labels, num_classes=total_words)

    return filtered_predictors, filtered_labels, max_sequence_length


### Text Generation Code

In [579]:
def temperatureSampling(preds, temperature=1.0, oov_token="<OOV>"):
    """Sample one token index from ``preds`` after temperature scaling.

    Args:
        preds: 1-D array-like of per-token probabilities.
        temperature: positive scaling factor; higher values flatten the
            distribution (more random), lower values sharpen it.
        oov_token: vocabulary entry that must never be sampled.

    Returns:
        The sampled index into ``preds``.

    Raises:
        ValueError: if ``temperature`` is not positive.
        KeyError: if ``oov_token`` is not in ``tokenizer.word_index``.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    oov_index = tokenizer.word_index[oov_token]
    # astype() copies, so the caller's array is never modified.
    probabilities = np.asarray(preds).astype('float64')
    # The small constant avoids log(0).
    scaled = np.log(probabilities + 1e-8) / temperature
    # Mask the OOV token *after* scaling. Zeroing its probability before the
    # log leaves it at log(1e-8), which a high temperature turns back into a
    # non-negligible probability.
    scaled[oov_index] = -np.inf
    # Subtracting the maximum keeps exp() well-conditioned and leaves the
    # normalised result unchanged.
    scaled -= np.max(scaled)
    exp_preds = np.exp(scaled)
    normalized = exp_preds / np.sum(exp_preds)
    # One multinomial draw yields a one-hot row; argmax recovers its index.
    probas = np.random.multinomial(1, normalized, 1)
    return np.argmax(probas)

def top_k_sampling(predictions, k=10, oov_token="<OOV>"):
    """Sample one token index from the ``k`` most probable entries.

    Args:
        predictions: 1-D array-like of per-token probabilities.
        k: number of highest-probability candidates to sample from.
        oov_token: vocabulary entry that must never be sampled.

    Returns:
        The sampled index into ``predictions``.

    Raises:
        KeyError: if ``oov_token`` is not in ``tokenizer.word_index``.
    """
    oov_index = tokenizer.word_index[oov_token]
    # Work on a float64 copy so the caller's array is left untouched.
    probabilities = np.array(predictions, dtype='float64')
    probabilities[oov_index] = 0  # the OOV token can never be chosen

    top_k_indices = np.argsort(probabilities)[-k:]  # indices of the k largest
    top_k_values = probabilities[top_k_indices]
    top_k_values = top_k_values / np.sum(top_k_values)  # renormalise to sum to 1
    return np.random.choice(top_k_indices, p=top_k_values)

def generate_text(starter, num_words, model, max_sequence_len, temp=True, set_temp=1.0, k=10):
    """Extend ``starter`` by ``num_words`` words sampled from ``model``.

    Args:
        starter: seed text; it is tokenized with the module-level tokenizer.
        num_words: number of words to append.
        model: object whose ``predict`` returns per-token probabilities.
        max_sequence_len: padded sequence length used in training; the model
            input is padded to ``max_sequence_len - 1``.
        temp: if true use temperature sampling, otherwise top-k sampling.
            Defaults to temperature sampling so that callers passing only the
            first four arguments keep working.
        set_temp: temperature used when ``temp`` is true.
        k: number of candidates used when ``temp`` is false.

    Returns:
        The seed plus the generated words, title-cased.
    """
    for _ in range(num_words):
        # Re-tokenize the growing text and left-pad it to the model input length.
        token_list = tokenizer.texts_to_sequences([starter])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # verbose=0 suppresses the progress bar printed for every single word.
        predicted = model.predict(token_list, verbose=0)[0]  # array of probabilities
        if temp:
            next_index = temperatureSampling(predicted, temperature=set_temp)
        else:
            next_index = top_k_sampling(predicted, k=k)
        next_word = tokenizer.sequences_to_texts([[next_index]])[0]  # id -> word
        starter += " " + next_word
    return starter.title()

## Text Similarity Metrics

In [580]:
def calculate_bleu_score(references, generated):
    """Return the smoothed corpus-level BLEU score of ``generated``.

    Each generated sentence is scored against exactly one reference: the
    item at the same position in ``references``. Both inputs are iterables
    of strings and are split on whitespace.
    """
    # NOTE(review): the pairing is purely positional, so the two inputs must
    # have the same length -- confirm this one-to-one pairing is intended.
    hypothesis_tokens = []
    for sentence in generated:
        hypothesis_tokens.append(sentence.split())

    # corpus_bleu expects a list of reference *lists*, one per hypothesis.
    reference_tokens = []
    for sentence in references:
        reference_tokens.append([sentence.split()])

    return corpus_bleu(
        reference_tokens,
        hypothesis_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

## Part 2: Source Specific Models

### Fox

In [12]:
# All Fox News headlines, renumbered from 0 (the old index is kept as a column).
foxSources = sources.loc[sources['publication'] == "Fox News"].reset_index()

In [None]:
# Tokenize the Fox headlines into n-gram prefixes, pad them, and hold out 20%
# for validation. The split is named with the fox_ prefix because the training
# cell reads fox_train_pred / fox_train_label / fox_val_pred / fox_val_label.
# max_sequence_length keeps its unprefixed name: the Fox model and generation
# cells read it under that name.
fox_inputs = textToToken(foxSources)
fox_predictors, fox_label, max_sequence_length = generate_oov_padded(fox_inputs, vocabSize)
print(max_sequence_length)
fox_train_pred, fox_val_pred, fox_train_label, fox_val_label = train_test_split(
    fox_predictors, fox_label, test_size=0.2, random_state=30
)

In [None]:
# Next-word model for Fox News headlines: embedding -> two stacked LSTMs ->
# softmax over the vocabulary, with dropout before and after the LSTMs.
input_len = max_sequence_length - 1  # predictors exclude the final (label) token
foxModel = Sequential([
    Embedding(vocabSize, 100, input_length=input_len),
    Dropout(0.4),
    LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001)),
    LSTM(100, kernel_regularizer=l2(0.001)),
    Dropout(0.4),
    Dense(vocabSize, activation='softmax'),
])
# Stop once validation loss has not improved for 5 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# Source-specific checkpoint file, so training another source's model cannot
# overwrite this model's best weights.
mc = ModelCheckpoint('fox_best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
optimizer = optimizers.Adam(learning_rate=0.001)
foxModel.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
# Print the layer-by-layer summary of the Fox model.
foxModel.summary()

In [None]:
# Train the Fox model for up to 30 epochs, validating on the held-out split;
# the es / mc callbacks handle early stopping and checkpointing.
foxHistory = foxModel.fit(fox_train_pred, fox_train_label, validation_data=(fox_val_pred, fox_val_label), epochs=30, callbacks=[es, mc])

In [None]:
# Persist the trained Fox model to the working directory.
foxModel.save('foxModel1.keras')

In [None]:
# Reload a previously saved Fox model from the Kaggle input dataset.
curr = tf.keras.models.load_model('/kaggle/input/currentmodel/foxModel1.keras')

In [None]:
# Pass the sampling mode (temp) and temperature (set_temp) explicitly:
# temp=True selects temperature sampling, and 1.0 leaves the distribution unscaled.
# print(generate_text("united states", 10, foxModel, max_sequence_length, temp=True, set_temp=1.0))
# print(generate_text("donald trump", 10, foxModel, max_sequence_length, temp=True, set_temp=1.0))
print(generate_text("donald", 5, curr, max_sequence_length, temp=True, set_temp=1.0))

### CNN

In [None]:
# All CNN headlines, renumbered from 0 (the old index is kept as a column).
cnnSources = sources.loc[sources['publication'] == "CNN"].reset_index()
# Reproducible 30,000-headline subset used for training.
cnnSources_random = cnnSources.sample(random_state=30, n=30000)
# Shape of the full (unsampled) CNN frame.
print(cnnSources.shape)


In [None]:
# Tokenize the sampled CNN headlines into n-gram prefixes, pad them, and hold
# out 20% for validation.
cnn_inputs = textToToken(cnnSources_random)
(cnn_predictors,
 cnn_label,
 cnn_max_sequence_length) = generate_oov_padded(cnn_inputs, vocabSize)
(cnn_train_pred,
 cnn_val_pred,
 cnn_train_label,
 cnn_val_label) = train_test_split(
    cnn_predictors, cnn_label, test_size=0.2, random_state=30
)

In [8]:
# Next-word model for CNN headlines: embedding -> two stacked LSTMs ->
# softmax over the vocabulary, with dropout before and after the LSTMs.
input_len = cnn_max_sequence_length - 1  # predictors exclude the final (label) token
cnnModel = Sequential([
    Embedding(vocabSize, 100, input_length=input_len),
    Dropout(0.4),
    LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001)),
    LSTM(100, kernel_regularizer=l2(0.001)),
    Dropout(0.4),
    Dense(vocabSize, activation='softmax'),
])
# Stop once validation loss has not improved for 5 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# Source-specific checkpoint file, so training another source's model cannot
# overwrite this model's best weights.
mc = ModelCheckpoint('cnn_best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
optimizer = optimizers.Adam(learning_rate=0.001)
cnnModel.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
# Train the CNN model for up to 30 epochs, validating on the held-out split;
# the es / mc callbacks handle early stopping and checkpointing.
cnnHistory = cnnModel.fit(cnn_train_pred, cnn_train_label, validation_data=(cnn_val_pred, cnn_val_label), epochs=30, callbacks=[es, mc])

In [None]:
# Persist the trained CNN model to the working directory.
cnnModel.save('cnnModel1.keras')

In [None]:
# Pass the sampling mode (temp) and temperature (set_temp) explicitly:
# temp=True selects temperature sampling, and 1.0 leaves the distribution unscaled.
print(generate_text("vaccines are", 5, cnnModel, cnn_max_sequence_length, temp=True, set_temp=1.0))

### TMZ

In [None]:
# TMZ headlines, renumbered from 0 (old index kept as a column), then reduced
# to a reproducible 30,000-headline sample.
tmzSources = (
    sources.loc[sources['publication'] == "TMZ"]
    .reset_index()
    .sample(n=30000, random_state=30)
)
print(tmzSources.shape)

In [None]:
# Tokenize the sampled TMZ headlines into n-gram prefixes, pad them, and hold
# out 20% for validation.
tmz_inputs = textToToken(tmzSources)
(tmz_predictors,
 tmz_label,
 tmz_max_sequence_length) = generate_oov_padded(tmz_inputs, vocabSize)
print(tmz_max_sequence_length)
(tmz_train_pred,
 tmz_val_pred,
 tmz_train_label,
 tmz_val_label) = train_test_split(
    tmz_predictors, tmz_label, test_size=0.2, random_state=30
)

In [None]:
# Next-word model for TMZ headlines: embedding -> two stacked LSTMs ->
# softmax over the vocabulary, with dropout before and after the LSTMs.
input_len = tmz_max_sequence_length - 1  # predictors exclude the final (label) token
tmzModel = Sequential([
    Embedding(vocabSize, 100, input_length=input_len),
    Dropout(0.4),
    LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001)),
    LSTM(100, kernel_regularizer=l2(0.001)),
    Dropout(0.4),
    Dense(vocabSize, activation='softmax'),
])
# Stop once validation loss has not improved for 5 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# Source-specific checkpoint file, so training another source's model cannot
# overwrite this model's best weights.
mc = ModelCheckpoint('tmz_best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
optimizer = optimizers.Adam(learning_rate=0.001)
tmzModel.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
# Train the TMZ model for up to 30 epochs, validating on the held-out split;
# the es / mc callbacks handle early stopping and checkpointing.
tmzHistory = tmzModel.fit(tmz_train_pred, tmz_train_label, validation_data=(tmz_val_pred, tmz_val_label), epochs=30, callbacks=[es, mc])

In [12]:
# Persist the trained TMZ model to the working directory.
tmzModel.save('tmzModel1.keras')

In [None]:
# Pass the sampling mode (temp) and temperature (set_temp) explicitly:
# temp=True selects temperature sampling, and 1.0 leaves the distribution unscaled.
print(generate_text("donald", 5, tmzModel, tmz_max_sequence_length, temp=True, set_temp=1.0))

### Refinery 29

In [None]:
# Refinery 29 headlines, renumbered from 0 (old index kept as a column), then
# reduced to a reproducible 30,000-headline sample.
refinerySources = (
    sources.loc[sources['publication'] == "Refinery 29"]
    .reset_index()
    .sample(n=30000, random_state=40)
)
print(refinerySources.shape)

In [None]:
# ``head`` is a method; it has to be called to show the first rows.
print(refinerySources.head())

In [36]:
# Tokenize the sampled Refinery 29 headlines into n-gram prefixes, pad them,
# and hold out 20% for validation.
rf_inputs = textToToken(refinerySources)
(rf_predictors,
 rf_label,
 rf_max_sequence_length) = generate_oov_padded(rf_inputs, vocabSize)
(rf_train_pred,
 rf_val_pred,
 rf_train_label,
 rf_val_label) = train_test_split(
    rf_predictors, rf_label, test_size=0.2, random_state=30
)

21


In [46]:
# Next-word model for Refinery 29 headlines: embedding -> two stacked LSTMs ->
# softmax over the vocabulary, with dropout before and after the LSTMs.
input_len = rf_max_sequence_length - 1  # predictors exclude the final (label) token
rfModel = Sequential([
    Embedding(vocabSize, 100, input_length=input_len),
    Dropout(0.4),
    LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001)),
    LSTM(100, kernel_regularizer=l2(0.001)),
    Dropout(0.4),
    Dense(vocabSize, activation='softmax'),
])
# Stop once validation loss has not improved for 5 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# Source-specific checkpoint file, so training another source's model cannot
# overwrite this model's best weights.
mc = ModelCheckpoint('rf_best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
optimizer = optimizers.Adam(learning_rate=0.001)
rfModel.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
# Train the Refinery 29 model for up to 30 epochs, validating on the held-out
# split; the es / mc callbacks handle early stopping and checkpointing.
rfHistory = rfModel.fit(rf_train_pred, rf_train_label, validation_data=(rf_val_pred, rf_val_label), epochs=30, callbacks=[es, mc])

In [48]:
# Persist the trained Refinery 29 model to the working directory.
rfModel.save('rfModel2.keras')

In [65]:
# Load a saved Refinery 29 model for a quick generation test.
# NOTE(review): this loads 'rfModel1.keras', while the save cell in this
# section writes 'rfModel2.keras' -- confirm which file is intended.
rfTest = tf.keras.models.load_model('rfModel1.keras')

In [None]:
# Pass the sampling mode (temp) and temperature (set_temp) explicitly:
# temp=True selects temperature sampling, and 1.0 leaves the distribution unscaled.
print(generate_text("donald", 5, rfTest, rf_max_sequence_length, temp=True, set_temp=1.0))

### Vox

In [None]:
# Vox headlines, renumbered from 0 (old index kept as a column), then reduced
# to a reproducible 25,000-headline sample.
voxSources = (
    sources.loc[sources['publication'] == "Vox"]
    .reset_index()
    .sample(n=25000, random_state=30)
)
print(voxSources.shape)

In [7]:
# Tokenize the sampled Vox headlines into n-gram prefixes, pad them, and hold
# out 20% for validation.
vox_inputs = textToToken(voxSources)
vox_predictors, vox_label, vox_max_sequence_length = generate_oov_padded(vox_inputs, vocabSize)
# Report the padded length (needed again at generation time) instead of
# printing an empty line.
print(vox_max_sequence_length)
vox_train_pred, vox_val_pred, vox_train_label, vox_val_label = train_test_split(
    vox_predictors, vox_label, test_size=0.2, random_state=30
)

In [8]:
# Next-word model for Vox headlines: embedding -> two stacked LSTMs ->
# softmax over the vocabulary, with dropout before and after the LSTMs to
# limit overfitting.
input_len = vox_max_sequence_length - 1  # predictors exclude the final (label) token
voxModel = Sequential([
    Embedding(vocabSize, 100, input_length=input_len),
    Dropout(0.4),
    LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001)),
    LSTM(100, kernel_regularizer=l2(0.001)),
    Dropout(0.4),
    Dense(vocabSize, activation='softmax'),
])
# Stop once validation loss has not improved for 5 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# Source-specific checkpoint file, so training another source's model cannot
# overwrite this model's best weights.
mc = ModelCheckpoint('vox_best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
optimizer = optimizers.Adam(learning_rate=0.001)
voxModel.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
# Train the Vox model for up to 30 epochs, validating on the held-out split;
# the es / mc callbacks handle early stopping and checkpointing.
voxHistory = voxModel.fit(vox_train_pred, vox_train_label, validation_data=(vox_val_pred, vox_val_label), epochs=30, callbacks=[es, mc])

In [30]:
# Persist the model trained in this section. The evaluation cell loads
# 'voxModel1.keras', so it is the Vox model that has to be saved here.
voxModel.save('voxModel1.keras')

In [None]:
# Pass the sampling mode (temp) and temperature (set_temp) explicitly:
# temp=True selects temperature sampling, and 1.0 leaves the distribution unscaled.
print(generate_text("republicans", 10, voxModel, vox_max_sequence_length, temp=True, set_temp=1.0))

### Business Insider

In [None]:
# Business Insider headlines, renumbered from 0 (old index kept as a column),
# then reduced to a reproducible 30,000-headline sample.
bizSources = (
    sources.loc[sources['publication'] == "Business Insider"]
    .reset_index()
    .sample(n=30000, random_state=40)
)
print(bizSources.shape)

In [None]:
# Tokenize the sampled Business Insider headlines into n-gram prefixes, pad
# them, and hold out 20% for validation.
biz_inputs = textToToken(bizSources)
(biz_predictors,
 biz_label,
 biz_max_sequence_length) = generate_oov_padded(biz_inputs, vocabSize)
print(biz_max_sequence_length)
(biz_train_pred,
 biz_val_pred,
 biz_train_label,
 biz_val_label) = train_test_split(
    biz_predictors, biz_label, test_size=0.2, random_state=30
)

In [14]:
# Next-word model for Business Insider headlines: embedding -> two stacked
# LSTMs -> softmax over the vocabulary, with dropout before and after the
# LSTMs to limit overfitting.
input_len = biz_max_sequence_length - 1  # predictors exclude the final (label) token
bizModel = Sequential([
    Embedding(vocabSize, 100, input_length=input_len),
    Dropout(0.4),
    LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001)),
    LSTM(100, kernel_regularizer=l2(0.001)),
    Dropout(0.4),
    Dense(vocabSize, activation='softmax'),
])
# Stop once validation loss has not improved for 5 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# Source-specific checkpoint file, so training another source's model cannot
# overwrite this model's best weights.
mc = ModelCheckpoint('biz_best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
optimizer = optimizers.Adam(learning_rate=0.001)
bizModel.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
# Train the Business Insider model for up to 30 epochs, validating on the
# held-out split; the es / mc callbacks handle early stopping and checkpointing.
bizHistory = bizModel.fit(biz_train_pred, biz_train_label, validation_data=(biz_val_pred, biz_val_label), epochs=30, callbacks=[es, mc])
# Persist the trained model: the evaluation cell loads 'bizModel1.keras', and
# this section has no separate save cell.
bizModel.save('bizModel1.keras')

In [None]:
# Pass the sampling mode (temp) and temperature (set_temp) explicitly:
# temp=True selects temperature sampling, and 1.0 leaves the distribution unscaled.
print(generate_text("donald trump", 8, bizModel, biz_max_sequence_length, temp=True, set_temp=1.0))

## Completed Models and Evaluation

In [474]:
# Load the finished per-source models from the Kaggle input datasets.
_loadModel = tf.keras.models.load_model

# Best-checkpoint (.h5) models.
cnnModel = _loadModel("/kaggle/input/completedmodels/cnn_best_model.h5")
foxModel = _loadModel("/kaggle/input/completedmodels/fox_best_model.h5")
# Full saved (.keras) model.
rfModel = _loadModel("/kaggle/input/current-models/rfModel2.keras")
tmzModel = _loadModel("/kaggle/input/completedmodels/tmz_best_model.h5")
voxModel = _loadModel("/kaggle/input/current-models/voxModel1.keras")
# Two Business Insider variants: the saved model and its best checkpoint.
bizModel = _loadModel("/kaggle/input/bizmodels/bizModel1.keras")
bizModelExtra = _loadModel("/kaggle/input/completedmodels/biz_best_model.h5")

In [95]:
# Short political prompts used as headline starters during evaluation.
politicalWords = [
    "Donald Trump", "The Senate", "Democrats are", "Climate Change", "Healthcare Reform",
    "The White House", "Foreign Affairs", "Tax Cuts", "Immigration Laws", "Supreme Court",
    "Gun Control", "Civil Rights", "Election Results", "Trade Agreements", "Political Campaign",
    "National Security", "Public Education", "Foreign Aid", "Military Spending", "Social Justice",
    "Income Inequality", "Federal Budget", "Global Warming", "Hillary Clinton", "Economic Sanctions",
]
# Show how many prompts are defined.
print(len(politicalWords))

25


In [582]:
# Padded sequence length for each model, keyed by the model's abbreviation.
maxSeqLength = dict(
    rf=21,
    vox=27,
    biz=28,
    cnn=23,
    fox=24,
    tmz=20,
)

In [140]:
def _titlesFor(publication):
    """Return the titles of one publication as a Series indexed from 0."""
    rows = sources.loc[sources['publication'] == publication]
    return rows.reset_index().title


# Reference headline pools, one per source, used when scoring generated text.
rfSamples = _titlesFor("Refinery 29")
cnnSamples = _titlesFor("CNN")
foxSamples = _titlesFor("Fox News")
tmzSamples = _titlesFor("TMZ")
voxSamples = _titlesFor("Vox")
bizSamples = _titlesFor("Business Insider")

In [154]:
def generateStatistics(modelAbv, trainedModel, titleStarter, maxLengthList, temp, setTemp):
    """Average BLEU of one model's generated headlines against every source.

    Runs 10 rounds. In each round a 10-word continuation is generated for
    every starter in ``titleStarter`` and scored against an equally sized
    random sample of real headlines from each source.

    Args:
        modelAbv: key into ``maxLengthList`` for the model being evaluated.
        trainedModel: model passed through to ``generate_text``.
        titleStarter: sequence of seed phrases.
        maxLengthList: mapping from model abbreviation to padded sequence length.
        temp: sampling mode passed to ``generate_text``.
        setTemp: temperature passed to ``generate_text``.

    Returns:
        Mean BLEU scores in the order
        ``[biz, vox, cnn, fox, tmz, rf]``.
    """
    rounds = 10
    # Order matters: it defines the order of the returned scores.
    referencePools = [bizSamples, voxSamples, cnnSamples, foxSamples, tmzSamples, rfSamples]
    totals = [0.0] * len(referencePools)
    # calculate_bleu_score pairs references and generated sentences by
    # position, so each reference sample must have one entry per starter.
    sampleSize = len(titleStarter)

    for x in range(rounds):
        # Generate one headline per starter for the model under evaluation.
        genText = [
            generate_text(word, 10, trainedModel, maxLengthList[modelAbv], temp, setTemp).lower()
            for word in titleStarter
        ]
        for position, pool in enumerate(referencePools):
            # A per-round seed keeps each round's reference samples reproducible.
            references = pool.sample(n=sampleSize, random_state=x * 10)
            totals[position] += calculate_bleu_score(references, genText)

    return [total / rounds for total in totals]

In [None]:
# Refit the shared tokenizer on a Refinery 29 sample drawn with the same size
# and seed as that source's training sample.
isRefinery = sources['publication'] == "Refinery 29"
currTrainingData = sources.loc[isRefinery]
vocabList = currTrainingData.sample(random_state=40, n=30000)
modelInputs = textToToken(vocabList)  # training tokenizer
# Cell output: number of Refinery 29 headlines available.
len(currTrainingData)

In [None]:
# Generate 6 words after the seed with the Refinery 29 model, using
# temperature sampling at temperature 1.
generate_text("donald trump", 6, rfModel, maxSeqLength["rf"], True, 1)

In [None]:
# Choose the model to evaluate here; the tokenizer data and the model object
# are both derived from this abbreviation so they cannot drift apart.
modelAbv = "rf"

# (publication, sample size, random_state) used for each source's training
# sample earlier in this notebook; None means the source was not sub-sampled.
trainingSampleSpec = {
    "fox": ("Fox News", None, None),
    "cnn": ("CNN", 30000, 30),
    "tmz": ("TMZ", 30000, 30),
    "rf": ("Refinery 29", 30000, 40),
    "vox": ("Vox", 25000, 30),
    "biz": ("Business Insider", 30000, 40),
}
evaluationModels = {
    "fox": foxModel,
    "cnn": cnnModel,
    "tmz": tmzModel,
    "rf": rfModel,
    "vox": voxModel,
    "biz": bizModel,
}

# Fit the tokenizer on the same headlines the chosen model was trained on, so
# token ids map to the words that model expects.
publicationName, sampleCount, sampleSeed = trainingSampleSpec[modelAbv]
currTrainingData = sources.loc[sources['publication'] == publicationName]
if sampleCount is None:
    vocabList = currTrainingData
else:
    vocabList = currTrainingData.sample(n=sampleCount, random_state=sampleSeed)
modelInputs = textToToken(vocabList) # training tokenizer

trainedModel = evaluationModels[modelAbv]
# Top-k sampling, then temperature sampling at temperatures 1 and 2.
outputListK = generateStatistics(modelAbv, trainedModel, politicalWords, maxSeqLength, False, 0)
outputListTemp1 = generateStatistics(modelAbv, trainedModel, politicalWords, maxSeqLength, True, 1)
outputListTemp2 = generateStatistics(modelAbv, trainedModel, politicalWords, maxSeqLength, True, 2)

In [None]:
# Show the averaged BLEU scores: top-k first, then temperature 1 and 2.
for scoreList in (outputListK, outputListTemp1, outputListTemp2):
    print(scoreList)

### selected news sources
Every selected source has at least 20,000 articles in the dataset (counts shown in parentheses).

<b>Everything</b>
- Fox News (right - 20,144)
- Vox (left - )
- CNN (left center - 127,602)

<b>Entertainment News</b>
- TMZ (49,595)
- Refinery29 (111,433)

<b>Business</b>
- Business Insider (57,953)

### Initial setup only: these cells do not need to be run again

In [None]:
# Load the full news dataset and keep only the headline and publication columns.
df2 = pd.read_csv("all-the-news-2-1.csv")
# Alternative column selection that also keeps the year and article text:
#sources_w_art = df2[['year', 'title', 'article', 'publication']]
sources = df2[['title', 'publication']]

In [None]:
# Publications kept for this project.
selected = [
    'Fox News',
    'Vox',
    'CNN',
    'TMZ',
    'Refinery 29',
    'Business Insider',
]
# Keep only rows from the selected publications, with the two needed columns.
sources = sources.loc[sources['publication'].isin(selected), ['title', 'publication']]

In [None]:
# Write the filtered rows to CSV without the index column.
sources.to_csv('selected_sources.csv', index=False)

In [None]:
# Cell output: display the filtered frame.
sources