# News Headline Generation

## Part 1: Data Preparation

In [11]:
import numpy as np
import matplotlib.pylab as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM, Embedding
from keras.layers import Conv2D, MaxPooling2D, Bidirectional
from keras.optimizers import RMSprop
from keras.applications.densenet import preprocess_input,decode_predictions
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import keras.utils as ku
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
import random
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu

In [2]:
df2 = pd.read_csv("/kaggle/input/selected-sources/selected_sources.csv")
sources = df2[['title', 'publication']]

In [3]:
def removePunc(str):
  str = "".join(i for i in str if i not in string.punctuation)
  return str

sources = sources[sources['title'].apply(lambda x: isinstance(x, str))]
sources = sources[sources['title'].apply(lambda x: len(x.split()) <= 30)]
sources['title'] = sources['title'].apply(lambda x: x.lower())
sources['title'] = sources['title'].apply(lambda x: x.strip())
sources["title"] = sources['title'].apply(lambda x: removePunc(x))
print(sources.head)
print(sources.shape)

<bound method NDFrame.head of                                                     title       publication
0       we should take concerns about the health of li...               Vox
1       colts gm ryan grigson says andrew lucks contra...  Business Insider
2       paris hilton woman in black for uncle montys f...               TMZ
3                 how to watch the google io keynote live               Vox
4       “elizabeth warren called me” is turning into a...               Vox
...                                                   ...               ...
413994  florida ammo selling out on heels of stayathom...               TMZ
413995  disney forcing annual pass holders to continue...               TMZ
413996  nick cannon pimps out his impala with custom n...               TMZ
413997  pete buttigieg says governors showing more lea...               TMZ
413998  ruth bader ginsburg still working out with tra...               TMZ

[413989 rows x 2 columns]>
(413989, 2)


### Tokenization & Flattening

In [4]:
vocabSize = 10000
tokenizer = Tokenizer(num_words=vocabSize, oov_token = "<OOV>")

def textToToken(df):
  # Updating tokenizer vocabulary to only contains words in df
  tokenizer.fit_on_texts(df["title"])
  # Getting the size of the vocabulary
  # vocabSize = len(tokenizer.word_index) + 1
  inputs = []
  for title in df['title']:
    # Converts all text into tokens in array form like [8, 9, 2, 10, 11, 3, 1]
    tokens = tokenizer.texts_to_sequences([title])[0]
    for x in range(1, len(tokens)):
      seq = tokens[:x+1]
      inputs.append(seq)
  return inputs #, vocabSize


### Padding

In [5]:
#padding sequences
#get input from output of tokenizer

def generate_padded_sequences(input_sequences, total_words):
    max_sequence_length = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen = max_sequence_length, padding = 'pre'))

    predictors = input_sequences[:,:-1]
    label = input_sequences[:,-1]

    label = ku.to_categorical(label, num_classes = total_words)
    return predictors, label, max_sequence_length


def generate_oov_padded(input_sequences, total_words, oov_token_index=1):
    max_sequence_length = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre'))

    # Initialize lists for filtered predictors and labels
    filtered_predictors = []
    filtered_labels = []

    for seq in input_sequences:
        if oov_token_index != seq[-2]:  # Exclude sequences where OOV token predicts another word
            filtered_predictors.append(seq[:-1])  # All elements except the last
            filtered_labels.append(seq[-1])  # Only the last element

    filtered_predictors = np.array(filtered_predictors) # Convert to numpy arrays
    filtered_labels = np.array(filtered_labels)
    filtered_labels = ku.to_categorical(filtered_labels, num_classes=total_words)

    return filtered_predictors, filtered_labels, max_sequence_length


### Text Generation Code

In [16]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def top_k_sampling(predictions, k=10, oov_token="<OOV>"):
    # Extract the top-k probabilities and their indices
    oov_index = tokenizer.word_index[oov_token]
    predictions[oov_index] = 0
    top_k_indices = np.argsort(predictions)[-k:]
    top_k_values = predictions[top_k_indices]

    # Normalize the top-k probabilities
    top_k_values = top_k_values / np.sum(top_k_values)
    chosen_index = np.random.choice(top_k_indices, p=top_k_values)
    top_words = tokenizer.sequences_to_texts([[idx] for idx in top_k_indices])
    return chosen_index

def generate_text(starter, num_words, model, max_sequence_len):
    temperature = 1
    k=10
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([starter])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list)[0]        
        #next_index = sample(predicted, temperature = temperature)
        next_index = top_k_sampling(predicted, k=k)
        next_word = tokenizer.sequences_to_texts([[next_index]])[0]
        starter += " "+ next_word
    return starter.title()

## Text Similarity Metrics

In [12]:
# Calculate the BLEU score for a set of generated sentences against reference sentences.
def calculate_bleu_score(references, candidates):
    tokenized_refs = [[ref.split()] for ref in references]
    tokenized_cands = [cand.split() for cand in candidates]

    smoothing = SmoothingFunction().method4
    bleu_score = corpus_bleu(tokenized_refs, tokenized_cands, smoothing_function=smoothing)

    return bleu_score

## Part 2: Source Specific Models

### Fox

In [12]:
foxSources = sources.loc[sources['publication'] == "Fox News"]
foxSources.reset_index(inplace=True)

In [13]:
inputs = textToToken(foxSources)
predictors, label, max_sequence_length = generate_oov_padded(inputs, vocabSize)
print(max_sequence_length)
train_pred, val_pred, train_label, val_label = train_test_split(predictors, label, test_size=0.2, random_state=30)

24


In [None]:
word_counts = tokenizer.word_counts
print("Most Common Words:", sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:10])

In [None]:
unique, counts = np.unique(np.argmax(label, axis=1), return_counts=True)
label_distribution = dict(zip(unique, counts))

In [None]:
print(foxSources['title'][0])
print(foxSources['title'][1])
print(foxSources['title'][2])
print(foxSources['title'][3])

print(inputs[6][-1], "is preview")


inputs[:10]

In [None]:
print(vocabSize)
print(len(label[0]))
print("")
print(len(label))
print(len(inputs))
print("")
x = np.where(label[0] == 1)
print(x)

# this is checking that label works correctly. label is an array, 
# with n rows (n = # of inputs / len of inputs) and vocabSize columns.
# for each input, there is a 1 at the index corresponding to the ending index on
# the input (eg for input 0, there is a 1 at 9288) and a 0 at all other indices.
# the label denotes what the ending word is (bc the index is where the word is
# located)

In [None]:
foxModel = Sequential()
input_len = max_sequence_length - 1
foxModel.add(Embedding(vocabSize, 100, input_length = input_len))
foxModel.add(Dropout(0.4)) # dropout to avoid overfitting
foxModel.add(LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001)))
foxModel.add(LSTM(100, kernel_regularizer=l2(0.001)))
foxModel.add(Dropout(0.4))
foxModel.add(Dense(vocabSize, activation='softmax'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
optimizer = optimizers.Adam(learning_rate=0.001)
foxModel.compile(loss = 'categorical_crossentropy', optimizer=optimizer)  

In [None]:
foxModel.summary()

In [None]:
foxHistory = foxModel.fit(fox_train_pred, fox_train_label, validation_data=(fox_val_pred, fox_val_label), epochs=30, callbacks=[es, mc])

In [None]:
foxModel.save('foxModel1.keras')

In [None]:
curr = tf.keras.models.load_model('/kaggle/input/currentmodel/foxModel1.keras')

In [None]:
# print(generate_text("united states", 10, foxModel, max_sequence_length))
# print(generate_text("donald trump", 10, foxModel, max_sequence_length))
print(generate_text("donald", 5, curr, max_sequence_length))


### CNN

In [9]:
cnnSources = sources.loc[sources['publication'] == "CNN"]
cnnSources.reset_index(inplace=True)
cnnSources_random = cnnSources.sample(n=30000, random_state=30)
print(cnnSources.shape)


(127594, 3)


In [10]:
cnn_inputs = textToToken(cnnSources_random)
cnn_predictors, cnn_label, cnn_max_sequence_length = generate_oov_padded(cnn_inputs, vocabSize)
cnn_train_pred, cnn_val_pred, cnn_train_label, cnn_val_label = train_test_split(cnn_predictors, cnn_label, test_size=0.2, random_state=30)

30


In [8]:
cnnModel = Sequential()
input_len = cnn_max_sequence_length - 1
cnnModel.add(Embedding(vocabSize, 100, input_length = input_len))
cnnModel.add(Dropout(0.4)) # dropout to avoid overfitting
cnnModel.add(LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001)))
cnnModel.add(LSTM(100, kernel_regularizer=l2(0.001)))
cnnModel.add(Dropout(0.4))
cnnModel.add(Dense(vocabSize, activation='softmax'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
optimizer = optimizers.Adam(learning_rate=0.001)
cnnModel.compile(loss = 'categorical_crossentropy', optimizer=optimizer)

In [None]:
cnnHistory = cnnModel.fit(cnn_train_pred, cnn_train_label, validation_data=(cnn_val_pred, cnn_val_label), epochs=30, callbacks=[es, mc])

In [None]:
cnnModel.save('cnnModel1.keras')

In [56]:
print(generate_text("vaccines are", 5, cnnModel, cnn_max_sequence_length))

Vaccines Are Happen Around Baltimore County Initially


### TMZ

In [6]:
tmzSources = sources.loc[sources['publication'] == "TMZ"]
tmzSources.reset_index(inplace=True)
tmzSources = tmzSources.sample(n=30000, random_state=30)
print(tmzSources.shape)

(30000, 3)


In [7]:
tmz_inputs = textToToken(tmzSources)
tmz_predictors, tmz_label, tmz_max_sequence_length = generate_oov_padded(tmz_inputs, vocabSize)
print(tmz_max_sequence_length)
tmz_train_pred, tmz_val_pred, tmz_train_label, tmz_val_label = train_test_split(tmz_predictors, tmz_label, test_size=0.2, random_state=30)

20


In [10]:
tmzModel = Sequential()
input_len = tmz_max_sequence_length - 1
tmzModel.add(Embedding(vocabSize, 100, input_length = input_len))
tmzModel.add(Dropout(0.4)) # dropout to avoid overfitting
tmzModel.add(LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001)))
tmzModel.add(LSTM(100, kernel_regularizer=l2(0.001)))
tmzModel.add(Dropout(0.4))
tmzModel.add(Dense(vocabSize, activation='softmax'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
optimizer = optimizers.Adam(learning_rate=0.001)
tmzModel.compile(loss = 'categorical_crossentropy', optimizer=optimizer)

In [None]:
tmzHistory = tmzModel.fit(tmz_train_pred, tmz_train_label, validation_data=(tmz_val_pred, tmz_val_label), epochs=30, callbacks=[es, mc])

In [12]:
tmzModel.save('tmzModel1.keras')

In [None]:
print(generate_text("donald", 5, tmzModel, tmz_max_sequence_length))

### Refinery 29

In [20]:
refinerySources = sources.loc[sources['publication'] == "Refinery 29"]
refinerySources.reset_index(inplace=True)
refinerySources = refinerySources.sample(n=30000, random_state=40)
print(refinerySources.shape)

(30000, 3)


In [44]:
print(refinerySources.head)

<bound method NDFrame.head of          index                                              title  publication
27126   120769  chance the rapper text message drake grammys 2...  Refinery 29
105444  397438                      jenny bird  catalina earrings  Refinery 29
77657   171438    best of brooklinen bedding and sheets sale 2019  Refinery 29
74829   168610  how riverdale will explain the death of luke p...  Refinery 29
47798   141496  natalie morales angry over creepy paparazzi ph...  Refinery 29
...        ...                                                ...          ...
98529   369770                                          selected   Refinery 29
110641  413169                  urban outfitters  toile sheet set  Refinery 29
41009   134707          jon snow game of thrones theory confirmed  Refinery 29
110918  413446                              tenga  iroha zen vibe  Refinery 29
64545   158298       parents moving into childs house paying rent  Refinery 29

[30000 rows x 3 colum

In [36]:
rf_inputs = textToToken(refinerySources)
rf_predictors, rf_label, rf_max_sequence_length = generate_oov_padded(rf_inputs, vocabSize)
rf_train_pred, rf_val_pred, rf_train_label, rf_val_label = train_test_split(rf_predictors, rf_label, test_size=0.2, random_state=30)

21


In [46]:
rfModel = Sequential()
input_len = rf_max_sequence_length - 1
rfModel.add(Embedding(vocabSize, 100, input_length = input_len))
rfModel.add(Dropout(0.4)) # dropout to avoid overfitting
rfModel.add(LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001)))
rfModel.add(LSTM(100, kernel_regularizer=l2(0.001)))
rfModel.add(Dropout(0.4))
rfModel.add(Dense(vocabSize, activation='softmax'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
optimizer = optimizers.Adam(learning_rate=0.001)
rfModel.compile(loss = 'categorical_crossentropy', optimizer=optimizer)

In [47]:
rfHistory = rfModel.fit(rf_train_pred, rf_train_label, validation_data=(rf_val_pred, rf_val_label), epochs=30, callbacks=[es, mc])

Epoch 1/30
Epoch 2/30
   9/3948 [..............................] - ETA: 27s - loss: 7.8400

  saving_api.save_model(


Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [48]:
rfModel.save('rfModel2.keras')

In [65]:
rfTest = tf.keras.models.load_model('rfModel1.keras')

In [None]:
print(generate_text("donald", 5, rfTest, rf_max_sequence_length))

## VOX

In [6]:
voxSources = sources.loc[sources['publication'] == "Vox"]
voxSources.reset_index(inplace=True)
voxSources = voxSources.sample(n=25000, random_state=30)
print(voxSources.shape)

(25000, 3)


In [7]:
vox_inputs = textToToken(voxSources)
vox_predictors, vox_label, vox_max_sequence_length = generate_oov_padded(vox_inputs, vocabSize)
print()
vox_train_pred, vox_val_pred, vox_train_label, vox_val_label = train_test_split(vox_predictors, vox_label, test_size=0.2, random_state=30)

In [8]:
voxModel = Sequential()
input_len = vox_max_sequence_length - 1
voxModel.add(Embedding(vocabSize, 100, input_length = input_len))
voxModel.add(Dropout(0.4)) # dropout to avoid overfitting
voxModel.add(LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001)))
voxModel.add(LSTM(100, kernel_regularizer=l2(0.001)))
voxModel.add(Dropout(0.4))
voxModel.add(Dense(vocabSize, activation='softmax'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
optimizer = optimizers.Adam(learning_rate=0.001)
voxModel.compile(loss = 'categorical_crossentropy', optimizer=optimizer)

In [9]:
voxHistory = voxModel.fit(vox_train_pred, vox_train_label, validation_data=(vox_val_pred, vox_val_label), epochs=30, callbacks=[es, mc])

Epoch 1/30
Epoch 2/30
   9/6074 [..............................] - ETA: 42s - loss: 6.8525 

  saving_api.save_model(


Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [30]:
bizModel.save('bizModel1.keras')

In [43]:
print(generate_text("republicans", 10, voxModel, vox_max_sequence_length))

Republicans In South Recognition 4 Interesting Senators Who Couldn’T His Surprising


## Business Insider 

In [15]:
bizSources = sources.loc[sources['publication'] == "Business Insider"]
bizSources.reset_index(inplace=True)
bizSources = bizSources.sample(n=30000, random_state=40)
print(bizSources.shape)

(30000, 3)


In [None]:
biz_inputs = textToToken(bizSources)
biz_predictors, biz_label, biz_max_sequence_length = generate_oov_padded(biz_inputs, vocabSize)
print(biz_max_sequence_length)
biz_train_pred, biz_val_pred, biz_train_label, biz_val_label = train_test_split(biz_predictors, biz_label, test_size=0.2, random_state=30)

28


In [14]:
bizModel = Sequential()
input_len = biz_max_sequence_length - 1
bizModel.add(Embedding(vocabSize, 100, input_length = input_len))
bizModel.add(Dropout(0.4)) # dropout to avoid overfitting
bizModel.add(LSTM(100, return_sequences=True, kernel_regularizer=l2(0.001)))
bizModel.add(LSTM(100, kernel_regularizer=l2(0.001)))
bizModel.add(Dropout(0.4))
bizModel.add(Dense(vocabSize, activation='softmax'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
optimizer = optimizers.Adam(learning_rate=0.001)
bizModel.compile(loss = 'categorical_crossentropy', optimizer=optimizer)

In [15]:
bizHistory = bizModel.fit(biz_train_pred, biz_train_label, validation_data=(biz_val_pred, biz_val_label), epochs=30, callbacks=[es, mc])

Epoch 1/30
Epoch 2/30
  15/6414 [..............................] - ETA: 47s - loss: 7.3135

  saving_api.save_model(


Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [29]:
print(generate_text("donald trump", 8, bizModel, biz_max_sequence_length))

['who', 'wants', 'said', 'in', 'has', 'to', 'and', 'calls', 'is', 'says']
['2020', 'china', 'coronavirus', 'his', 'trumps', 'new', 'us', 'a', 'trump', 'the']
['presidential', 'hotel', 'secret', 'democratic', 'debate', 'family', 'us', 'man', 'new', 'state']
['instagram', 'new', 'video', 'family', 'jersey', 'movie', 'car', 'week', 'hampshire', 'york']
['in', 'carrier', 'hotel', 'house', 'shooting', 'man', 'city', 'debate', 'times', 'primary']
['is', 'live', 'after', 'vote', 'with', 'for', 'on', 'at', 'and', 'in']
['photo', 'us', 'twitter', 'results', 'trump', 'new', 'her', 'his', 'a', 'the']
['is', 'results', 'on', 'with', 'at', 'after', 'vote', 'and', 'was', 'in']
Donald Trump In A New Hampshire Primary On Results And


## Completed Models and Evaluation

In [8]:
cnnModel = tf.keras.models.load_model("/kaggle/input/current-models/cnnModel1.keras")
foxModel = tf.keras.models.load_model("/kaggle/input/current-models/foxModel1.keras")
rfModel = tf.keras.models.load_model("/kaggle/input/current-models/rfModel2.keras")
tmzModel = tf.keras.models.load_model("/kaggle/input/current-models/tmzModel1.keras")
voxModel =  tf.keras.models.load_model("/kaggle/input/current-models/voxModel1.keras")
bizModel = tf.keras.models.load_model("/kaggle/working/bizModel1.keras")

In [9]:
politicalWords = [
    "Donald Trump",
    "The Senate",
    "Democrats are",
    "Climate Change",
    "Healthcare Reform",
    "The White House",
    "Foreign Affairs",
    "Tax Cuts",
    "Immigration Laws",
    "Supreme Court",
    "Gun Control",
    "Civil Rights",
    "Election Results",
    "Trade Agreements",
    "Political Campaign",
    "National Security",
    "Public Education",
    "Foreign Aid",
    "Military Spending",
    "Social Justice",
    "Income Inequality",
    "Federal Budget",
    "Global Warming",
    "Hillary Clinton",
    "Economic Sanctions"
]
print(len(politicalWords))

25


In [None]:
maxSeqLength = {"rf":21,"vox":27,"biz":28,"cnn":23,"fox":24,"tmz":20}
pubList = ["rf","vox","biz","cnn","fox","tmz"]

rfOutput, voxOutput, bizOutput, cnnOutput, foxOutput, tmzOutput = [], [], [], [], [], []
for word in politicalWords:
    randNum = random.randint(5, 9)
    newWords = generate_text(word, randNum, rfModel, maxSeqLength["rf"]).lower()
    rfOutput.append(newWords)
    
    randNum = random.randint(5, 9)
    newWords = generate_text(word, randNum, voxModel, maxSeqLength["vox"]).lower()
    voxOutput.append(newWords)

    randNum = random.randint(5, 9)
    newWords = generate_text(word, randNum, bizModel, maxSeqLength["biz"]).lower()
    bizOutput.append(newWords)
    
    randNum = random.randint(5, 9)
    newWords = generate_text(word, randNum, cnnModel, maxSeqLength["cnn"]).lower()
    cnnOutput.append(newWords)
    
    randNum = random.randint(5, 9)
    newWords = generate_text(word, randNum, foxModel, maxSeqLength["fox"]).lower()
    foxOutput.append(newWords)
    
    randNum = random.randint(5, 9)
    newWords = generate_text(word, randNum, tmzModel, maxSeqLength["tmz"]).lower()
    tmzOutput.append(newWords)

In [42]:
rfSamples = sources.loc[sources['publication'] == "Refinery 29"]
rfSamples = rfSamples.sample(n=25, random_state=40)
rfSamples = rfSamples.title

In [43]:
cnnSamples = sources.loc[sources['publication'] == "CNN"]
cnnSamples = cnnSamples.sample(n=25, random_state=40)
cnnSamples = cnnSamples.title

In [44]:
foxSamples = sources.loc[sources['publication'] == "Fox News"]
foxSamples = foxSamples.sample(n=25, random_state=40)
foxSamples = foxSamples.title

In [45]:
tmzSamples = sources.loc[sources['publication'] == "TMZ"]
tmzSamples = tmzSamples.sample(n=25, random_state=40)
tmzSamples = tmzSamples.title

In [46]:
voxSamples = sources.loc[sources['publication'] == "Vox"]
voxSamples = voxSamples.sample(n=25, random_state=40)
voxSamples = voxSamples.title

In [47]:
bizSamples = sources.loc[sources['publication'] == "Business Insider"]
bizSamples = bizSamples.sample(n=25, random_state=40)
bizSamples = bizSamples.title

In [None]:
print(calculate_bleu_score(rfOutput, rfSamples))
print(calculate_bleu_score(rfOutput, voxSamples))
print(calculate_bleu_score(rfOutput, cnnSamples))
print(calculate_bleu_score(rfOutput, foxSamples))
print(calculate_bleu_score(rfOutput, tmzSamples))
print(calculate_bleu_score(rfOutput, bizSamples))

In [None]:
print(calculate_bleu_score(refinerySources, rfGenText))
print(calculate_bleu_score(refinerySources, rfGenText))
print(calculate_bleu_score(cnnSources, rfGenText))

### selected news sources
all at least 20,000 publications

<b>Everything</b>
- Fox News (right - 20,144)
- Vox (left - )
- CNN (left center - 127,602)

<b>Entertainment News</b>
- TMZ (49,595)
- Refinery29 (111,433)

<b>Business</b>
- Business Insider (57,953)

### Don't need to run again period, just for initial set up

In [None]:
df2 = pd.read_csv("all-the-news-2-1.csv")
#sources_w_art = df2[['year', 'title', 'article', 'publication']]
sources = df2[['title', 'publication']]

In [None]:
selected = ['Fox News', 'Vox', 'CNN', 'TMZ', 'Refinery 29', 'Business Insider']
sources = sources.loc[sources['publication'].isin(selected)]
sources = sources[['title', 'publication']]

In [None]:
sources.to_csv('selected_sources.csv', index=False)

In [None]:
sources