# News Headline Generation

## Part 1: Data Preparation

In [1]:
import numpy as np
import matplotlib.pylab as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM, Embedding
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import RMSprop
from keras.applications.densenet import preprocess_input,decode_predictions
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import keras.utils as ku
#from keras.preprocessing.sequence import pad_sequencese




In [2]:
df2 = pd.read_csv("/kaggle/input/selected-sources/selected_sources.csv")
#sources_w_art = df2[['year', 'title', 'article', 'publication']]
sources = df2[['title', 'publication']]

In [3]:
'''
Should we remove punctuation? There might be some pros and cons, but references seem to remove it.

Here we are cleaning our data
'''


sources.head()
print(sources.shape)
sources = sources[sources['title'].apply(lambda x: isinstance(x, str))]
sources = sources[sources['title'].apply(lambda x: len(x.split()) <= 30)]
sources['title'] = sources['title'].apply(lambda x: x.lower())
sources['title'] = sources['title'].apply(lambda x: x.strip())

def removePunc(str):
  str = "".join(i for i in str if i not in string.punctuation)
  return str

sources["title"] = sources['title'].apply(lambda x: removePunc(x))
print(sources.head)

print(sources.shape)

(413999, 2)
<bound method NDFrame.head of                                                     title       publication
0       we should take concerns about the health of li...               Vox
1       colts gm ryan grigson says andrew lucks contra...  Business Insider
2       paris hilton woman in black for uncle montys f...               TMZ
3                 how to watch the google io keynote live               Vox
4       “elizabeth warren called me” is turning into a...               Vox
...                                                   ...               ...
413994  florida ammo selling out on heels of stayathom...               TMZ
413995  disney forcing annual pass holders to continue...               TMZ
413996  nick cannon pimps out his impala with custom n...               TMZ
413997  pete buttigieg says governors showing more lea...               TMZ
413998  ruth bader ginsburg still working out with tra...               TMZ

[413989 rows x 2 columns]>
(413989, 2)


In [4]:
foxSources = sources.loc[sources['publication'] == "Fox News"]
foxSources.reset_index(inplace=True)
voxSources = sources.loc[sources['publication'] == "Vox"]
cnnSources = sources.loc[sources['publication'] == "CNN"]
tmzSources = sources.loc[sources['publication'] == "TMZ"]
refinerySources = sources.loc[sources['publication'] == "Refinery 29"]
bizSources = sources.loc[sources['publication'] == "Business Insider"]

In [5]:
print(foxSources.shape)
print(voxSources.shape)
print(cnnSources.shape)
print(tmzSources.shape)
print(refinerySources.shape)
print(bizSources.shape)

(20144, 3)
(47272, 2)
(127594, 2)
(49595, 2)
(111432, 2)
(57952, 2)


### Tokenization & Flattening

In [22]:
vocabSize = 10000
tokenizer = Tokenizer(num_words=vocabSize, oov_token = "<OOV>")

def textToToken(df):
  # Updating tokenizer vocabulary to only contains words in df
  tokenizer.fit_on_texts(df["title"])
  # Getting the size of the vocabulary
  # vocabSize = len(tokenizer.word_index) + 1
  inputs = []
  for title in df['title']:
    # Converts all text into tokens in array form like [8, 9, 2, 10, 11, 3, 1]
    tokens = tokenizer.texts_to_sequences([title])[0]
    for x in range(1, len(tokens)):
      seq = tokens[:x+1]
      inputs.append(seq)
  return inputs #, vocabSize



### Padding

In [7]:
#padding sequences
#get input from output of tokenizer

def generate_padded_sequences(input_sequences, total_words):
    max_sequence_length = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen = max_sequence_length, padding = 'pre'))

    predictors = input_sequences[:,:-1]
    label = input_sequences[:,-1]

    label = ku.to_categorical(label, num_classes = total_words)
    return predictors, label, max_sequence_length

### Ensuring Meaningful Words

In [None]:
# we want to ignore any words that are in the context of OOV words because this will mess up the process of contextualizing surrounding words.
# this means that if a word appears AFTER an OOV word, we don't want to include their token as a possible option. in the event that this word
# appears in another context preceding an OOV word, its token will be added.
# we also want to ensure that we have a decent semantic construction around the word, so we should instill a minimum size that we will parse

def semanticCleaning()

## Part 2: Source Specific Models

### Fox

In [23]:
inputs = textToToken(foxSources)
predictors, label, max_sequence_length = generate_padded_sequences(inputs, vocabSize)


In [21]:
print(foxSources['title'][0])
print(foxSources['title'][1])
print(foxSources['title'][2])
print(foxSources['title'][3])

print(inputs[6][-1], "is preview")


inputs[:10]

baseball capsules
washington nationals at miami marlins game preview
metsbraves preview
cubsphillies preview
217 is preview


[[1492, 1],
 [335, 3466],
 [335, 3466, 11],
 [335, 3466, 11, 1377],
 [335, 3466, 11, 1377, 1],
 [335, 3466, 11, 1377, 1, 177],
 [335, 3466, 11, 1377, 1, 177, 217],
 [1, 217],
 [1, 217],
 [1, 217]]

In [24]:
print(vocabSize)
print(len(label[0]))
print("")
print(len(label))
print(len(inputs))
print("")
x = np.where(label[0] == 1)
print(x)

# this is checking that label works correctly. label is an array, 
# with n rows (n = # of inputs / len of inputs) and vocabSize columns.
# for each input, there is a 1 at the index corresponding to the ending index on
# the input (eg for input 0, there is a 1 at 9288) and a 0 at all other indices.
# the label denotes what the ending word is (bc the index is where the word is
# located)

5000
5000

162682
162682

(array([3465]),)


In [25]:
foxModel = Sequential()
input_len = max_sequence_length - 1
foxModel.add(Embedding(vocabSize, 10, input_length = input_len))
foxModel.add(Dropout(0.4)) # dropout to avoid overfitting
foxModel.add(LSTM(100))    
foxModel.add(Dropout(0.4))
foxModel.add(Dense(vocabSize, activation='softmax'))


foxModel.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
  
  

In [26]:
foxModel.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 10)            50000     
                                                                 
 dropout_2 (Dropout)         (None, 20, 10)            0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               44400     
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 5000)              505000    
                                                                 
Total params: 599400 (2.29 MB)
Trainable params: 599400 (2.29 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
foxHistory = foxModel.fit(predictors, label, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [29]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list)
        idx = np.argmax(predicted)
        next = tokenizer.sequences_to_texts([[idx]])
#         for word,index in tokenizer.word_index.items():
#             if index == idx:
#                 output_word = word
#                 break
        seed_text += " "+next[0]
    return seed_text.title()

In [30]:
print(generate_text("united states", 5, foxModel, max_sequence_length))
print(generate_text("donald trump", 5, foxModel, max_sequence_length))



United States To Be In The Us
Donald Trump Is A A The Of


### CNN

In [None]:
#Model

def create_model(max_sequence_length, total_words):
    input_len = max_sequence_length -1

    model = Sequential()


    #Embedding Layer
    model.add(Embedding(total_words, 10, input_length = input_len))

    model.add(Dropout(0.1))

    #Hidden Layer 1
    model.add(LSTM(100))
    model.add(Dropout(0.1))


    model.add(Dense(total_words, activation = 'softmax'))

    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

    return model

model = create_model(max_sequence_length, vocabSize)





### Vox

### Model

In [None]:
max_word_count = 0
title_with_most_words = ""
for title in sources['title']:
    if type(title) == float:
      print(title)
    else:
      words = title.split()
    
    # Get the word count for the current title
    word_count = len(words)
    
    # Check if the current title has more words than the previous maximum
    if word_count > max_word_count:
        max_word_count = word_count
        title_with_most_words = title
print(title_with_most_words, max_word_count)

In [None]:
classes = np.unique(sources['publication'], return_counts=True)
classes

In [None]:
# Split the data into training and test sets, stratified by the 'publication' category
X_train, X_test, y_train, y_test = train_test_split(
    sources['title'],  # Features
    sources['publication'],  # Target variable
    test_size=0.1,  # 10% for the test set
    stratify=sources['publication'],  # Stratify by 'publication' category
    random_state=42  # Set a random seed for reproducibility
)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
max_num_words = 10000
seq_len = 40
embedding_size = 100

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_num_words) #Tokenizer is used to tokenize text
tokenizer.fit_on_texts(X_train) #Fit this to our corpus

x_train = tokenizer.texts_to_sequences(X_train) #'text to sequences converts the text to a list of indices
x_train = pad_sequences(x_train, maxlen=40) #pad_sequences makes every sequence a fixed size list by padding with 0s 


x_test = tokenizer.texts_to_sequences(X_test) 
x_test = pad_sequences(x_test, maxlen=40)

x_train.shape, x_test.shape # Check the dimensions of x_train and x_test 

In [None]:
x_train[2]

In [None]:
unique_labels = list(y_train.unique())
print(unique_labels)

In [None]:
num_classes = len(np.unique(y_train))

# Tokenize and pad your sequences as you have done before

# Create a label encoder to encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Create the LSTM model
model = Sequential()

# Add an embedding layer to convert words to dense vectors
model.add(Embedding(input_dim=max_num_words, output_dim=embedding_size, input_length=seq_len))

# Add an LSTM layer
model.add(LSTM(64, return_sequences=False))

# Add a dense layer with softmax activation for classification
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train_encoded, epochs=10, batch_size=32, validation_data=(x_test, y_test_encoded))

# Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test_encoded)
print(f'Loss: {loss}, Accuracy: {accuracy}')

### selected news sources
all at least 20,000 publications

<b>Everything</b>
- Fox News (right - 20,144)
- Vox (left - )
- CNN (left center - 127,602)

<b>Entertainment News</b>
- TMZ (49,595)
- Refinery29 (111,433)

<b>Business</b>
- Business Insider (57,953)

### Don't need to run again period, just for initial set up

In [None]:
df2 = pd.read_csv("all-the-news-2-1.csv")
#sources_w_art = df2[['year', 'title', 'article', 'publication']]
sources = df2[['title', 'publication']]

In [None]:
selected = ['Fox News', 'Vox', 'CNN', 'TMZ', 'Refinery 29', 'Business Insider']
sources = sources.loc[sources['publication'].isin(selected)]
sources = sources[['title', 'publication']]

In [None]:
sources.to_csv('selected_sources.csv', index=False)

In [None]:
sources