# News Headline Generation

## Part 1: Data Preparation

In [111]:
import numpy as np
import matplotlib.pylab as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, LSTM, Embedding
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import RMSprop
from keras.applications.densenet import preprocess_input,decode_predictions
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import keras.utils as ku
#from keras.preprocessing.sequence import pad_sequencese


In [49]:
df2 = pd.read_csv("selected_sources.csv")
#sources_w_art = df2[['year', 'title', 'article', 'publication']]
sources = df2[['title', 'publication']]

In [88]:
'''
Should we remove punctuation? There might be some pros and cons, but references seem to remove it.

Here we are cleaning our data
'''


sources.head()
print(sources.shape)
sources = sources[sources['title'].apply(lambda x: isinstance(x, str))]
sources = sources[sources['title'].apply(lambda x: len(x.split()) <= 30)]
sources['title'] = sources['title'].apply(lambda x: x.lower())
sources['title'] = sources['title'].apply(lambda x: x.strip())

def removePunc(str):
  str = "".join(i for i in str if i not in string.punctuation)
  return str

sources["title"] = sources['title'].apply(lambda x: removePunc(x))
print(sources.head)

print(sources.shape)

(413989, 2)
<bound method NDFrame.head of                                                     title       publication
0       we should take concerns about the health of li...               Vox
1       colts gm ryan grigson says andrew lucks contra...  Business Insider
2       paris hilton woman in black for uncle montys f...               TMZ
3                 how to watch the google io keynote live               Vox
4       “elizabeth warren called me” is turning into a...               Vox
...                                                   ...               ...
413994  florida ammo selling out on heels of stayathom...               TMZ
413995  disney forcing annual pass holders to continue...               TMZ
413996  nick cannon pimps out his impala with custom n...               TMZ
413997  pete buttigieg says governors showing more lea...               TMZ
413998  ruth bader ginsburg still working out with tra...               TMZ

[413989 rows x 2 columns]>
(413989, 2)


In [70]:
foxSources = sources.loc[sources['publication'] == "Fox News"]
foxSources.reset_index(inplace=True)
voxSources = sources.loc[sources['publication'] == "Vox"]
cnnSources = sources.loc[sources['publication'] == "CNN"]
tmzSources = sources.loc[sources['publication'] == "TMZ"]
refinerySources = sources.loc[sources['publication'] == "Refinery 29"]
bizSources = sources.loc[sources['publication'] == "Business Insider"]

In [77]:
print(foxSources.shape)
print(voxSources.shape)
print(cnnSources.shape)
print(tmzSources.shape)
print(refinerySources.shape)
print(bizSources.shape)

(20144, 3)
(47272, 2)
(127594, 2)
(49595, 2)
(111432, 2)
(57952, 2)
        index                                              title publication
0      276117                                  baseball capsules    Fox News
1      276118  washington nationals at miami marlins game pre...    Fox News
2      276119                                mets-braves preview    Fox News
3      276120                              cubs-phillies preview    Fox News
4      276121                            brewers-pirates preview    Fox News
...       ...                                                ...         ...
20139  296313     missing mom case leads fbi to georgia landfill    Fox News
20140  296314  kim jong un's bizarre north korea propaganda p...    Fox News
20141  296315  outrage over indian flag doormats on sale on a...    Fox News
20142  296316  golf on the moon: how alan shepard tricked nas...    Fox News
20143  296317                    fox news sunday local air times    Fox News

[20144 

In [96]:
tokenizer = Tokenizer()

def textToToken(df):
  # Updating tokenizer vocabulary to only contains words in df
  tokenizer.fit_on_texts(df["title"])
  # Getting the size of the vocabulary
  vocabSize = len(tokenizer.word_index) + 1
  inputs = []
  for title in df['title']:
    # Converts all text into tokens in array form like [8, 9, 2, 10, 11, 3, 1]
    tokens = tokenizer.texts_to_sequences([title])[0]
    for x in range(1, len(tokens)):
      nGrams = tokens[:x+1]
      inputs.append(nGrams)
  return inputs, vocabSize


In [120]:
#padding sequences
#get input from output of tokenizer

def generate_padded_sequences(input_sequences, total_words):
    max_sequence_length = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen = max_sequence_length, padding = 'pre'))

    predictors = input_sequences[:,:-1]
    label = input_sequences[:,-1]

    label = ku.to_categorical(label, num_classes = total_words)
    return predictors, label, max_sequence_length

### Fox

In [123]:
inputs, vocabSize = textToToken(foxSources)
predictors, label, max_sequence_length = generate_padded_sequences(inputs, vocabSize)
label

[[1493, 9288], [343, 2574], [343, 2574, 10], [343, 2574, 10, 1372], [343, 2574, 10, 1372, 3797], [343, 2574, 10, 1372, 3797, 199], [343, 2574, 10, 1372, 3797, 199, 222], [2972, 4711], [2972, 4711, 222], [2973, 5344], [2973, 5344, 222], [2575, 4193], [2575, 4193, 222], [1373, 4194], [1373, 4194, 222], [1681, 427], [1681, 427, 3467], [1681, 427, 3467, 10], [1681, 427, 3467, 10, 4195], [1681, 427, 3467, 10, 4195, 1682], [1681, 427, 3467, 10, 4195, 1682, 3199], [1681, 427, 3467, 10, 4195, 1682, 3199, 199], [1681, 427, 3467, 10, 4195, 1682, 3199, 199, 222], [3200, 1942], [3200, 1942, 222], [222, 2575], [222, 2575, 10], [222, 2575, 10, 1942], [1258, 4196], [1258, 4196, 222], [1854, 113], [1854, 113, 3467], [1854, 113, 3467, 222], [222, 2575], [222, 2575, 638], [222, 2575, 638, 2973], [3201, 2974], [3201, 2974, 222], [7424, 2759], [7424, 2759, 222], [222, 2575], [222, 2575, 638], [222, 2575, 638, 5344], [2574, 3797], [2574, 3797, 222], [906, 6200], [906, 6200, 2975], [906, 6200, 2975, 222], [

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### CNN

In [None]:
#Model

def create_model(max_sequence_length, total_words):
    input_len = max_sequence_length -1

    model = Sequential()


    #Embedding Layer
    model.add(Embedding(total_words, 10, input_length = input_len))

    model.add(Dropout(0.1))

    #Hidden Layer 1
    model.add(LSTM(100))
    model.add(Dropout(0.1))


    model.add(Dense(total_words, activation = 'softmax'))

    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

    return model

model = create_model(max_sequence_length, vocabSize)





### Vox

### Model

In [16]:
max_word_count = 0
title_with_most_words = ""
for title in sources['title']:
    if type(title) == float:
      print(title)
    else:
      words = title.split()
    
    # Get the word count for the current title
    word_count = len(words)
    
    # Check if the current title has more words than the previous maximum
    if word_count > max_word_count:
        max_word_count = word_count
        title_with_most_words = title
print(title_with_most_words, max_word_count)

BRIEF-Novocure presents phase 2 pilot innovate trial results suggesting tumor treating fields plus paclitaxel may be safe as first-line treatment and may improve survival of patients with recurrent ovarian cancer 30


In [17]:
classes = np.unique(sources['publication'], return_counts=True)
classes

(array(['Business Insider', 'CNN', 'Fox News', 'Refinery 29', 'Reuters',
        'TMZ'], dtype=object),
 array([ 57952, 127594,  20144, 111432, 840086,  49595]))

In [27]:
# Split the data into training and test sets, stratified by the 'publication' category
X_train, X_test, y_train, y_test = train_test_split(
    sources['title'],  # Features
    sources['publication'],  # Target variable
    test_size=0.1,  # 10% for the test set
    stratify=sources['publication'],  # Stratify by 'publication' category
    random_state=42  # Set a random seed for reproducibility
)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1086122,)
(120681,)
(1086122,)
(120681,)


In [34]:
max_num_words = 10000
seq_len = 40
embedding_size = 100

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_num_words) #Tokenizer is used to tokenize text
tokenizer.fit_on_texts(X_train) #Fit this to our corpus

x_train = tokenizer.texts_to_sequences(X_train) #'text to sequences converts the text to a list of indices
x_train = pad_sequences(x_train, maxlen=40) #pad_sequences makes every sequence a fixed size list by padding with 0s 


x_test = tokenizer.texts_to_sequences(X_test) 
x_test = pad_sequences(x_test, maxlen=40)

x_train.shape, x_test.shape # Check the dimensions of x_train and x_test 

((1086122, 40), (120681, 40))

In [35]:
x_train[2]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,  331,  764, 2170,    6,
        114,  281,  870,  558, 1391,   83,   69], dtype=int32)

In [36]:
unique_labels = list(y_train.unique())
print(unique_labels)

['TMZ', 'Reuters', 'Refinery 29', 'CNN', 'Fox News', 'Business Insider']


In [40]:
num_classes = len(np.unique(y_train))

# Tokenize and pad your sequences as you have done before

# Create a label encoder to encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Create the LSTM model
model = Sequential()

# Add an embedding layer to convert words to dense vectors
model.add(Embedding(input_dim=max_num_words, output_dim=embedding_size, input_length=seq_len))

# Add an LSTM layer
model.add(LSTM(64, return_sequences=False))

# Add a dense layer with softmax activation for classification
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train_encoded, epochs=10, batch_size=32, validation_data=(x_test, y_test_encoded))

# Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test_encoded)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.4504948854446411, Accuracy: 0.882094144821167


### selected news sources
all at least 20,000 publications

<b>Everything</b>
- Fox News (right - 20,144)
- Vox (left - )
- CNN (left center - 127,602)

<b>Entertainment News</b>
- TMZ (49,595)
- Refinery29 (111,433)

<b>Business</b>
- Business Insider (57,953)

### Don't need to run again period, just for initial set up

In [7]:
df2 = pd.read_csv("all-the-news-2-1.csv")
#sources_w_art = df2[['year', 'title', 'article', 'publication']]
sources = df2[['title', 'publication']]

FileNotFoundError: [Errno 2] No such file or directory: 'all-the-news-2-1.csv'

In [8]:
selected = ['Fox News', 'Vox', 'CNN', 'TMZ', 'Refinery 29', 'Business Insider']
sources = sources.loc[sources['publication'].isin(selected)]
sources = sources[['title', 'publication']]

In [9]:
sources.to_csv('selected_sources.csv', index=False)

In [10]:
sources

Unnamed: 0,title,publication
0,We should take concerns about the health of li...,Vox
1,Colts GM Ryan Grigson says Andrew Luck's contr...,Business Insider
4,Paris Hilton: Woman In Black For Uncle Monty's...,TMZ
8,How to watch the Google I/O keynote live,Vox
10,“Elizabeth Warren called me!” is turning into ...,Vox
...,...,...
2688873,Florida Ammo Selling Out On Heels of Stay-At-H...,TMZ
2688874,Disney Forcing Annual Pass Holders to Continue...,TMZ
2688875,Nick Cannon Pimps Out His Impala with Custom N...,TMZ
2688876,Pete Buttigieg Says Governors Showing More Lea...,TMZ
