# News Headline Generation

## Part 1: Data Preparation

In [6]:
import numpy as np
import matplotlib.pylab as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, LSTM, Embedding
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import RMSprop
from keras.applications.densenet import preprocess_input, decode_predictions
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences


In [14]:
df2 = pd.read_csv("selected_sources.csv")
#sources_w_art = df2[['year', 'title', 'article', 'publication']]
sources = df2[['title', 'publication']]

In [15]:
sources.head()
print(sources.shape)
sources = sources[sources['title'].apply(lambda x: isinstance(x, str))]
sources = sources[sources['title'].apply(lambda x: len(x.split()) <= 30)]
print(sources.shape)

(1206821, 2)
(1206803, 2)


### Model

In [16]:
max_word_count = 0
title_with_most_words = ""
for title in sources['title']:
    if type(title) == float:
      print(title)
    else:
      words = title.split()
    
    # Get the word count for the current title
    word_count = len(words)
    
    # Check if the current title has more words than the previous maximum
    if word_count > max_word_count:
        max_word_count = word_count
        title_with_most_words = title
print(title_with_most_words, max_word_count)

BRIEF-Novocure presents phase 2 pilot innovate trial results suggesting tumor treating fields plus paclitaxel may be safe as first-line treatment and may improve survival of patients with recurrent ovarian cancer 30


In [17]:
classes = np.unique(sources['publication'], return_counts=True)
classes

(array(['Business Insider', 'CNN', 'Fox News', 'Refinery 29', 'Reuters',
        'TMZ'], dtype=object),
 array([ 57952, 127594,  20144, 111432, 840086,  49595]))

In [27]:
# Split the data into training and test sets, stratified by the 'publication' category
X_train, X_test, y_train, y_test = train_test_split(
    sources['title'],  # Features
    sources['publication'],  # Target variable
    test_size=0.1,  # 10% for the test set
    stratify=sources['publication'],  # Stratify by 'publication' category
    random_state=42  # Set a random seed for reproducibility
)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1086122,)
(120681,)
(1086122,)
(120681,)


In [34]:
max_num_words = 10000
seq_len = 40
embedding_size = 100

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_num_words) #Tokenizer is used to tokenize text
tokenizer.fit_on_texts(X_train) #Fit this to our corpus

x_train = tokenizer.texts_to_sequences(X_train) #'text to sequences converts the text to a list of indices
x_train = pad_sequences(x_train, maxlen=40) #pad_sequences makes every sequence a fixed size list by padding with 0s 


x_test = tokenizer.texts_to_sequences(X_test) 
x_test = pad_sequences(x_test, maxlen=40)

x_train.shape, x_test.shape # Check the dimensions of x_train and x_test 

((1086122, 40), (120681, 40))

In [35]:
x_train[2]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,  331,  764, 2170,    6,
        114,  281,  870,  558, 1391,   83,   69], dtype=int32)

In [36]:
unique_labels = list(y_train.unique())
print(unique_labels)

['TMZ', 'Reuters', 'Refinery 29', 'CNN', 'Fox News', 'Business Insider']


In [40]:
num_classes = len(np.unique(y_train))

# Tokenize and pad your sequences as you have done before

# Create a label encoder to encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Create the LSTM model
model = Sequential()

# Add an embedding layer to convert words to dense vectors
model.add(Embedding(input_dim=max_num_words, output_dim=embedding_size, input_length=seq_len))

# Add an LSTM layer
model.add(LSTM(64, return_sequences=False))

# Add a dense layer with softmax activation for classification
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train_encoded, epochs=10, batch_size=32, validation_data=(x_test, y_test_encoded))

# Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test_encoded)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.4504948854446411, Accuracy: 0.882094144821167


### selected news sources
all at least 20,000 publications

<b>Everything</b>
- Fox News (right - 20,144)
- Reuters (center - 840,094)
- CNN (left - 127,602)

<b>Entertainment News</b>
- TMZ (49,595)
- Refinery29 (111,433)

<b>Business</b>
- Business Insider (57,953)

### Don't need to run again, just for initial set up

In [7]:
df2 = pd.read_csv("all-the-news-2-1.csv")
#sources_w_art = df2[['year', 'title', 'article', 'publication']]
sources = df2[['title', 'publication']]

In [8]:
selected = ['Fox News', 'Vox', 'CNN', 'TMZ', 'Refinery 29', 'Business Insider']
sources = sources.loc[sources['publication'].isin(selected)]
sources = sources[['title', 'publication']]

In [9]:
sources.to_csv('selected_sources.csv', index=False)

In [10]:
sources

Unnamed: 0,title,publication
0,We should take concerns about the health of li...,Vox
1,Colts GM Ryan Grigson says Andrew Luck's contr...,Business Insider
4,Paris Hilton: Woman In Black For Uncle Monty's...,TMZ
8,How to watch the Google I/O keynote live,Vox
10,“Elizabeth Warren called me!” is turning into ...,Vox
...,...,...
2688873,Florida Ammo Selling Out On Heels of Stay-At-H...,TMZ
2688874,Disney Forcing Annual Pass Holders to Continue...,TMZ
2688875,Nick Cannon Pimps Out His Impala with Custom N...,TMZ
2688876,Pete Buttigieg Says Governors Showing More Lea...,TMZ
