<a href="https://colab.research.google.com/github/samlawson1/news/blob/TENSORFLOW_MODEL/tensorflow_model_train/NYT_TensorFlow_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TensorFlow Text Classification Model**

- Training Dataset: [A Million News Headlines - Kaggle](https://www.kaggle.com/datasets/therohk/million-headlines) - Labeled & Filtered with Label_Text_For_Training.ipynb notebook




In [2]:
import os
import pandas as pd
import numpy as np
import random
import math
import tensorflow as tf

tf.get_logger().setLevel('ERROR')

In [3]:
#Read in dataset with pandas and show values
file = r'drive/MyDrive/tf_text_analysis/train_test_data.csv'
df = pd.read_csv(file)
print(df.shape)
df.head()


(200000, 4)


Unnamed: 0,publish_date,headline_text,AfinnScore,Label
0,20030219,barca take record as robson celebrates birthda...,3.0,1
1,20030219,big hopes for launceston cycling championship,3.0,1
2,20030219,big plan to boost paroo water supplies,2.0,1
3,20030219,brigadier dismisses reports troops harassed in,-3.0,0
4,20030219,businesses should prepare for terrorist attacks,-3.0,0


In [4]:
#Split into train, validation, and test datasets

pos = df.loc[df['Label'] == 1].reset_index(drop = True)
neg = df.loc[df['Label'] == 0].reset_index(drop = True)

#random shuffle datasets so they aren't ordered by date

pos_i = list(pos.index)
neg_i = list(neg.index)

random.shuffle(pos_i)
random.shuffle(neg_i)

#use iloc to reorder and create new index
pos = pos.iloc[pos_i].reset_index(drop = True)
neg = neg.iloc[neg_i].reset_index(drop = True)

#100K positive and negative records
print(len(pos), len(neg))

#Split into train and test - 80% Train / 20% Test
#Function
def split_data(df, num):
  split_point = math.ceil(len(df) * num)
  train_text = list(df.iloc[:split_point]['headline_text'])
  train_labels = list(df.iloc[:split_point]['Label'])

  test_text = list(df.iloc[split_point:]['headline_text'])
  test_labels = list(df.iloc[split_point:]['Label'])

  return(train_text, train_labels, test_text, test_labels)


100000 100000


In [5]:
pos_train_text, pos_train_labels, pos_test_text, pos_text_labels = split_data(pos, 0.8)
neg_train_text, neg_train_labels, neg_test_text, neg_text_labels = split_data(neg, 0.8)

#test data
test_text = pos_test_text + neg_test_text
test_labels = pos_text_labels + neg_text_labels

train_text = pos_train_text + neg_train_text
train_labels = pos_train_labels + neg_train_labels

print(f'Train Data Size: {len(train_text)}')
print(f'Test Data Size: {len(test_text)}')
print(f'Total Data Check: {len(train_text) + len(test_text)}')

Train Data Size: 160000
Test Data Size: 40000
Total Data Check: 200000


In [6]:
#Create validation subset of train data

#Make df to reshuffle pos and neg

train_df = pd.DataFrame({
    'headline_text':train_text,
    'Label':train_labels
})

train_pos = train_df.loc[train_df['Label'] == 1].reset_index(drop = True)
train_neg = train_df.loc[train_df['Label'] == 0].reset_index(drop = True)

pos_train_text_final, pos_train_labels_final, pos_val_text, pos_val_labels = split_data(train_pos, 0.8)
neg_train_text_final, neg_train_labels_final, neg_val_text, neg_val_labels = split_data(train_neg, 0.8)

train_text_final = pos_train_text_final + neg_train_text_final
train_labels_final = pos_train_labels_final + neg_train_labels_final

val_text = pos_val_text + neg_val_text
val_labels = pos_val_labels + neg_val_labels


#Make labels numpy arrays
test_labels = np.array(test_labels)
train_labels_final = np.array(train_labels_final)
val_labels = np.array(val_labels)

In [7]:
print(f'Final Training Data Volume: {len(train_text_final)}')
print(f'Validation Data Volume: {len(val_text)}')
print(f'Testing Data Volume: {len(test_text)}')

Final Training Data Volume: 128000
Validation Data Volume: 32000
Testing Data Volume: 40000


In [8]:
#initiate Tokenizer object
vocab = 5000
max_len = 10
# tokenizer = Tokenizer(num_words=vocab,oov_token='<OOV>')
# tokenizer.fit_on_texts(train_text)
# word_index = tokenizer.word_index

# #training sequences
# train_sequences = tokenizer.texts_to_sequences(train_text_final)
# train_pad = pad_sequences(train_sequences, padding='post', maxlen = max_len)

# #validation sequences
# val_sequences = tokenizer.texts_to_sequences(val_text)
# val_pad = pad_sequences(val_sequences, padding='post', maxlen = max_len)

# #test sequences
# test_sequences = tokenizer.texts_to_sequences(test_text)
# test_pad = pad_sequences(test_sequences, padding = 'post', maxlen = max_len)




In [9]:
#Create text vectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    output_mode='int',
    output_sequence_length = max_len)

vectorize_layer.adapt(train_text)

In [10]:
#Test
vectorize_layer([
                ["I like bananas cause they're sweet"],
                ["The sun is so warm"],
                ["People say reading books makes you smart"]
                ])

<tf.Tensor: shape=(3, 10), dtype=int64, numpy=
array([[1021,  394,    1, 1370,    1, 1912,    0,    0,    0,    0],
       [   7, 2197,   51, 1703, 1120,    0,    0,    0,    0,    0],
       [ 270,  244,    1, 2787,  346,  573, 1638,    0,    0,    0]])>

In [11]:
#format text for vectorization
train_text_vector_layer = np.array([[t] for t in train_text_final])
val_text_vector_layer = np.array([[t] for t in val_text])


In [12]:
#Create Model
model = tf.keras.Sequential([
    vectorize_layer,
    tf.keras.layers.Embedding(vocab, 5, input_length = max_len),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
#Train and Validate
num_epochs = 30
history = model.fit(train_text_vector_layer, train_labels_final, epochs = num_epochs,
                    validation_data=(val_text_vector_layer, val_labels), verbose=2)

Epoch 1/30
4000/4000 - 21s - loss: 0.2378 - accuracy: 0.9134 - val_loss: 0.1321 - val_accuracy: 0.9628 - 21s/epoch - 5ms/step
Epoch 2/30
4000/4000 - 21s - loss: 0.1228 - accuracy: 0.9645 - val_loss: 0.1260 - val_accuracy: 0.9639 - 21s/epoch - 5ms/step
Epoch 3/30
4000/4000 - 26s - loss: 0.1145 - accuracy: 0.9668 - val_loss: 0.1205 - val_accuracy: 0.9663 - 26s/epoch - 7ms/step
Epoch 4/30
4000/4000 - 22s - loss: 0.1087 - accuracy: 0.9683 - val_loss: 0.1167 - val_accuracy: 0.9676 - 22s/epoch - 6ms/step
Epoch 5/30
4000/4000 - 16s - loss: 0.1034 - accuracy: 0.9695 - val_loss: 0.1151 - val_accuracy: 0.9678 - 16s/epoch - 4ms/step
Epoch 6/30
4000/4000 - 17s - loss: 0.0990 - accuracy: 0.9698 - val_loss: 0.1145 - val_accuracy: 0.9685 - 17s/epoch - 4ms/step
Epoch 7/30
4000/4000 - 17s - loss: 0.0951 - accuracy: 0.9705 - val_loss: 0.1163 - val_accuracy: 0.9678 - 17s/epoch - 4ms/step
Epoch 8/30
4000/4000 - 22s - loss: 0.0915 - accuracy: 0.9707 - val_loss: 0.1163 - val_accuracy: 0.9681 - 22s/epoch - 5

In [14]:
test_text_eval = np.array([[x] for x in test_text])
#Test - 95% accuracy!
loss, accuracy = model.evaluate(test_text_eval, test_labels)
print('Loss: ', loss)
print('Accuracy: ', accuracy)

Loss:  0.19560609757900238
Accuracy:  0.9528499841690063


In [20]:
#Save
dir = 'drive/MyDrive/nyt_text_analysis'
model_folder = 'model'
save_path = os.path.join(dir, model_folder)
if not os.path.exists(save_path):
  os.mkdir(save_path)




In [21]:
save_model = os.path.join(save_path, 'model.keras')
model.save(save_model)