<a href="https://colab.research.google.com/github/samlawson1/news/blob/TENSORFLOW_MODEL/tensorflow_model_train/NYT_TensorFlow_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TensorFlow Text Classification Model**

- Training Dataset: [A Million News Headlines - Kaggle](https://www.kaggle.com/datasets/therohk/million-headlines), labeled and filtered with the `Label_Text_For_Training.ipynb` notebook

1. Install dependencies [per TensorFlow docs](https://www.tensorflow.org/text/tutorials/classify_text_with_bert).


> Google Colab currently runs TensorFlow v2.15.0, so this notebook uses that version instead of the v2.13.0 referenced in the docs.



In [1]:
# Optional dependency installs, kept commented out; uncomment and run them if
# these packages are missing from the runtime.
# !pip install -q tfds-nightly
# !pip install -U "tensorflow-text==2.15.*"
# !pip install "tf-models-official==2.15.*"


In [2]:
import os
import pandas as pd
import numpy as np
import random
import math
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from official.nlp import optimization

#Limit TensorFlow's logger to ERROR-level messages
tf.get_logger().setLevel('ERROR')

In [3]:
#Read in dataset with pandas and show values
#Path is relative to the working directory - presumably a mounted Google Drive in Colab; verify before running
file = r'drive/MyDrive/tf_text_analysis/train_test_data.csv'
df = pd.read_csv(file)
#Print (rows, columns), then display the first five rows as the cell output
print(df.shape)
df.head()


(200000, 4)


Unnamed: 0,publish_date,headline_text,AfinnScore,Label
0,20030219,barca take record as robson celebrates birthda...,3.0,1
1,20030219,big hopes for launceston cycling championship,3.0,1
2,20030219,big plan to boost paroo water supplies,2.0,1
3,20030219,brigadier dismisses reports troops harassed in,-3.0,0
4,20030219,businesses should prepare for terrorist attacks,-3.0,0


In [4]:
#Split into train, validation, and test datasets

#Separate the two classes so each one can be shuffled and split on its own
pos = df.loc[df['Label'] == 1].reset_index(drop = True)
neg = df.loc[df['Label'] == 0].reset_index(drop = True)

#random shuffle datasets so they aren't ordered by date
#Seed the generator first so the shuffle - and every split built from it -
#comes out the same after a kernel restart
SHUFFLE_SEED = 42
random.seed(SHUFFLE_SEED)

pos_i = list(pos.index)
neg_i = list(neg.index)

random.shuffle(pos_i)
random.shuffle(neg_i)

#use iloc to reorder and create new index
pos = pos.iloc[pos_i].reset_index(drop = True)
neg = neg.iloc[neg_i].reset_index(drop = True)

#100K positive and negative records
print(len(pos), len(neg))

#Split into train and test - 80% Train / 20% Test
#Function
def split_data(df, num):
  """Cut a labeled headline frame into a leading and a trailing portion.

  Args:
    df: DataFrame holding 'headline_text' and 'Label' columns.
    num: Fraction of rows, counted from the top, that go into the first
      portion; the row count is rounded up with math.ceil.

  Returns:
    Tuple (train_text, train_labels, test_text, test_labels) of lists.
  """
  cutoff = math.ceil(len(df) * num)
  leading = df.iloc[:cutoff]
  trailing = df.iloc[cutoff:]

  train_text, train_labels = list(leading['headline_text']), list(leading['Label'])
  test_text, test_labels = list(trailing['headline_text']), list(trailing['Label'])
  return train_text, train_labels, test_text, test_labels


100000 100000


In [5]:
#Per-class 80/20 split, so train and test each keep the same share of both labels
(pos_train_text, pos_train_labels,
 pos_test_text, pos_text_labels) = split_data(pos, 0.8)
(neg_train_text, neg_train_labels,
 neg_test_text, neg_text_labels) = split_data(neg, 0.8)

#train data - positive records first, negative records after
train_text = [*pos_train_text, *neg_train_text]
train_labels = [*pos_train_labels, *neg_train_labels]

#test data - same ordering
test_text = [*pos_test_text, *neg_test_text]
test_labels = [*pos_text_labels, *neg_text_labels]

#Sanity check: the two sizes should add up to the full dataset
print(f'Train Data Size: {len(train_text)}')
print(f'Test Data Size: {len(test_text)}')
print(f'Total Data Check: {len(train_text) + len(test_text)}')

Train Data Size: 160000
Test Data Size: 40000
Total Data Check: 200000


In [6]:
#Create validation subset of train data

#Rebuild a frame from the combined training lists so each class can be split again
train_df = pd.DataFrame({'headline_text': train_text, 'Label': train_labels})

label_col = train_df['Label']
train_pos = train_df[label_col == 1].reset_index(drop = True)
train_neg = train_df[label_col == 0].reset_index(drop = True)

#First 80% of each class stays in training, the remaining 20% becomes validation
(pos_train_text_final, pos_train_labels_final,
 pos_val_text, pos_val_labels) = split_data(train_pos, 0.8)
(neg_train_text_final, neg_train_labels_final,
 neg_val_text, neg_val_labels) = split_data(train_neg, 0.8)

train_text_final = [*pos_train_text_final, *neg_train_text_final]
val_text = [*pos_val_text, *neg_val_text]

#Labels are joined in the same positive-then-negative order and stored as numpy arrays
train_labels_final = np.array(pos_train_labels_final + neg_train_labels_final)
val_labels = np.array(pos_val_labels + neg_val_labels)
test_labels = np.array(test_labels)

In [7]:
#Report the number of headlines in each final split
print(f'Final Training Data Volume: {len(train_text_final)}')
print(f'Validation Data Volume: {len(val_text)}')
print(f'Testing Data Volume: {len(test_text)}')

Final Training Data Volume: 128000
Validation Data Volume: 32000
Testing Data Volume: 40000


In [19]:
#initiate Tokenizer object
vocab = 5000   #vocabulary size handed to the Tokenizer as num_words
max_len = 10   #maxlen passed to pad_sequences for every split
tokenizer = Tokenizer(num_words=vocab,oov_token='<OOV>')
#Fit on the final training split only. train_text still contains the validation
#headlines, so fitting on it would leak validation vocabulary into preprocessing.
tokenizer.fit_on_texts(train_text_final)
word_index = tokenizer.word_index

#training sequences
train_sequences = tokenizer.texts_to_sequences(train_text_final)
train_pad = pad_sequences(train_sequences, padding='post', maxlen = max_len)

#validation sequences
val_sequences = tokenizer.texts_to_sequences(val_text)
val_pad = pad_sequences(val_sequences, padding='post', maxlen = max_len)

#test sequences
test_sequences = tokenizer.texts_to_sequences(test_text)
test_pad = pad_sequences(test_sequences, padding = 'post', maxlen = max_len)




In [24]:
#Create Model
#Layers are stacked one by one: token embedding, average over the sequence, two dense layers
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab, 5, input_length = max_len))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
#Single sigmoid unit to match the binary_crossentropy loss
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [25]:
#Train and Validate
num_epochs = 30
history = model.fit(
    x = train_pad,
    y = train_labels_final,
    validation_data = (val_pad, val_labels),
    epochs = num_epochs,
    verbose = 2,
)

Epoch 1/30
4000/4000 - 9s - loss: 0.2427 - accuracy: 0.9103 - val_loss: 0.1339 - val_accuracy: 0.9611 - 9s/epoch - 2ms/step
Epoch 2/30
4000/4000 - 11s - loss: 0.1210 - accuracy: 0.9648 - val_loss: 0.1229 - val_accuracy: 0.9666 - 11s/epoch - 3ms/step
Epoch 3/30
4000/4000 - 8s - loss: 0.1130 - accuracy: 0.9673 - val_loss: 0.1202 - val_accuracy: 0.9660 - 8s/epoch - 2ms/step
Epoch 4/30
4000/4000 - 9s - loss: 0.1082 - accuracy: 0.9690 - val_loss: 0.1159 - val_accuracy: 0.9676 - 9s/epoch - 2ms/step
Epoch 5/30
4000/4000 - 9s - loss: 0.1026 - accuracy: 0.9698 - val_loss: 0.1143 - val_accuracy: 0.9676 - 9s/epoch - 2ms/step
Epoch 6/30
4000/4000 - 8s - loss: 0.0985 - accuracy: 0.9706 - val_loss: 0.1152 - val_accuracy: 0.9679 - 8s/epoch - 2ms/step
Epoch 7/30
4000/4000 - 9s - loss: 0.0946 - accuracy: 0.9710 - val_loss: 0.1164 - val_accuracy: 0.9680 - 9s/epoch - 2ms/step
Epoch 8/30
4000/4000 - 7s - loss: 0.0914 - accuracy: 0.9711 - val_loss: 0.1168 - val_accuracy: 0.9682 - 7s/epoch - 2ms/step
Epoch 

In [28]:
#Test - 96% accuracy!
#Score the model on the test split; the result unpacks into the loss and the accuracy metric set in compile()
loss, accuracy = model.evaluate(test_pad, test_labels)



In [32]:
#Save
#Build the destination folder for the trained model. The base path is stored in
#base_dir rather than dir so the built-in dir() stays usable in this kernel.
base_dir = 'drive/MyDrive/tf_text_analysis'
model_folder = 'model'
save_path = os.path.join(base_dir, model_folder)
#makedirs also creates missing parent folders and does nothing when the folder already exists
os.makedirs(save_path, exist_ok=True)


In [33]:
#Write the trained model to save_path, including the optimizer
model.save(save_path, include_optimizer=True)