<a href="https://colab.research.google.com/github/samlawson1/news/blob/TENSORFLOW_MODEL/tensorflow_model_train/NYT_TensorFlow_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TensorFlow Text Classification Model**

- Training Dataset: [A Million News Headlines - Kaggle](https://www.kaggle.com/datasets/therohk/million-headlines) - Labeled & Filtered with Label_Text_For_Training.ipynb notebook




In [1]:
import os
import pandas as pd
import numpy as np
import random
import math
import tensorflow as tf

tf.get_logger().setLevel('ERROR')

In [2]:
#Read in dataset with pandas and show values
file = r'drive/MyDrive/tf_text_analysis/train_test_data.csv'
df = pd.read_csv(file)
print(df.shape)
df.head()


(200000, 4)


Unnamed: 0,publish_date,headline_text,AfinnScore,Label
0,20030219,barca take record as robson celebrates birthda...,3.0,1
1,20030219,big hopes for launceston cycling championship,3.0,1
2,20030219,big plan to boost paroo water supplies,2.0,1
3,20030219,brigadier dismisses reports troops harassed in,-3.0,0
4,20030219,businesses should prepare for terrorist attacks,-3.0,0


In [3]:
#Split into train, validation, and test datasets

pos = df.loc[df['Label'] == 1].reset_index(drop = True)
neg = df.loc[df['Label'] == 0].reset_index(drop = True)

#random shuffle datasets so they aren't ordered by date

pos_i = list(pos.index)
neg_i = list(neg.index)

random.shuffle(pos_i)
random.shuffle(neg_i)

#use iloc to reorder and create new index
pos = pos.iloc[pos_i].reset_index(drop = True)
neg = neg.iloc[neg_i].reset_index(drop = True)

#100K positive and negative records
print(len(pos), len(neg))

#Split into train and test - 80% Train / 20% Test
#Function
def split_data(df, num):
  split_point = math.ceil(len(df) * num)
  train_text = list(df.iloc[:split_point]['headline_text'])
  train_labels = list(df.iloc[:split_point]['Label'])

  test_text = list(df.iloc[split_point:]['headline_text'])
  test_labels = list(df.iloc[split_point:]['Label'])

  return(train_text, train_labels, test_text, test_labels)


100000 100000


In [4]:
pos_train_text, pos_train_labels, pos_test_text, pos_text_labels = split_data(pos, 0.8)
neg_train_text, neg_train_labels, neg_test_text, neg_text_labels = split_data(neg, 0.8)

#test data
test_text = pos_test_text + neg_test_text
test_labels = pos_text_labels + neg_text_labels

train_text = pos_train_text + neg_train_text
train_labels = pos_train_labels + neg_train_labels

print(f'Train Data Size: {len(train_text)}')
print(f'Test Data Size: {len(test_text)}')
print(f'Total Data Check: {len(train_text) + len(test_text)}')

Train Data Size: 160000
Test Data Size: 40000
Total Data Check: 200000


In [5]:
#Create validation subset of train data

#Make df to reshuffle pos and neg

train_df = pd.DataFrame({
    'headline_text':train_text,
    'Label':train_labels
})

train_pos = train_df.loc[train_df['Label'] == 1].reset_index(drop = True)
train_neg = train_df.loc[train_df['Label'] == 0].reset_index(drop = True)

pos_train_text_final, pos_train_labels_final, pos_val_text, pos_val_labels = split_data(train_pos, 0.8)
neg_train_text_final, neg_train_labels_final, neg_val_text, neg_val_labels = split_data(train_neg, 0.8)

train_text_final = pos_train_text_final + neg_train_text_final
train_labels_final = pos_train_labels_final + neg_train_labels_final

val_text = pos_val_text + neg_val_text
val_labels = pos_val_labels + neg_val_labels


#Make labels numpy arrays
test_labels = np.array(test_labels)
train_labels_final = np.array(train_labels_final)
val_labels = np.array(val_labels)

In [6]:
print(f'Final Training Data Volume: {len(train_text_final)}')
print(f'Validation Data Volume: {len(val_text)}')
print(f'Testing Data Volume: {len(test_text)}')

Final Training Data Volume: 128000
Validation Data Volume: 32000
Testing Data Volume: 40000


In [7]:

vocab = 5000
max_len = 10




In [8]:
#Create text vectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    output_mode='int',
    output_sequence_length = max_len)

vectorize_layer.adapt(train_text)

In [9]:
#Test
vectorize_layer([
                ["I like bananas cause they're sweet"],
                ["The sun is so warm"],
                ["People say reading books makes you smart"]
                ])

<tf.Tensor: shape=(3, 10), dtype=int64, numpy=
array([[1094,  458,    1, 1400,    1, 1934,    0,    0,    0,    0],
       [   7, 2024,   50, 1663, 1181,    0,    0,    0,    0,    0],
       [ 264,  238, 4989, 2775,  327,  571, 1784,    0,    0,    0]])>

In [10]:
#format text for vectorization
train_text_vector_layer = np.array([[t] for t in train_text_final])
val_text_vector_layer = np.array([[t] for t in val_text])


In [11]:
#Create Model
model = tf.keras.Sequential([
    vectorize_layer,
    tf.keras.layers.Embedding(vocab, 5, input_length = max_len),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
#Train and Validate
num_epochs = 30
history = model.fit(train_text_vector_layer, train_labels_final, epochs = num_epochs,
                    validation_data=(val_text_vector_layer, val_labels), verbose=2)

Epoch 1/30
4000/4000 - 14s - loss: 0.4890 - accuracy: 0.8140 - val_loss: 0.2964 - val_accuracy: 0.9260 - 14s/epoch - 3ms/step
Epoch 2/30
4000/4000 - 13s - loss: 0.2457 - accuracy: 0.9330 - val_loss: 0.1779 - val_accuracy: 0.9542 - 13s/epoch - 3ms/step
Epoch 3/30
4000/4000 - 14s - loss: 0.1821 - accuracy: 0.9480 - val_loss: 0.1440 - val_accuracy: 0.9615 - 14s/epoch - 3ms/step
Epoch 4/30
4000/4000 - 14s - loss: 0.1623 - accuracy: 0.9522 - val_loss: 0.1340 - val_accuracy: 0.9636 - 14s/epoch - 4ms/step
Epoch 5/30
4000/4000 - 14s - loss: 0.1540 - accuracy: 0.9546 - val_loss: 0.1308 - val_accuracy: 0.9641 - 14s/epoch - 4ms/step
Epoch 6/30
4000/4000 - 14s - loss: 0.1485 - accuracy: 0.9556 - val_loss: 0.1314 - val_accuracy: 0.9644 - 14s/epoch - 4ms/step
Epoch 7/30
4000/4000 - 14s - loss: 0.1470 - accuracy: 0.9567 - val_loss: 0.1283 - val_accuracy: 0.9651 - 14s/epoch - 4ms/step
Epoch 8/30
4000/4000 - 13s - loss: 0.1457 - accuracy: 0.9564 - val_loss: 0.1267 - val_accuracy: 0.9653 - 13s/epoch - 3

In [13]:
test_text_eval = np.array([[x] for x in test_text])
#Test - 96% accuracy!
loss, accuracy = model.evaluate(test_text_eval, test_labels)
print('Loss: ', loss)
print('Accuracy: ', accuracy)

Loss:  0.13340526819229126
Accuracy:  0.9648749828338623


In [14]:
#Save
dir = 'drive/MyDrive/nyt_text_analysis'
model_folder = 'model_v2'
save_path = os.path.join(dir, model_folder)
if not os.path.exists(save_path):
  os.mkdir(save_path)




In [15]:
save_model = os.path.join(save_path, 'model.keras')
model.save(save_model)