In [1]:
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
# import tensorflowjs as tfjs
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Load the news-aggregator CSV into a DataFrame.
# The location can be overridden with the UCI_NEWS_CSV environment variable so
# the notebook is not tied to a single machine; when the variable is unset the
# original path is used unchanged.
DATA_PATH = os.environ.get(
    'UCI_NEWS_CSV',
    '/Users/praveenkalva/Downloads/uci-news-aggregator.csv',
)
df = pd.read_csv(DATA_PATH)
# Quick look at the first rows and at the available column names.
print(df.head())
print(df.columns)

   ID                                              TITLE  \
0   1  Fed official says weak data caused by weather,...   
1   2  Fed's Charles Plosser sees high bar for change...   
2   3  US open: Stocks fall after Fed official hints ...   
3   4  Fed risks falling 'behind the curve', Charles ...   
4   5  Fed's Plosser: Nasty Weather Has Curbed Job Gr...   

                                                 URL          PUBLISHER  \
0  http://www.latimes.com/business/money/la-fi-mo...  Los Angeles Times   
1  http://www.livemint.com/Politics/H2EvwJSK2VE6O...           Livemint   
2  http://www.ifamagazine.com/news/us-open-stocks...       IFA Magazine   
3  http://www.ifamagazine.com/news/fed-risks-fall...       IFA Magazine   
4  http://www.moneynews.com/Economy/federal-reser...          Moneynews   

  CATEGORY                          STORY             HOSTNAME      TIMESTAMP  
0        b  ddUyU0VZz0BRneMioxUPQVP6sIxvM      www.latimes.com  1394470370698  
1        b  ddUyU0VZz0BRneMi

In [3]:
# Only the headline text is needed from here on: discard the other columns and
# give the remaining one a lowercase name.
UNUSED_COLUMNS = ['ID', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME',
                  'TIMESTAMP']
df = (
    df
    .drop(columns=UNUSED_COLUMNS)
    .rename(columns={'TITLE': 'title'})
)
df.head()

Unnamed: 0,title
0,"Fed official says weak data caused by weather,..."
1,Fed's Charles Plosser sees high bar for change...
2,US open: Stocks fall after Fed official hints ...
3,"Fed risks falling 'behind the curve', Charles ..."
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...


In [4]:
# The titles carry no sentiment labels of their own, so a VADER analyzer is used
# to score each one; the scores are turned into labels further down.
sia = SentimentIntensityAnalyzer()


def sentiment_score(text):
    """Return the 'compound' entry of VADER's polarity scores for `text`."""
    return sia.polarity_scores(text)['compound']


# One compound score per title, stored next to the title itself.
df['compound_score'] = df['title'].apply(sentiment_score)

In [5]:
# create label column with 0 for negative and 1 for positive
# and only look at sentiment score greater than `threshold` in magnitude
def label_score(score, threshold=0.4):
    """Map a compound sentiment score to a binary label.

    Args:
        score: Numeric sentiment score.
        threshold: Magnitude a score must strictly exceed to receive a label.
            Defaults to 0.4, the cut-off this notebook has always used.

    Returns:
        1 when ``score > threshold``, 0 when ``score < -threshold``, and
        ``pd.NA`` otherwise (including scores exactly on the boundary and NaN,
        for which both comparisons are false).
    """
    if score > threshold:
        return 1
    if score < -threshold:
        return 0
    return pd.NA

# Turn every compound score into a label; scores that label_score does not
# classify come back as pd.NA.
df['pos_label'] = df['compound_score'].apply(label_score)
# Keep only rows without missing values, which removes the headlines that were
# left unlabelled by the step above.
df = df.dropna()
print(df.head())

                                                title  compound_score  \
0   Fed official says weak data caused by weather,...         -0.4404   
3   Fed risks falling 'behind the curve', Charles ...         -0.4019   
7   Fed's Plosser expects US unemployment to fall ...         -0.4404   
13  ECB FOCUS-Stronger euro drowns out ECB's messa...         -0.6486   
14         EU aims for deal on tackling failing banks         -0.5106   

   pos_label  
0          0  
3          0  
7          0  
13         0  
14         0  


In [6]:
# Features are the raw titles, targets are the labels derived from the scores.
X, y = df['title'], df['pos_label']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
stop_words = stopwords.words("english")

# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_PATTERN = r'[^\w\s]'

# preprocess() strips punctuation *before* it removes stop words, so the stop
# words are normalised the same way here. Without this, entries that contain an
# apostrophe (e.g. "don't") could never match the already-stripped text
# ("dont") and would silently survive preprocessing.
_normalized_stop_words = list(dict.fromkeys(
    re.sub(_PUNCTUATION_PATTERN, '', word.lower(), flags=re.ASCII)
    for word in stop_words
))

# Built once instead of on every call: whole-word match of any stop word,
# together with the whitespace that follows it.
_STOP_WORD_PATTERN = (
    r'\b('
    + r'|'.join(re.escape(word) for word in _normalized_stop_words if word)
    + r')\b\s*'
)


# define preprocessing steps
def preprocess(text):
    """Standardize text before tokenization.

    Steps, in order: lowercase, remove punctuation, remove English stop words
    (and the whitespace trailing each removed word).

    Args:
        text: A string tensor, or any value the ``tf.strings`` ops can convert
            to one.

    Returns:
        A string tensor containing the cleaned text.
    """
    # lowercase
    text = tf.strings.lower(text)
    # remove punctuation
    text = tf.strings.regex_replace(text, _PUNCTUATION_PATTERN, '')
    # remove stop_words
    text = tf.strings.regex_replace(text, _STOP_WORD_PATTERN, '')
    return text

In [8]:
# Vectorizer settings
MAX_FEATURES = 20000   # passed as max_tokens: upper bound on the vocabulary size
SEQUENCE_LENGTH = 15   # passed as output_sequence_length: length of each output sequence

# Layer that maps title strings to integer token sequences. `preprocess` is
# applied as the standardization step and tokens are split on whitespace.
vectorize_layer = layers.TextVectorization(
    max_tokens=MAX_FEATURES,
    standardize=preprocess,
    split='whitespace',
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH,
)

Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



2022-08-11 18:51:07.959117: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-08-11 18:51:07.959422: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [9]:
# Build the vocabulary from the training titles only.
vectorize_layer.adapt(X_train)

# Run both splits through the adapted layer to obtain integer sequences.
X_train_vectorized, X_test_vectorized = (
    vectorize_layer(titles) for titles in (X_train, X_test)
)

# Cast the labels of both splits to int32.
y_train, y_test = (
    labels.astype('int32') for labels in (y_train, y_test)
)

2022-08-11 18:51:11.822502: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-08-11 18:51:11.889468: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [21]:
# Binary classifier: embedding -> LSTM -> single sigmoid unit.
embedding_dim = 32

model = keras.Sequential()
# One embedding_dim-sized vector per token id, for sequences of SEQUENCE_LENGTH ids.
model.add(layers.Embedding(MAX_FEATURES, embedding_dim, input_length=SEQUENCE_LENGTH))
model.add(layers.LSTM(16))
# Sigmoid output pairs with the binary cross-entropy loss below.
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 15, 32)            640000    
                                                                 
 lstm_1 (LSTM)               (None, 16)                3136      
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 643,153
Trainable params: 643,153
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Fit on the vectorized training split; the returned History object is kept in
# `history`.
history = model.fit(
    x=X_train_vectorized,
    y=y_train,
    batch_size=256,
    epochs=10,
)

Epoch 1/10


2022-08-11 18:51:27.451513: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-11 18:51:27.656667: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-11 18:51:27.826082: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Score the model on the held-out split. evaluate() returns the loss followed by
# the compiled metrics, which here is accuracy only.
loss, accuracy = model.evaluate(x=X_test_vectorized, y=y_test)
for metric_name, metric_value in (("loss", loss), ("accuracy", accuracy)):
    print(metric_name, metric_value)

2022-08-11 18:58:19.025074: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-11 18:58:19.152475: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


loss 0.11007792502641678
accuracy 0.9727628231048584


In [22]:
# model looks good, so retrain on all data!
# This vectorizer does no standardization of its own (standardize=None); the
# text is run through preprocess() explicitly before it reaches the layer.
final_vectorize_layer = layers.TextVectorization(
    standardize=None,
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH,
    split='whitespace'
)

# Preprocess once and reuse the result for both adapt() and vectorization.
X_preprocessed = preprocess(X)
final_vectorize_layer.adapt(X_preprocessed)
X_vectorized = final_vectorize_layer(X_preprocessed)
y = y.astype('int32')

# final_vectorize_layer learns its own vocabulary from all of X, so its token
# ids are not guaranteed to match the ids the current weights were trained on
# (those came from the vectorizer adapted on X_train only). Start again from
# freshly initialised weights with the same architecture instead of continuing
# to train the old ones; this also makes the cell safe to re-run.
model = keras.models.clone_model(model)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_vectorized, y, epochs=10, batch_size=256)

2022-08-11 19:12:07.600213: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 1/10


2022-08-11 19:12:11.398346: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-11 19:12:11.633371: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-11 19:12:11.784591: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1696e5e10>

In [23]:
# create inference model
# Chains the final vectorizer and the trained classifier so the exported model
# accepts strings directly. The vectorizer was built with standardize=None, so
# callers are expected to run preprocess() on the text themselves first.
export_model = tf.keras.Sequential([
  # shape must be a tuple: `(1)` is just the integer 1, `(1,)` is the
  # one-element tuple that the Input API documents.
  layers.Input(shape=(1,), dtype='string'),
  final_vectorize_layer,
  model,
])

export_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [24]:
# Sanity check on a handful of hand-picked headlines.
# Expected sentiment, in order: pos, neg, neg, neg, pos, pos
examples = [
    "CDC eases Covid guidance as U.S. has more tools to fight the virus and keep people out of the hospital",
    "What happens if Ukraine’s Zaporizhzhia nuclear plant explodes?",
    "Disordered eating in children: Boys and girls face similar risk",
    "House explodes in Evansville, Indiana, leaving 3 dead and nearby homes 'uninhabitable'",
    "Solar Company Gets Bright Idea to Cover Storage Facilities in Solar Panels—Brings Power to 1,400 Homes",
    "Switzerland’s Brilliant Plan For Underground Cargo Delivery Tunnels to Reduce Traffic is Now Underway",
]

# The export model does not standardize text itself, so preprocess first; the
# outputs are rounded to three decimals for readability.
raw_predictions = export_model.predict(preprocess(examples))
res = np.around(raw_predictions, decimals=3)
print(res)

[[0.004]
 [0.   ]
 [0.001]
 [0.026]
 [0.998]
 [0.999]]


2022-08-11 19:14:28.325316: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-11 19:14:28.408431: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [25]:
# Persist the inference model to disk in TensorFlow SavedModel format.
# Alternative (disabled): export_model.save('./pos_news_model.h5') for h5 format.
SAVED_MODEL_DIR = './pos_news_model'
export_model.save(SAVED_MODEL_DIR, save_traces=True)



INFO:tensorflow:Assets written to: ./pos_news_model/assets


INFO:tensorflow:Assets written to: ./pos_news_model/assets
