In [1]:
import argparse
import os
import pandas as pd
import numpy as np
import json
import pickle
import tensorflow as tf
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sentence_transformers import SentenceTransformer
from tensorflow.keras.callbacks import Callback, EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Input, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib


2025-04-28 03:40:14.660311: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-28 03:40:15.102666: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745811615.301789  292590 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745811615.368685  292590 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745811615.613772  292590 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
def embed_text(texts, model_name):
    """Encode ``texts`` into vectors with the named Sentence Transformer model.

    Args:
        texts: The sentences to embed; passed unchanged to ``encode``.
        model_name: Identifier handed to ``SentenceTransformer`` to load the model.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``texts``.

    The model is instantiated on every call and a progress bar is shown
    while encoding.
    """
    print(f"Generating embeddings using {model_name}...")
    encoder = SentenceTransformer(model_name)
    return encoder.encode(texts, show_progress_bar=True)

def prepare_text_features(df, use_short_description=False):
    """Build the model input texts and their labels from a news dataframe.

    Args:
        df: DataFrame with 'headline' and 'category' columns. A
            'short_description' column is also needed when
            ``use_short_description`` is True.
        use_short_description: If True, each text is the headline, a single
            space, and the short description; otherwise the headline alone.

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` is a list with one entry per
        row of ``df``; ``labels`` is ``df['category'].values``.

    Missing headlines and missing descriptions are both treated as empty
    strings. Previously only the description was filled, so a missing
    headline put NaN into the text list and, in combined mode, also threw
    away that row's description (NaN + str is NaN).
    """
    # fillna returns a new Series, so the caller's dataframe is not modified.
    headlines = df['headline'].fillna("")

    if use_short_description:
        # Combine headline and short_description
        texts = headlines + " " + df['short_description'].fillna("")
    else:
        texts = headlines

    return texts.tolist(), df['category'].values

In [3]:
# Absolute path to the combined category dataset (CSV) on the machine this notebook was run on.
data_path = '/home/ubuntu/SentimentProject/DA5402_Project/data/category_classification/category_csv/complete_data.csv'

# Load the whole CSV into `df`; later cells drop columns from it and add one score column per model.
df = pd.read_csv(data_path)

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [5]:
# Remove the metadata columns that the feature-building cell below does not read.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises KeyError.
df = df.drop(columns=['link', 'date', 'authors'], errors='ignore')
df.head()

Unnamed: 0,headline,category,short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha..."
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to..."
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...


In [6]:
# Build the input texts from headlines only (use_short_description keeps its default of False)
# together with the category label of each row.
texts, category = prepare_text_features(df)
# Show the first two texts and labels as a quick check.
texts[:2], category[:2]

(['Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters',
  'American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video'],
 array(['U.S. NEWS', 'U.S. NEWS'], dtype=object))

In [7]:
# Sentence Transformer checkpoint used to embed every headline.
embedding_model = "all-MiniLM-L6-v2"
text_embeddings = embed_text(texts, embedding_model)

# Print the first two vectors, then the shape of a single vector, one per line.
print(text_embeddings[:2], text_embeddings[0].shape, sep="\n")

Generating embeddings using all-MiniLM-L6-v2...


Batches: 100%|██████████| 6548/6548 [02:33<00:00, 42.78it/s]


[[-3.41089666e-02  9.91281196e-02 -7.52788857e-02 -1.15467906e-02
   6.00800030e-02 -4.27349359e-02  1.13641676e-02  1.05969608e-01
  -3.40073518e-02 -2.75691506e-02 -1.54415136e-02  1.93908829e-02
  -4.46778722e-03  3.74372974e-02  2.82978397e-02  5.61935715e-02
   3.37446146e-02 -1.24953808e-02  2.09627990e-02  2.08215620e-02
  -5.77472374e-02 -1.37316100e-02  5.47243804e-02  6.28399625e-02
  -4.72335964e-02 -2.82013360e-02 -7.59805590e-02  1.80630423e-02
   1.81770455e-02 -8.81924853e-03 -1.03362938e-02 -3.98095548e-02
  -1.48401381e-02  4.98954169e-02 -5.90856373e-02 -2.55721640e-02
   2.17398535e-02 -3.29798348e-02 -7.66109899e-02  1.18269417e-02
   1.16666846e-01 -2.79738568e-02 -2.21716706e-02  2.71492377e-02
   5.25691919e-02  2.61019939e-03 -6.92779720e-02  9.83793288e-02
  -1.58369597e-02  3.59982699e-02  2.08804403e-02  1.23132616e-02
   2.66551077e-02 -4.76788133e-02 -2.09531710e-02 -2.38097291e-02
  -4.88777757e-02 -1.02558978e-01  3.92592102e-02  1.38727045e-02
  -1.37357

In [8]:
# Size of the second axis of the embedding matrix, i.e. the length of one embedding vector.
text_embeddings.shape[1]

384

In [9]:
# Length of one embedding vector; this is the network's input width.
input_dim = text_embeddings.shape[1]

# Input layer followed by three Dense -> BatchNormalization -> Dropout stages
# that narrow the width while lowering the dropout rate.
dnn_layers = [Input(shape=(input_dim,))]
for units, drop_rate in ((512, 0.4), (256, 0.3), (128, 0.2)):
    dnn_layers.append(Dense(units, activation='relu'))
    dnn_layers.append(BatchNormalization())
    dnn_layers.append(Dropout(drop_rate))

# Final hidden layer and a single sigmoid output unit.
dnn_layers.append(Dense(64, activation='relu'))
dnn_layers.append(Dense(1, activation='sigmoid'))

# NOTE(review): model_dnn is not referenced by any later cell visible in this
# notebook (the models used below are loaded from disk) — confirm whether this
# cell is still needed.
model_dnn = Sequential(dnn_layers)

I0000 00:00:1745811825.795894  292590 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5315 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


### BUSINESS Model

In [10]:
# Saved Keras model file for the BUSINESS category.
model_path = '/home/ubuntu/SentimentProject/DA5402_Project/models/cat_classification/dense_nn_Business/dense_nn.keras'
# Binds the shared `model` name; every later section rebinds it to its own model.
model = tf.keras.models.load_model(model_path)

In [11]:
# Score all headline embeddings with the BUSINESS model; `predictions` is reused by every model section.
predictions = model.predict(text_embeddings)

I0000 00:00:1745811839.413097  294163 service.cc:152] XLA service 0x7fdfcc015fa0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745811839.415021  294163 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-04-28 03:43:59.590075: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1745811839.846984  294163 cuda_dnn.cc:529] Loaded cuDNN version 90501


[1m  21/6548[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m33s[0m 5ms/step  

I0000 00:00:1745811840.499713  294163 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m6548/6548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step


In [12]:
# Attach the BUSINESS model's scores to df as the 'BUSINESS' column.
df['BUSINESS'] = predictions

In [13]:
# Peek at the first five BUSINESS scores as a sanity check.
predictions[:5]

array([[0.3124554 ],
       [0.5566388 ],
       [0.00411345],
       [0.00488206],
       [0.6222352 ]], dtype=float32)

### CRIME Model

In [14]:
# Saved Keras model file for the CRIME category.
model_path = '/home/ubuntu/SentimentProject/DA5402_Project/models/cat_classification/dense_nn_CRIME/dense_nn.keras'
# Replaces the previously loaded model held in `model`.
model = tf.keras.models.load_model(model_path)

In [15]:
# Score all headline embeddings with the CRIME model, overwriting the previous section's `predictions`.
predictions = model.predict(text_embeddings)

[1m6548/6548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step


In [16]:
# Attach the CRIME model's scores to df as the 'CRIME' column.
df['CRIME'] = predictions

In [17]:
# Peek at the first five CRIME scores as a sanity check.
predictions[:5]

array([[0.00436825],
       [0.761155  ],
       [0.00456217],
       [0.01644285],
       [0.6693015 ]], dtype=float32)

### ENTERTAINMENT Model

In [18]:
# Saved Keras model file for the ENTERTAINMENT category.
model_path = '/home/ubuntu/SentimentProject/DA5402_Project/models/cat_classification/dense_nn_ENTERTAINMENT/dense_nn.keras'
# Replaces the previously loaded model held in `model`.
model = tf.keras.models.load_model(model_path)

In [19]:
# Score all headline embeddings with the ENTERTAINMENT model, overwriting the previous section's `predictions`.
predictions = model.predict(text_embeddings)

[1m6548/6548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 4ms/step


In [20]:
# Attach the ENTERTAINMENT model's scores to df as the 'ENTERTAINMENT' column.
df['ENTERTAINMENT'] = predictions

In [21]:
# Peek at the first five ENTERTAINMENT scores as a sanity check.
predictions[:5]

array([[0.00099692],
       [0.02820886],
       [0.05187636],
       [0.00626864],
       [0.07781427]], dtype=float32)

### POLITICS Model

In [22]:
# Saved Keras model file for the POLITICS category.
model_path = '/home/ubuntu/SentimentProject/DA5402_Project/models/cat_classification/dense_nn_POLITICS/dense_nn.keras'
# Replaces the previously loaded model held in `model`.
model = tf.keras.models.load_model(model_path)

In [23]:
# Score all headline embeddings with the POLITICS model, overwriting the previous section's `predictions`.
predictions = model.predict(text_embeddings)

[1m6548/6548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 3ms/step


In [24]:
# Attach the POLITICS model's scores to df as the 'POLITICS' column.
df['POLITICS'] = predictions

In [25]:
# Peek at the first five POLITICS scores as a sanity check.
predictions[:5]

array([[0.08988536],
       [0.01738927],
       [0.02629746],
       [0.00242779],
       [0.11056314]], dtype=float32)

### SPORTS Model

In [26]:
# Saved Keras model file for the SPORTS category.
model_path = '/home/ubuntu/SentimentProject/DA5402_Project/models/cat_classification/dense_nn_SPORTS/dense_nn.keras'
# Replaces the previously loaded model held in `model`.
model = tf.keras.models.load_model(model_path)

In [27]:
# Score all headline embeddings with the SPORTS model, overwriting the previous section's `predictions`.
predictions = model.predict(text_embeddings)

[1m6548/6548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 4ms/step


In [28]:
# Attach the SPORTS model's scores to df as the 'SPORTS' column.
df['SPORTS'] = predictions

In [29]:
# Peek at the first five SPORTS scores as a sanity check.
predictions[:5]

array([[0.0043285 ],
       [0.03904956],
       [0.00050028],
       [0.03876851],
       [0.00134997]], dtype=float32)

### WORLD_NEWS Model

In [30]:
# Saved Keras model file for the WORLD_NEWS category.
model_path = '/home/ubuntu/SentimentProject/DA5402_Project/models/cat_classification/dense_nn_WORLD_NEWS/dense_nn.keras'
# Replaces the previously loaded model held in `model`.
model = tf.keras.models.load_model(model_path)

In [31]:
# Score all headline embeddings with the WORLD_NEWS model, overwriting the previous section's `predictions`.
predictions = model.predict(text_embeddings)

[1m6548/6548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 3ms/step


In [32]:
# Attach the WORLD_NEWS model's scores to df as the 'WORLD_NEWS' column.
df['WORLD_NEWS'] = predictions

In [33]:
# Peek at the first five WORLD_NEWS scores as a sanity check.
predictions[:5]

array([[6.0041660e-01],
       [4.8073571e-02],
       [1.9451600e-02],
       [1.6646009e-02],
       [3.9424869e-04]], dtype=float32)

### EDUCATION Model

In [34]:
# Unlike the Keras sections above, the EDUCATION model is a pickled estimator
# (logistic_regression.pkl) that is loaded with joblib.
model_path = '/home/ubuntu/SentimentProject/DA5402_Project/models/cat_classification/logistic_regression_EDUCATION/logistic_regression.pkl'
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts that come from a trusted source.
model = joblib.load(model_path)

In [35]:
# predict_proba yields one probability column per class (two columns in the preview below),
# so `predictions` is two-dimensional here rather than a single score per row.
predictions = model.predict_proba(text_embeddings)

In [36]:
# Keep only the second probability column (index 1) as the 'EDUCATION' score.
# presumably index 1 is the positive (EDUCATION) class — verify against model.classes_
df['EDUCATION'] = predictions[:,1]

In [37]:
# Peek at the first five rows of class probabilities as a sanity check.
predictions[:5]

array([[0.94068039, 0.05931961],
       [0.94910392, 0.05089608],
       [0.9843051 , 0.0156949 ],
       [0.93959067, 0.06040933],
       [0.99159476, 0.00840524]])

In [38]:
# Check that the seven per-category score columns now sit alongside the original columns.
df.head()

Unnamed: 0,headline,category,short_description,BUSINESS,CRIME,ENTERTAINMENT,POLITICS,SPORTS,WORLD_NEWS,EDUCATION
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,0.312455,0.004368,0.000997,0.089885,0.004329,0.600417,0.05932
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,0.556639,0.761155,0.028209,0.017389,0.03905,0.048074,0.050896
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",0.004113,0.004562,0.051876,0.026297,0.0005,0.019452,0.015695
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",0.004882,0.016443,0.006269,0.002428,0.038769,0.016646,0.060409
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,0.622235,0.669302,0.077814,0.110563,0.00135,0.000394,0.008405


In [39]:
# Inspect the EDUCATION score column on its own.
df.EDUCATION

0         0.059320
1         0.050896
2         0.015695
3         0.060409
4         0.008405
            ...   
209522    0.034450
209523    0.042569
209524    0.030144
209525    0.010618
209526    0.029892
Name: EDUCATION, Length: 209527, dtype: float64

In [40]:
# Write the dataframe, including the added score columns, to data_new.csv in the same directory as the input CSV.
# NOTE(review): to_csv writes the row index as an extra unnamed first column by default;
# pass index=False if downstream readers should not see it — confirm before changing.
df.to_csv('/home/ubuntu/SentimentProject/DA5402_Project/data/category_classification/category_csv/data_new.csv')