# Import Libraries

In [3]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score,classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Conv1D, Conv2D, Conv1DTranspose, Conv2DTranspose, Reshape, Flatten, Dropout, multiply, Concatenate
from tensorflow.keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D, LeakyReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import RandomNormal
import tensorflow.keras.backend as K
from sklearn.utils import shuffle
from random import randint
import tensorflow as tf
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel
import torch

2024-11-27 12:55:58.131157: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-27 12:55:58.131258: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-27 12:55:58.213438: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-27 12:55:58.386668: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load Dataset

### news.tsv

In [4]:
news_df_raw = pd.read_csv('/notebooks/news.tsv', sep='\t', header=None)
print(news_df_raw.head())
print("Number of columns:", news_df_raw.shape[1])  


        0          1                2  \
0  N55528  lifestyle  lifestyleroyals   
1  N19639     health       weightloss   
2  N61837       news        newsworld   
3  N53526     health           voices   
4  N38324     health          medical   

                                                   3  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1                      50 Worst Habits For Belly Fat   
2  The Cost of Trump's Aid Freeze in the Trenches...   
3  I Was An NBA Wife. Here's How It Affected My M...   
4  How to Get Rid of Skin Tags, According to a De...   

                                                   4  \
0  Shop the notebooks, jackets, and more that the...   
1  These seemingly harmless habits are holding yo...   
2  Lt. Ivan Molchanets peeked over a parapet of s...   
3  I felt like I was a fraud, and being an NBA wi...   
4  They seem harmless, but there's a very good re...   

                                               5  \
0  https://assets.msn.com/l

In [5]:
# Check the content of the 8th column
print(news_df_raw.iloc[:, 7].head())  # Access the 8th column (index starts at 0, so 7 represents the 8th column)

0                                                   []
1    [{"Label": "Adipose tissue", "Type": "C", "Wik...
2    [{"Label": "Ukraine", "Type": "G", "WikidataId...
3    [{"Label": "National Basketball Association", ...
4    [{"Label": "Skin tag", "Type": "C", "WikidataI...
Name: 7, dtype: object


In [6]:
# Compare the 7th and 8th columns for differences
for i in range(10):
    print("Row:", i)
    print("Entity column (7th):", news_df_raw.iloc[i, 6])
    print("Additional column (8th):", news_df_raw.iloc[i, 7])
    print("\n")

Row: 0
Entity column (7th): [{"Label": "Prince Philip, Duke of Edinburgh", "Type": "P", "WikidataId": "Q80976", "Confidence": 1.0, "OccurrenceOffsets": [48], "SurfaceForms": ["Prince Philip"]}, {"Label": "Charles, Prince of Wales", "Type": "P", "WikidataId": "Q43274", "Confidence": 1.0, "OccurrenceOffsets": [28], "SurfaceForms": ["Prince Charles"]}, {"Label": "Elizabeth II", "Type": "P", "WikidataId": "Q9682", "Confidence": 0.97, "OccurrenceOffsets": [11], "SurfaceForms": ["Queen Elizabeth"]}]
Additional column (8th): []


Row: 1
Entity column (7th): [{"Label": "Adipose tissue", "Type": "C", "WikidataId": "Q193583", "Confidence": 1.0, "OccurrenceOffsets": [20], "SurfaceForms": ["Belly Fat"]}]
Additional column (8th): [{"Label": "Adipose tissue", "Type": "C", "WikidataId": "Q193583", "Confidence": 1.0, "OccurrenceOffsets": [97], "SurfaceForms": ["belly fat"]}]


Row: 2
Entity column (7th): []
Additional column (8th): [{"Label": "Ukraine", "Type": "G", "WikidataId": "Q212", "Confidence":

In [7]:
# Combine the 7th and 8th columns into a single column
news_df_raw['Entity_Combined'] = news_df_raw.iloc[:, 6].combine_first(news_df_raw.iloc[:, 7])

# Drop the original 7th and 8th columns, keeping only the combined column
news_df_new = news_df_raw.drop(columns=[6, 7])

# Rename the combined column
news_df_new.rename(columns={'Entity_Combined': 'Entity'}, inplace=True)

# Check the result
print(news_df_new.head())

        0          1                2  \
0  N55528  lifestyle  lifestyleroyals   
1  N19639     health       weightloss   
2  N61837       news        newsworld   
3  N53526     health           voices   
4  N38324     health          medical   

                                                   3  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1                      50 Worst Habits For Belly Fat   
2  The Cost of Trump's Aid Freeze in the Trenches...   
3  I Was An NBA Wife. Here's How It Affected My M...   
4  How to Get Rid of Skin Tags, According to a De...   

                                                   4  \
0  Shop the notebooks, jackets, and more that the...   
1  These seemingly harmless habits are holding yo...   
2  Lt. Ivan Molchanets peeked over a parapet of s...   
3  I felt like I was a fraud, and being an NBA wi...   
4  They seem harmless, but there's a very good re...   

                                               5  \
0  https://assets.msn.com/l

In [8]:
# Define the correct column names
correct_columns = ['NewsID', 'Category', 'Subcategory', 'Title', 'Abstract', 'URL', 'Entity']

# Assign the column names to the cleaned dataframe
news_df_new.columns = correct_columns

# Verify the updated dataframe with proper column names
print(news_df_new.head())

# Save news_df_new as a CSV file
news_df_new.to_csv('/notebooks/news_df_new.csv', index=False)

   NewsID   Category      Subcategory  \
0  N55528  lifestyle  lifestyleroyals   
1  N19639     health       weightloss   
2  N61837       news        newsworld   
3  N53526     health           voices   
4  N38324     health          medical   

                                               Title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1                      50 Worst Habits For Belly Fat   
2  The Cost of Trump's Aid Freeze in the Trenches...   
3  I Was An NBA Wife. Here's How It Affected My M...   
4  How to Get Rid of Skin Tags, According to a De...   

                                            Abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  These seemingly harmless habits are holding yo...   
2  Lt. Ivan Molchanets peeked over a parapet of s...   
3  I felt like I was a fraud, and being an NBA wi...   
4  They seem harmless, but there's a very good re...   

                                             URL  \
0  https://assets.msn.com/l

### behaviors.tsv

In [9]:
columns = ['ImpressionID', 'UserID', 'Timestamp', 'ClickedArticles', 'ImpressionArticles']
behaviors_df = pd.read_csv('behaviors.tsv', sep='\t', header=None, names=columns)
print(behaviors_df.head())


   ImpressionID  UserID              Timestamp  \
0             1  U13740  11/11/2019 9:05:58 AM   
1             2  U91836  11/12/2019 6:11:30 PM   
2             3  U73700  11/14/2019 7:01:48 AM   
3             4  U34670  11/11/2019 5:28:05 AM   
4             5   U8125  11/12/2019 4:11:21 PM   

                                     ClickedArticles  \
0  N55189 N42782 N34694 N45794 N18445 N63302 N104...   
1  N31739 N6072 N63045 N23979 N35656 N43353 N8129...   
2  N10732 N25792 N7563 N21087 N41087 N5445 N60384...   
3  N45729 N2203 N871 N53880 N41375 N43142 N33013 ...   
4                        N10078 N56514 N14904 N33740   

                                  ImpressionArticles  
0                                  N55689-1 N35729-0  
1  N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...  
2  N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...  
3                N35729-0 N33632-0 N49685-1 N27581-0  
4  N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...  


# Data Cleaning

In [10]:
print(behaviors_df.isnull().sum())
print(news_df_new.isnull().sum())

ImpressionID             0
UserID                   0
Timestamp                0
ClickedArticles       3238
ImpressionArticles       0
dtype: int64
NewsID            0
Category          0
Subcategory       0
Title             0
Abstract       2666
URL               0
Entity            3
dtype: int64


In [11]:
print("behaviors.tsv shape before cleaning:", behaviors_df.shape)
print("news.tsv shape before cleaning:", news_df_new.shape)

behaviors.tsv shape before cleaning: (156965, 5)
news.tsv shape before cleaning: (51282, 7)


In [12]:
# Step 1: Remove duplicates
behaviors_df.drop_duplicates(inplace=True)
news_df_new.drop_duplicates(subset='NewsID', inplace=True)

# Step 4: Check the shape of the cleaned data
print("Behaviors shape after cleaning:", behaviors_df.shape)
print("News shape after cleaning:", news_df_new.shape)


Behaviors shape after cleaning: (156965, 5)
News shape after cleaning: (51282, 7)


In [13]:
# Step 1: Remove duplicates
behaviors_df.drop_duplicates(inplace=True)
news_df_new.drop_duplicates(subset='NewsID', inplace=True)

# Step 2: Handle missing values
# Drop rows where 'ClickedArticles' or 'Title' is missing
behaviors_df = behaviors_df.dropna(subset=['ClickedArticles'])
news_df_new = news_df_new.dropna(subset=['Title'])

# Step 3: Fill missing 'Abstract' and 'Entity' columns
news_df_new['Abstract'].fillna('No Abstract', inplace=True)
news_df_new['Entity'].fillna('No Entity', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  news_df_new['Abstract'].fillna('No Abstract', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  news_df_new['Entity'].fillna('No Entity', inplace=True)


In [14]:
print("Behaviors shape after cleaning:", behaviors_df.shape)
print("News shape after cleaning:", news_df_new.shape)
# Save news_df_new as a CSV file
news_df_new.to_csv('/notebooks/news_df_new.csv', index=False)

Behaviors shape after cleaning: (153727, 5)
News shape after cleaning: (51282, 7)


In [15]:
print(behaviors_df.isnull().sum())
print(news_df_new.isnull().sum())

ImpressionID          0
UserID                0
Timestamp             0
ClickedArticles       0
ImpressionArticles    0
dtype: int64
NewsID         0
Category       0
Subcategory    0
Title          0
Abstract       0
URL            0
Entity         0
dtype: int64


# Features Extraction

In [15]:
before_extraction_behavior_df=behaviors_df
before_extraction_news_df_new=news_df_new


In [16]:
# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to extract BERT embeddings
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.pooler_output.squeeze().numpy()  # Convert to a simple array

# 1. Feature extraction from article information
# Extract BERT embeddings for Title and Abstract
news_df_new['bert_title_embeddings'] = news_df_new['Title'].apply(get_bert_embeddings)
news_df_new['bert_abstract_embeddings'] = news_df_new['Abstract'].apply(get_bert_embeddings)

# One-hot encode categorical variables for Category and Subcategory
news_df_new = pd.get_dummies(news_df_new, columns=['Category', 'Subcategory'], drop_first=True)

# 2. Feature extraction from user interaction data
# Convert the Timestamp to datetime and extract features
behaviors_df['Timestamp'] = pd.to_datetime(behaviors_df['Timestamp'])
behaviors_df['Hour'] = behaviors_df['Timestamp'].dt.hour
behaviors_df['Day'] = behaviors_df['Timestamp'].dt.day
behaviors_df['Month'] = behaviors_df['Timestamp'].dt.month
behaviors_df['Weekday'] = behaviors_df['Timestamp'].dt.weekday  # Monday=0, Sunday=6

# Convert ClickedArticles and ImpressionArticles into lists
behaviors_df['ClickedArticles'] = behaviors_df['ClickedArticles'].apply(lambda x: x.split(', '))
behaviors_df['ImpressionArticles'] = behaviors_df['ImpressionArticles'].apply(lambda x: x.split(', '))

# 3. Feature extraction from Entity column

# Use One-Hot Encoding, uncomment the line below
news_df_new = pd.get_dummies(news_df_new, columns=['Entity'], drop_first=True)

# Display the DataFrame with extracted features
print(news_df_new)


       NewsID                                              Title  \
0      N55528  The Brands Queen Elizabeth, Prince Charles, an...   
1      N19639                      50 Worst Habits For Belly Fat   
2      N61837  The Cost of Trump's Aid Freeze in the Trenches...   
3      N53526  I Was An NBA Wife. Here's How It Affected My M...   
4      N38324  How to Get Rid of Skin Tags, According to a De...   
...       ...                                                ...   
51277  N16909  Adapting, Learning And Soul Searching: Reflect...   
51278  N47585  Family says 13-year-old Broadway star died fro...   
51279   N7482  St. Dominic soccer player tries to kick cancer...   
51280  N34418                       How the Sounders won MLS Cup   
51281  N44276                  Best Sports Car Deals for October   

                                                Abstract  \
0      Shop the notebooks, jackets, and more that the...   
1      These seemingly harmless habits are holding yo...   
2  

In [17]:
# Check the shape of the DataFrame
print("Shape before feature extraction:", before_extraction_news_df_new.shape)
print("Shape after feature extraction:", news_df_new.shape) 


Shape before feature extraction: (51282, 9)
Shape after feature extraction: (51282, 34757)


In [18]:
# Check the new columns in the DataFrame
print(news_df_new.head())  # View the first few rows of the DataFrame

   NewsID                                              Title  \
0  N55528  The Brands Queen Elizabeth, Prince Charles, an...   
1  N19639                      50 Worst Habits For Belly Fat   
2  N61837  The Cost of Trump's Aid Freeze in the Trenches...   
3  N53526  I Was An NBA Wife. Here's How It Affected My M...   
4  N38324  How to Get Rid of Skin Tags, According to a De...   

                                            Abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  These seemingly harmless habits are holding yo...   
2  Lt. Ivan Molchanets peeked over a parapet of s...   
3  I felt like I was a fraud, and being an NBA wi...   
4  They seem harmless, but there's a very good re...   

                                             URL  \
0  https://assets.msn.com/labs/mind/AAGH0ET.html   
1  https://assets.msn.com/labs/mind/AAB19MK.html   
2  https://assets.msn.com/labs/mind/AAJgNsz.html   
3  https://assets.msn.com/labs/mind/AACk2N6.html   
4  https://assets.

In [19]:

# Check the data types of the columns
print(news_df_new.dtypes)


NewsID                                                                                                                                                                                                                                                                                                                                                                                                                                                           object
Title                                                                                                                                                                                                                                                                                                                                                                                                                                                            object
Abstract                                                                                

In [20]:

# Check specific feature columns
print(news_df_new['bert_title_embeddings'].head())  # Should contain numpy arrays
print(news_df_new.filter(like='Entity_').head())  # Should contain one-hot encoded entity columns



0    [-0.9266579, -0.54415226, -0.93047905, 0.92278...
1    [-0.81567925, -0.27121076, -0.13772647, 0.5299...
2    [-0.80619395, -0.4283152, -0.4185663, 0.632967...
3    [-0.8127059, -0.46549726, -0.8291062, 0.464757...
4    [-0.6489112, -0.2725074, -0.43546677, 0.258971...
Name: bert_title_embeddings, dtype: object
   Entity_[]  \
0      False   
1      False   
2       True   
3       True   
4      False   

   Entity_[{"Label": "'40s Junction", "Type": "M", "WikidataId": "Q4540376", "Confidence": 1.0, "OccurrenceOffsets": [10], "SurfaceForms": ["Holiday Traditions"]}]  \
0                                              False                                                                                                                  
1                                              False                                                                                                                  
2                                              False                                

In [21]:
# Check one of the BERT embeddings
print(news_df_new['bert_title_embeddings'].iloc[0])  # Example of one embedding


[-9.26657915e-01 -5.44152260e-01 -9.30479050e-01  9.22783196e-01
  7.69032240e-01 -2.08176285e-01  9.06574547e-01  3.56711030e-01
 -6.61042929e-01 -9.99989033e-01 -3.41342539e-01  8.95081282e-01
  9.78437781e-01  5.91495335e-01  8.73546243e-01 -5.70198476e-01
 -2.43862689e-01 -4.49865699e-01  4.47783142e-01 -1.76718995e-01
  7.31375158e-01  9.99988258e-01  3.46777327e-02  3.48244339e-01
  6.35313153e-01  9.73136187e-01 -6.57489002e-01  9.29466188e-01
  9.53589201e-01  7.96715200e-01 -5.43796837e-01  3.77541214e-01
 -9.90869880e-01 -2.42178932e-01 -9.39158261e-01 -9.86937821e-01
  6.27595365e-01 -7.18522668e-01  4.29719016e-02  8.54834449e-03
 -8.59672487e-01  3.86373878e-01  9.99985278e-01  4.24565554e-01
  5.98158181e-01 -3.87675315e-01 -9.99999940e-01  3.87281835e-01
 -8.54201853e-01  8.47476065e-01  7.41616547e-01  8.68313611e-01
  2.50619620e-01  5.27501285e-01  5.44898570e-01 -4.03815567e-01
 -1.08831443e-01  1.64300054e-01 -3.17668140e-01 -5.15919030e-01
 -5.51980078e-01  4.96228

In [22]:
# Check the shape of one embedding 
print(news_df_new['bert_title_embeddings'].iloc[0].shape)


(768,)


In [23]:
# Check the first few rows of the one-hot encoded columns
entity_columns = news_df_new.filter(like='Entity_').columns
print(news_df_new[entity_columns].head())

   Entity_[]  \
0      False   
1      False   
2       True   
3       True   
4      False   

   Entity_[{"Label": "'40s Junction", "Type": "M", "WikidataId": "Q4540376", "Confidence": 1.0, "OccurrenceOffsets": [10], "SurfaceForms": ["Holiday Traditions"]}]  \
0                                              False                                                                                                                  
1                                              False                                                                                                                  
2                                              False                                                                                                                  
3                                              False                                                                                                                  
4                                              False                

In [24]:
# Check the ClickedArticles and ImpressionArticles
print(behaviors_df[['ClickedArticles', 'ImpressionArticles']].head())


                                     ClickedArticles  \
0  [N55189 N42782 N34694 N45794 N18445 N63302 N10...   
1  [N31739 N6072 N63045 N23979 N35656 N43353 N812...   
2  [N10732 N25792 N7563 N21087 N41087 N5445 N6038...   
3  [N45729 N2203 N871 N53880 N41375 N43142 N33013...   
4                      [N10078 N56514 N14904 N33740]   

                                  ImpressionArticles  
0                                [N55689-1 N35729-0]  
1  [N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 ...  
2  [N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 ...  
3              [N35729-0 N33632-0 N49685-1 N27581-0]  
4  [N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N...  


In [25]:

# Check if timestamp features are extracted correctly
print(behaviors_df[['Hour', 'Day', 'Month', 'Weekday']].head())

   Hour  Day  Month  Weekday
0     9   11     11        0
1    18   12     11        1
2     7   14     11        3
3     5   11     11        0
4    16   12     11        1


# Clustering 

In [26]:
# 1. Concatenate BERT Title and Abstract embeddings
# Convert the BERT embeddings into numpy arrays for clustering
title_embeddings = np.array(news_df_new['bert_title_embeddings'].tolist())
abstract_embeddings = np.array(news_df_new['bert_abstract_embeddings'].tolist())

# Concatenate Title and Abstract embeddings
combined_embeddings = np.concatenate([title_embeddings, abstract_embeddings], axis=1)  # Combine along axis=1 (features)

# 2. Apply KMeans Clustering
num_clusters = 26

# Initialize KMeans and fit to the combined BERT embeddings
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(combined_embeddings)  # This will output the cluster labels for each article

# 3. Assign the resulting cluster labels back to DataFrame
news_df_new['ClusterLabel'] = cluster_labels

# Now the DataFrame has a new column 'ClusterLabel', which indicates which cluster each article belongs to
print(news_df_new[['Title', 'ClusterLabel']])  # Display article titles with their corresponding cluster labels


                                                   Title  ClusterLabel
0      The Brands Queen Elizabeth, Prince Charles, an...            16
1                          50 Worst Habits For Belly Fat            19
2      The Cost of Trump's Aid Freeze in the Trenches...             7
3      I Was An NBA Wife. Here's How It Affected My M...            10
4      How to Get Rid of Skin Tags, According to a De...            19
...                                                  ...           ...
51277  Adapting, Learning And Soul Searching: Reflect...             7
51278  Family says 13-year-old Broadway star died fro...            15
51279  St. Dominic soccer player tries to kick cancer...            18
51280                       How the Sounders won MLS Cup            10
51281                  Best Sports Car Deals for October            15

[51282 rows x 2 columns]


### Saving labels and embeddings in .npy

In [27]:
np.save('cluster_labels.npy', cluster_labels)

In [28]:
chunk_size = 10000  
num_samples = combined_embeddings.shape[0]

# Save the array in smaller chunks
for i in range(0, num_samples, chunk_size):
    np.save(f'bert_embeddings_chunk_{i//chunk_size}.npy', combined_embeddings[i:i+chunk_size])

print("BERT embeddings saved in chunks.")

BERT embeddings saved in chunks.


In [29]:
num_samples = combined_embeddings.shape[0]  # Total number of samples

# Calculate the number of chunks (round up in case of any remainder)
num_chunks = math.ceil(num_samples / chunk_size)

print(f"Number of chunks: {num_chunks}")


Number of chunks: 6
