In [1]:
!pip install pandas transformers textblob tqdm
!python -m textblob.download_corpora


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [2]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from textblob import TextBlob
from datetime import datetime
from tqdm import tqdm
import torch
import re

In [3]:
# 1. Load Excel
excel_file = '/content/drive/MyDrive/Extended AMUSED Dataset/Extended Amused dataset/Final dataset.xlsx'
# Request both sheets in one call so the workbook is opened and parsed only once;
# with a list of sheet names, read_excel returns a dict keyed by sheet name.
sheets = pd.read_excel(excel_file, sheet_name=['post features', 'user features'])
post_df = sheets['post features']
user_df = sheets['user features']

In [4]:
# 2. Preprocess Post Features
# Lower-case phrases matched as substrings of the (lower-cased) title.
clickbait_phrases = ['you won\'t believe', 'shocking', 'what happened next', 'can\'t miss', 'top 10', 'reasons why']

def contains_clickbait(text):
    """Flag whether a title contains any known clickbait phrase.

    Args:
        text: Title to inspect. Anything that is not a ``str`` (``None``,
            ``NaN``, numbers) is treated as "no title".

    Returns:
        1 if any entry of ``clickbait_phrases`` occurs in the title
        (case-insensitive substring match), otherwise 0.
    """
    if not isinstance(text, str):
        # Missing values arrive as None/NaN; they cannot contain a phrase.
        return 0
    # Typographic apostrophes (’ / ‘) would otherwise never match the
    # ASCII apostrophes used in phrases such as "you won't believe".
    normalized = text.lower().replace('\u2019', "'").replace('\u2018', "'")
    return int(any(phrase in normalized for phrase in clickbait_phrases))

def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Args:
        text: Text to score. Non-string values (``None``, ``NaN``) and
            blank strings are treated as having no sentiment.

    Returns:
        The ``polarity`` value reported by TextBlob, or 0.0 when there is
        no text to analyse.
    """
    # TextBlob rejects non-string input; treat missing/blank text as neutral
    # instead of letting one bad cell abort the whole feature pass.
    if not isinstance(text, str) or not text.strip():
        return 0.0
    return TextBlob(text).sentiment.polarity

In [5]:
# Load the pretrained BERT encoder and its matching tokenizer
BERT_CHECKPOINT = 'bert-base-uncased'
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT).eval()  # evaluation mode
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

def get_title_embedding(text, max_length=32):
    """Embed a title as the BERT hidden state of its [CLS] token.

    Args:
        text: Title to embed.
        max_length: Number of tokens the input is truncated/padded to
            (defaults to 32, the value previously hard-coded).

    Returns:
        A NumPy array holding the last-layer hidden state at position 0
        ([CLS]); for a single string this is a 1-D vector.
    """
    tokens = bert_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
    # Keep the inputs on the same device as the model so the function still
    # works if bert_model has been moved to a GPU.
    tokens = tokens.to(bert_model.device)
    with torch.no_grad():
        outputs = bert_model(**tokens)
    # Position 0 of every sequence is the [CLS] token; bring the tensor back
    # to the CPU before converting, since .numpy() only works on CPU tensors.
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    return cls_embedding

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
# Build one feature record per post: ids, simple title statistics and the
# BERT [CLS] embedding flattened into title_embedding_<i> columns.
post_features = []
for _, post in tqdm(post_df.iterrows(), total=len(post_df)):
    post_id = post['post-id']
    event_id = post.get('event-id', '')
    raw_title = post['post-title']
    # Missing titles become the empty string so every helper receives a str.
    title = '' if pd.isna(raw_title) else str(raw_title)

    record = {
        'post_id': post_id,
        'event_id': event_id,
        'title': title,
        'title_length': len(title.split()),  # whitespace-separated word count
        'title_sentiment': get_sentiment(title),
        'clickbait_flag': contains_clickbait(title),
    }
    # One column per embedding component, in order.
    record.update(
        (f'title_embedding_{dim}', component)
        for dim, component in enumerate(get_title_embedding(title))
    )
    post_features.append(record)

post_features_df = pd.DataFrame(post_features)

100%|██████████| 1024/1024 [02:12<00:00,  7.72it/s]


In [8]:
# 3. Preprocess User Features
# Missing follower/following counts are treated as zero.
for count_col in ('followers', 'followings'):
    user_df[count_col] = user_df[count_col].fillna(0)

# NOTE(review): per the column name 0 means "verified", so filling gaps with 0
# codes users with a missing flag as verified - confirm this is intended.
VERIFIED_COL = 'is user verified(0 verified, 1 unverified)'
user_df[VERIFIED_COL] = user_df[VERIFIED_COL].fillna(0).astype(int)

def convert_date_to_days(joined_date, now=None):
    """Return how many whole days lie between ``joined_date`` and ``now``.

    Args:
        joined_date: A single date value in any form ``pd.to_datetime``
            accepts (string, datetime, Timestamp, ...).
        now: Optional naive ``datetime`` to measure against; defaults to
            ``datetime.now()``. Passing a fixed value makes the result
            reproducible.

    Returns:
        The ``days`` component of ``now - joined_date`` as an int (negative
        for dates in the future), or 0 when the value is missing or cannot
        be parsed.
    """
    try:
        join_date = pd.to_datetime(joined_date)
    except (ValueError, TypeError, OverflowError):
        # Unparseable value: keep the best-effort fallback, but only for
        # parsing errors rather than every possible exception.
        return 0

    # to_datetime yields None/NaT for missing input; handle it explicitly so
    # missing dates get the same 0 fallback as unparseable ones.
    if pd.isna(join_date):
        return 0

    # Drop timezone info so a tz-aware date can be subtracted from the naive
    # reference instead of raising TypeError.
    if join_date.tzinfo is not None:
        join_date = join_date.tz_localize(None)

    reference = datetime.now() if now is None else now
    return (reference - join_date).days

# Derive account age in days from each user's joining date.
joining_dates = user_df['joining date']
user_df['join_days_ago'] = joining_dates.apply(convert_date_to_days)

  join_date = pd.to_datetime(joined_date)


In [10]:
# Keep only the selected user feature columns (plus post-id).
user_feature_columns = [
    'post-id',
    'followers',
    'followings',
    'is user verified(0 verified, 1 unverified)',
    'join_days_ago',
]
user_features_df = user_df[user_feature_columns]

In [16]:

# Save the user features to CSV (row index omitted)
user_csv_path = 'user_combined_features.csv'
user_features_df.to_csv(user_csv_path, index=False)
print("✅ Features saved to 'user_combined_features.csv'")


✅ Features saved to 'user_combined_features.csv'


In [17]:
# Save the post features to CSV (row index omitted)
post_csv_path = 'post_combined_features.csv'
post_features_df.to_csv(post_csv_path, index=False)
print("✅ Features saved to 'post_combined_features.csv'")

✅ Features saved to 'post_combined_features.csv'
