This notebook detects author popularity and virality based on engagement metrics:

**1-Data Preprocessing & Feature Engineering**
Computes author popularity score using follower count, heart count, and verification status.
Calculates engagement rate and trend score for influencer impact assessment.
Encodes verified status and introduces a binary virality target variable.

**2-Top Author Identification**
Extracts top 10 most influential authors based on popularity score.

In [1]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
# 2. Load and Inspect Data
df = pd.read_csv("my_dataframe (8).csv")
print(df.head())
print(df.columns)

                    id  play_count  share_count  comment_count  digg_count  \
0  7446111299458698498     4700000        54600           3247      885900   
1  7446111299458698498     4700000        54600           3247      885900   
2  7446111299458698498     4700000        54600           3247      885900   
3  7446111299458698498     4700000        54600           3247      885900   
4  7446111299458698498     4700000        54600           3247      885900   

   collect_count  followers     likes  video_count hashtags  ...  description  \
0          88200     709600  30100000          945       []  ...          NaN   
1          88200     709600  30100000          945       []  ...          NaN   
2          88200     709600  30100000          945       []  ...          NaN   
3          88200     709600  30100000          945       []  ...          NaN   
4          88200     709600  30100000          945       []  ...          NaN   

    author_id follower_count heart_count nic

In [3]:
# 3. Feature Engineering
# Calculate author_popularity_score
df["author_popularity_score"] = df["follower_count"] * 0.6 + df["heart_count"] * 0.4
df["author_popularity_score"] = df.apply(
    lambda row: row["author_popularity_score"] * 1.1 if row["verified"] else row["author_popularity_score"], axis=1
)

# Calculate engagement_rate and trend_score
df['engagement_rate'] = df['heart_count'] / df['follower_count'].replace(0, 1)
df['trend_score'] = df['engagement_rate'] * 0.5 + df['video_count'] * 0.3 + df['share_count'] * 0.2

In [5]:
# Encode 'verified' status as 1 for True and 0 for False
df['verified'] = df['verified'].astype(int)

# Create a binary target variable based on engagement rate
df['virality'] = np.where(df['engagement_rate'] > df['engagement_rate'].quantile(0.75), 1, 0)
df["virality"]

Unnamed: 0,virality
0,1
1,1
2,1
3,1
4,1
...,...
6239,0
6240,0
6241,0
6242,0


In [7]:
# 6. Identify Top Authors

top_authors = df.sort_values(by="author_popularity_score", ascending=False).head(10)
print(top_authors.columns)


Index(['id', 'play_count', 'share_count', 'comment_count', 'digg_count',
       'collect_count', 'followers', 'likes', 'video_count', 'hashtags',
       'create_time', 'duration', 'video_url', 'music_title', 'music_author',
       'music_id', 'username', 'music_play_url', 'description', 'author_id',
       'follower_count', 'heart_count', 'nickname', 'verified', 'friend_count',
       'sentiment', 'tiktok_web_url', 'audio_path', 'author_popularity_score',
       'engagement_rate', 'trend_score', 'virality'],
      dtype='object')


"top_authors_features = top_authors[features]\ntop_authors_features_scaled = scaler.transform(top_authors_features)\ntop_authors['predicted_virality'] = model.predict(top_authors_features_scaled)"

In [8]:
top_authors_info = top_authors[['author_id', 'username', 'nickname']]

In [11]:
top_authors_info.head()

Unnamed: 0,author_id,username,nickname
6037,bayashi.tiktok,bayashi.tiktok,バヤシ🥑Bayashi
6046,bayashi.tiktok,bayashi.tiktok,バヤシ🥑Bayashi
6027,bayashi.tiktok,bayashi.tiktok,バヤシ🥑Bayashi
6028,bayashi.tiktok,bayashi.tiktok,バヤシ🥑Bayashi
6029,bayashi.tiktok,bayashi.tiktok,バヤシ🥑Bayashi


In [9]:
top_authors_info.to_csv('top_authors_info.csv', index=False)