In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df=pd.read_csv('socialmedia_cleaned.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 29 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Unnamed: 0          100000 non-null  int64  
 1   Platform            100000 non-null  object 
 2   Post ID             100000 non-null  object 
 3   Post Type           100000 non-null  object 
 4   Post Content        100000 non-null  object 
 5   Post Timestamp      100000 non-null  object 
 6   Likes               100000 non-null  int64  
 7   Comments            100000 non-null  int64  
 8   Shares              100000 non-null  int64  
 9   Impressions         100000 non-null  int64  
 10  Reach               100000 non-null  int64  
 11  Engagement Rate     100000 non-null  float64
 12  Audience Age        100000 non-null  int64  
 13  Audience Gender     100000 non-null  object 
 14  Audience Location   100000 non-null  object 
 15  Audience Interests  100000 non-null

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import joblib

# Recreate feature matrix (same features as before)
features = [
    'post_length', 'has_hashtags', 'has_links',
    'post_hour', 'is_weekend',
    'Impressions', 'Reach', 'Audience Age'
]

df_encoded = pd.get_dummies(df[['Post Type', 'Audience Gender']], drop_first=True)
X = pd.concat([df[features], df_encoded], axis=1)

#retrain the model here
model = LogisticRegression(max_iter=1000)
y = df['engaged']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
model.fit(X_train, y_train)


In [3]:
df['engagement_score']=model.predict_proba(X)[:,1]

In [8]:
#Simulate a Ranked Feed

feed = df.sample(100,random_state=42).copy()

#Baseline feed
baseline_feed = feed.sort_values(by='Post Timestamp', ascending=False)

#ML-ranked feed
ranked_feed = feed.sort_values(by='engagement_score', ascending=False)


In [7]:
ranked_feed[['Post ID', 'engagement_score', 'total_engagement']].head(10)


Unnamed: 0,Post ID,engagement_score,total_engagement
71963,45b66f1f-f8f9-4cc9-b1a8-d738352f2947,0.50946,1310
72897,57596585-7145-4a36-a824-e23dffa90e44,0.505871,1456
81932,2c0d3a39-99a6-41c6-ba38-91b8b2f937df,0.505341,906
95321,9103def8-02c0-497c-a843-be0fc4039600,0.503365,742
87278,edb25bfe-e18b-4d8a-95bd-a583a8216aae,0.501955,1452
9507,677a5af8-d1b9-43ac-b20f-a7f748359301,0.499843,942
95115,0a61da78-ad58-4f38-b84e-b2423e96d82d,0.499357,595
31791,4807d3e6-1244-43cf-b64d-25b54162b6b4,0.499013,823
63421,261ada2a-9bd8-4833-beb5-25b3cfef86f1,0.498842,840
88624,332d9433-c6ec-43d6-898f-35d6d7699945,0.498695,1061


In [10]:
# Calculate total engagement for each post
baseline_feed['total_engagement'] = baseline_feed['Likes'] + baseline_feed['Comments'] + baseline_feed['Shares']
ranked_feed['total_engagement'] = ranked_feed['Likes'] + ranked_feed['Comments'] + ranked_feed['Shares']

print("Baseline Feed Total Engagement:", baseline_feed['total_engagement'].sum())
print("Ranked Feed Total Engagement:", ranked_feed['total_engagement'].sum())

uplift = (ranked_feed['total_engagement'].sum() - baseline_feed['total_engagement'].sum()) / baseline_feed['total_engagement'].sum()
print(f"Engagement Uplift from ML Ranking: {uplift:.2%}")


Baseline Feed Total Engagement: 88540
Ranked Feed Total Engagement: 88540
Engagement Uplift from ML Ranking: 0.00%


In [11]:
print("Baseline Feed Engaged Posts:", baseline_feed['engaged'].sum())
print("Ranked Feed Engaged Posts:", ranked_feed['engaged'].sum())


Baseline Feed Engaged Posts: 56
Ranked Feed Engaged Posts: 56


### Model Iteration: Switching to XGBoost for Better Ranking

In the initial version, the Logistic Regression model produced engagement scores that did not lead to any improvement over the baseline feed. To address this, I iterated on the modeling step by switching to **XGBoost**, a more powerful tree-based algorithm that can better capture non-linear relationships and feature interactions.

The goal is to increase predictive accuracy to more effectively prioritize high-engagement posts in the feed.


In [13]:
from xgboost import XGBClassifier

# Re-split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Train XGBoost
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

# Predict new scores for ranking
df['engagement_score'] = xgb_model.predict_proba(X)[:, 1]


In [14]:
# Sample larger feed
feed = df.sample(100, random_state=42).copy()

# Recreate feeds
baseline_feed = feed.sort_values(by='Post Timestamp', ascending=False)
ranked_feed = feed.sort_values(by='engagement_score', ascending=False)

# Recalculate total_engagement
baseline_feed['total_engagement'] = baseline_feed['Likes'] + baseline_feed['Comments'] + baseline_feed['Shares']
ranked_feed['total_engagement'] = ranked_feed['Likes'] + ranked_feed['Comments'] + ranked_feed['Shares']

# Compare again
print("Baseline Feed Total Engagement:", baseline_feed['total_engagement'].sum())
print("Ranked Feed Total Engagement:", ranked_feed['total_engagement'].sum())

uplift = (ranked_feed['total_engagement'].sum() - baseline_feed['total_engagement'].sum()) / baseline_feed['total_engagement'].sum()
print(f"Engagement Uplift from ML Ranking: {uplift:.2%}")


Baseline Feed Total Engagement: 88540
Ranked Feed Total Engagement: 88540
Engagement Uplift from ML Ranking: 0.00%


In [19]:
#check this if we have high impression post
df[df['Impressions'] > 5000].shape


(55428, 30)

In [21]:
threshold = df['Impressions'].quantile(0.75)
high_var_posts = df[df['Impressions'] >= threshold].copy()

# Sample
feed = high_var_posts.sample(min(100, len(high_var_posts)), random_state=42).copy()

# Re-rank
baseline_feed = feed.sort_values(by='Post Timestamp', ascending=False)
ranked_feed = feed.sort_values(by='engagement_score', ascending=False)

# Recompute engagement
baseline_feed['total_engagement'] = baseline_feed['Likes'] + baseline_feed['Comments'] + baseline_feed['Shares']
ranked_feed['total_engagement'] = ranked_feed['Likes'] + ranked_feed['Comments'] + ranked_feed['Shares']

# Compare
print("Baseline Feed Total Engagement:", baseline_feed['total_engagement'].sum())
print("Ranked Feed Total Engagement:", ranked_feed['total_engagement'].sum())

uplift = (ranked_feed['total_engagement'].sum() - baseline_feed['total_engagement'].sum()) / baseline_feed['total_engagement'].sum()
print(f"Engagement Uplift from ML Ranking: {uplift:.2%}")

Baseline Feed Total Engagement: 88445
Ranked Feed Total Engagement: 88445
Engagement Uplift from ML Ranking: 0.00%
