In [1]:
%pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Classification Model - Predicting High Performing Content
import pandas as pd

# Dataset files: the raw export that is read, and the balanced master file that is written
INPUT_FILE: str = "../data/Raw_Dataset_LinkedIn.csv"
OUTPUT_FILE: str = "../data/Master_Dataset_LinkedIn.csv"

# Tunable thresholds used by the cleaning / balancing step
MIN_POST_LENGTH: int = 50            # rows with a smaller 'Post Content Length' are dropped
MAX_RELATIVE_ENGAGEMENT: int = 150   # rows with a larger 'Relative Engagement' are dropped
MAX_POSTS_PER_USER: int = 350        # upper bound on the number of posts kept per user

In [3]:
# Balance the dataset: remove outliers and very short posts, cap posts per user,
# and keep a 50/50 HIGH/LOW performer split for every user.

# One seed for the per-user sampling and the final shuffle, so reruns are reproducible
RANDOM_SEED = 42
# Columns this cell reads; checked up front so a schema change fails with a clear message
REQUIRED_COLUMNS = ['User ID', 'Is High Performing', 'Relative Engagement', 'Post Content Length']


def balance_user_posts(user_posts, per_class_cap, seed=RANDOM_SEED):
    """Sample an equal number of HIGH and LOW performing posts for one user.

    Parameters
    ----------
    user_posts : pd.DataFrame
        Rows belonging to a single user; must contain 'Is High Performing'.
    per_class_cap : int
        Maximum number of posts to keep from each class.
    seed : int
        ``random_state`` passed to both sampling calls.

    Returns
    -------
    pd.DataFrame or None
        The sampled HIGH rows followed by the sampled LOW rows, or ``None``
        when nothing can be taken (one class is empty or the cap is 0).
    """
    high_perf = user_posts[user_posts['Is High Performing'] == 1]
    low_perf = user_posts[user_posts['Is High Performing'] == 0]

    # A 1:1 split cannot take more than the smaller class offers
    n_to_take = min(len(high_perf), len(low_perf), per_class_cap)
    if n_to_take == 0:
        return None

    sampled_high = high_perf.sample(n=n_to_take, random_state=seed)
    sampled_low = low_perf.sample(n=n_to_take, random_state=seed)
    return pd.concat([sampled_high, sampled_low])


# Load the dataset
df_raw = pd.read_csv(INPUT_FILE)

missing_columns = [col for col in REQUIRED_COLUMNS if col not in df_raw.columns]
if missing_columns:
    raise KeyError(f"{INPUT_FILE} is missing required columns: {missing_columns}")

# Remove extreme viral hits (they distort the mean) and very short posts.
# Rows where either value is NaN fail the comparison and are dropped as well.
keep_mask = (
    (df_raw['Relative Engagement'] <= MAX_RELATIVE_ENGAGEMENT)
    & (df_raw['Post Content Length'] >= MIN_POST_LENGTH)
)
df_ml = df_raw[keep_mask]

# --- Disabled experiments (kept for reference, not executed) ---
# Bin Semantic Alignment into quintiles
#df_ml["Semantic Alignment"] = pd.qcut(df_ml["Semantic Alignment"], 5, labels=False)

# Normalize count features by Post Content Length
# df_ml['Hook Length'] = df_ml["Hook Length"] / (df_ml["Post Content Length"] + 1)
# df_ml["Emoji Count"] = df_ml["Emoji Count"] / (df_ml["Post Content Length"] + 1)
# df_ml['Hashtag Count'] = df_ml["Hashtag Count"] / (df_ml["Post Content Length"] + 1)
# df_ml['Linebreak Count'] = df_ml["Linebreak Count"] / (df_ml["Post Content Length"] + 1)
# df_ml['Link Count'] = df_ml["Link Count"] / (df_ml["Post Content Length"] + 1)

# Per-class budget: at most half of the per-user cap. A user with fewer posts than
# the cap is already limited by the size of their smaller class, so the budget does
# not depend on the user and is computed once.
per_class_cap = MAX_POSTS_PER_USER // 2

# Get balanced high/low performance capping per user
balanced_dfs = []
for user_id, user_data in df_ml.groupby('User ID'):
    balanced_chunk = balance_user_posts(user_data, per_class_cap)
    if balanced_chunk is None:
        # A user with ONLY high or ONLY low posts is excluded
        # to maintain the 50/50 integrity of the training set.
        continue
    balanced_dfs.append(balanced_chunk)
    print(user_id, ":", len(balanced_chunk))

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so fail early with a message that explains what went wrong.
if not balanced_dfs:
    raise ValueError(
        "No user has both HIGH and LOW performing posts after filtering; "
        "cannot build a balanced dataset."
    )

# Combine all users back into one master dataframe and shuffle the rows
df_ml = (
    pd.concat(balanced_dfs)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
print(f"Total posts after balanced capping: {len(df_ml)}")
print(df_ml['Is High Performing'].value_counts())

# Sanity check: the whole point of this cell is an exact 50/50 split
n_high = (df_ml['Is High Performing'] == 1).sum()
n_low = (df_ml['Is High Performing'] == 0).sum()
assert n_high == n_low, f"dataset is not balanced: {n_high} HIGH vs {n_low} LOW"

# Save the cleaned and balanced dataset
df_ml.to_csv(OUTPUT_FILE, index=False)

Alexander Rüegg : 2
Andreas Stutz : 114
Andy Lavicka : 332
Arinda Huber-Bouman : 120
Beat Brun : 106
Bernardo Romero : 6
Bernhard von Allmen : 64
Büşra Coşkuner : 350
Daniel Graf : 2
Daniel Grossenbacher : 6
David Butler : 136
Dr. Martin Feuz : 46
Farhad Ahmadyar : 8
Gerhard Wesp : 44
Jonas Kamber : 350
Joshua Steffen : 60
Julien Silva : 48
Kateryna Osadchuk : 136
Ksenija Korolova : 102
Laurent Decrue : 350
Lisa Winter : 134
Marc Hauser : 350
Martin Nyffenegger : 14
Michael Lanker : 88
Michael Scheiwiller : 6
Michael Wood : 350
Oliver Ganz : 326
Oliver Notz : 4
Patrick Fischbacher : 34
Philippe Theis : 44
René Goebels : 36
Reto Laemmler : 288
Robin Setzer : 4
Sabine Wildemann : 212
Stefan Birrer : 24
Theresa Engl : 110
Thomas Veit : 22
Tigran Arzumanov : 326
Tilman Eberle : 86
Tobias Clemens : 158
Valentin Binnendijk : 124
Yulia Matiash : 2
Total posts after balanced capping: 5124
Is High Performing
0    2562
1    2562
Name: count, dtype: int64
