In [1]:
%pip install numpy pandas scikit-learn sentence-transformers

Collecting numpy
  Using cached numpy-2.4.1-cp314-cp314-macosx_14_0_arm64.whl.metadata (6.6 kB)
Collecting pandas
  Downloading pandas-3.0.0-cp314-cp314-macosx_11_0_arm64.whl.metadata (79 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp314-cp314-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.2.2-py3-none-any.whl.metadata (16 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Using cached scipy-1.17.0-cp314-cp314-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<6.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-5.0.0-py3-none-any.whl.metadata (37 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-1.3.5-py

In [None]:
import os
from pathlib import Path
import pandas as pd
import regex 
import unicodedata

from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Path to your data folder
DATA_ROOT = Path("../data") 
OUTPUT_FILE = "../data/Raw_Dataset_LinkedIn.csv"

COLUMN_ORDER = [
    # identifiers
    "User ID",
    "Profile Summary",
    "Post URL",
    "Post Timestamp DT",

    # content
    "Post Content",
    "Post Content Length",
    "Hook Length",
    "Emoji Count",
    "Hashtag Count",
    "Linebreak Count",
    "Link Count",
    "Semantic Alignment",
    "Sentiment Score",

    # media
    "Has Image",
    "Has Video",

    # network & activity
    "Network Size",

    # time features
    "Hour",
    "Day Of Week",

    # targets
    "Engagement Score",
    "Relative Engagement",
    "Is High Performing",
]

In [None]:
# Robust date parser for LinkedIn date formats
def robust_date_parser(series):
    # Try multiple formats to parse LinkedIn dates correctly.
    # List of formats possible in LinkedIn data
    formats = [
        '%d.%b.%y',          # 15.Dec.25
        '%Y-%m-%d %H:%M:%S', # 2025-12-16 09:19:30
        '%d.%m.%Y %H:%M',    # 12.12.2025 15:26
        '%m/%d/%Y %H:%M:%S'  # Potential US format
    ]
    
    parsed = pd.to_datetime(series, format='mixed', errors='coerce') # Try default first
    
    for fmt in formats:
        mask = parsed.isna()
        if not mask.any(): break
        parsed[mask] = pd.to_datetime(series[mask], format=fmt, errors='coerce')
        
    return parsed

# Processes one individual data folder and returns a DataFrame.
def process_single_user(folder):
    shares_path = folder / "Shares.csv"
    conn_path = folder / "Connections.csv"
    profile_path = folder / "Profile.csv"

    if not (shares_path.exists() and conn_path.exists()):
        return None

    # Load Connections and calculate Network Size growth
    df_conn = pd.read_csv(conn_path)
    df_conn['Connected On DT'] = robust_date_parser(df_conn['Connected On'])
    df_conn = df_conn.dropna(subset=['Connected On DT']).sort_values('Connected On DT')
    df_conn['Network Size'] = range(1, len(df_conn) + 1)

    # Load Shares
    df_shares = pd.read_csv(shares_path)
    df_shares['Post Timestamp DT'] = robust_date_parser(df_shares['Post Timestamp'])
    df_shares = df_shares.dropna(subset=['Post Timestamp DT']).sort_values('Post Timestamp DT')

    # Merge Network Size at the exact time of each post
    df_merged = pd.merge_asof(
        df_shares, 
        df_conn[['Connected On DT', 'Network Size']],
        left_on='Post Timestamp DT', 
        right_on='Connected On DT',
        direction='backward'
    )
    df_merged['Network Size'] = df_merged['Network Size'].fillna(1)

    # Calculate Engagement Score (Excluding Reposts as requested)
    df_merged['Likes'] = pd.to_numeric(df_merged['Likes'], errors='coerce').fillna(0)
    df_merged['Comments'] = pd.to_numeric(df_merged['Comments'], errors='coerce').fillna(0)
    df_merged['Engagement Score'] = (df_merged['Likes'] * 1) + (df_merged['Comments'] * 3)

    # Normalize (Relative Engagement per 1000 connections)
    df_merged['Relative Engagement'] = (df_merged['Engagement Score'] / df_merged['Network Size']) * 1000

    # Set Target Label (Success = Above User's Median)
    median_val = df_merged['Relative Engagement'].median()
    df_merged['Is High Performing'] = (df_merged['Relative Engagement'] > median_val).astype(int)

    # Add Profile Metadata for NLP
    if profile_path.exists():
        prof = pd.read_csv(profile_path)
        headline = str(prof['Headline'].iloc[0]) if not pd.isna(prof['Headline'].iloc[0]) else ""
        summary = str(prof['Summary'].iloc[0]) if not pd.isna(prof['Summary'].iloc[0]) else ""
        # df_merged['Profile Summary'] = f"{summary}"
        # df_merged['Profile Summary'] = f"{headline} {summary}"
        df_merged['Profile Summary'] = f"{headline} {summary}"
    else:
        df_merged['Profile Summary'] = ""

    df_merged['User ID'] = folder.name

    return df_merged

def count_visual_emoji(s: str) -> int:
    clusters = regex.findall(r"\X", str(s))
    return sum(
        1 for g in clusters
        if regex.search(r"\p{Extended_Pictographic}|\p{Regional_Indicator}", g)
    )

def count_links(text):
    if pd.isna(text):
        return 0
    # Matches most URL formats
    links = regex.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(text))
    return len(links)


def add_additional_features(df):
    # Introduce new features based on Post Content
    df['Hook Length'] = df['Post Content'].apply(lambda x: len(str(x).split('\n')[0]))
    df["Emoji Count"] = df["Post Content"].apply(count_visual_emoji)
    df['Hashtag Count'] = df['Post Content'].apply(lambda x: len(regex.findall(r'#\w+', str(x))))
    df['Linebreak Count'] = df['Post Content'].apply(lambda x: str(x).count('\n'))
    df['Link Count'] = df['Post Content'].apply(count_links)
    df['Hour'] = df['Post Timestamp DT'].dt.hour
    df['Day Of Week'] = df['Post Timestamp DT'].dt.dayofweek # Monday=0, Sunday=6
    return df

def cleanup_data(df):
    original_count = len(df)
    print(f" - Original rows: {original_count}")

    # Deduplicate based on Post URL
    df = df.drop_duplicates(subset=['Post URL']).copy()
    print(f" - Rows after deduplication:  {len(df)}")

    # Remove unnecessary Has Document columns
    df.drop(columns=["Has Document"], inplace=True)

    # Introduce a feature post content length
    df['Post Content Length'] = df['Post Content'].str.len().fillna(0)

    # Filter out rows that are TRULY empty (NaN)
    df = df.dropna(subset=['Post Content']).copy()
 
    print(f"Cleanup complete:")
    print(f" - Original rows: {original_count}")
    print(f" - Cleaned rows:  {len(df)}")
    print(f" - Removed:       {original_count - len(df)}")

    return df

def arrange_columns_strict(df: pd.DataFrame, order: list[str]) -> pd.DataFrame:
    missing = [c for c in order if c not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")
    return df[order].copy()

In [None]:
# Execute consolidation and feature engineering
all_results = []

if DATA_ROOT.exists():
    for folder in DATA_ROOT.iterdir():
        if folder.is_dir():
            #print(f"Recreating data for: {folder.name}...")
            df = process_single_user(folder)
            if df is not None:
                all_results.append(df)

if all_results:
    # Concatenate all user data into one Raw Master Dataset
    master_df = pd.concat(all_results, ignore_index=True)
    # Clean up data
    master_df = cleanup_data(master_df).copy()
    # Add additional features
    master_df = add_additional_features(master_df)

    print(f"User Breakdown:\n{master_df['User ID'].value_counts()}")
else:
    print("No data was found. Please ensure your folder structure is: /data/Name/Shares.csv")

 - Original rows: 10912
 - Rows after deduplication:  10531
Cleanup complete:
 - Original rows: 10912
 - Cleaned rows:  8542
 - Removed:       2370
User Breakdown:
User ID
BuÃàsÃßra CosÃßkuner       1053
Marc Hauser             1049
Michael Wood             973
Jonas Kamber             650
Andy Lavicka             442
Laurent Decrue           438
Reto Laemmler            436
Oliver Ganz              372
Tigran Arzumanov         341
David Butler             305
Sabine Wildemann         227
Lisa Winter              207
Tobias Clemens           165
Valentin Binnendijk      159
Dr. Martin Feuz          153
Kateryna Osadchuk        144
Beat Brun                138
Andreas Stutz            133
Arinda Huber-Bouman      126
Theresa Engl             113
Ksenija Korolova         103
Tilman Eberle            103
Joshua Steffen            98
Michael Lanker            91
Bernhard von Allmen       71
Gerhard Wesp              65
Philippe Theis            63
Julien Silva              51
ReneÃÅ Goebel

In [None]:
# Load NLP Models for semantic and setiment analysis

print("Loading NLP Models...")

# Model for vector embeddings
semantic_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment", use_fast=False)
sentiment_model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# Sentiment analysis pipeline
sent_pipe = pipeline(
    "sentiment-analysis",
    model=sentiment_model,
    tokenizer=tokenizer,
    batch_size=32,       
    device=-1,
    top_k=None,
    truncation=True
)

print("Models loaded successfully.\n")

Loading NLP Models...


Device set to use cpu


Models loaded successfully.



In [None]:
# Helper functions semantic alignment and sentiment scores

def clean_text(text):
    if not isinstance(text, str) or pd.isna(text):
        return ""
    
    # Remove URLs (Standard regex)
    text = regex.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    # Remove Emojis & Symbols via Unicode Category
    # We keep: L (Letters), N (Numbers), P (Punctuation), Z (Spaces)
    # We remove: S (Symbols/Emojis), M (Marks/Accents that aren't letters)
    text = "".join(
        ch for ch in text 
        if unicodedata.category(ch)[0] in ['L', 'N', 'P', 'Z']
    )
    
    # Clean up Whitespace and Newlines
    text = text.replace('\n', ' ').replace('\r', ' ')
    text = regex.sub(r'\s+', ' ', text).strip()
    
    return text


def add_semantic_alignment(df):
    # Ensure all text is string and handle NaNs
    df['Post Content Clean'] = df['Post Content Clean'].fillna("").astype(str)
    df['Profile Summary Clean'] = df['Profile Summary Clean'].fillna("").astype(str)
    
    # Initialize the new column
    df['Semantic Alignment'] = 0.0
    
    # We process per user to be efficient (one profile vector vs many post vectors)
    unique_users = df['User ID'].unique()
    
    for user in unique_users:
        print(f"Calculating alignment for user: {user}")
        user_mask = df['User ID'] == user

        profile_text = df.loc[user_mask, 'Profile Summary Clean'].iloc[0]

        # üö´ If profile summary empty ‚Üí alignment = 0 for all posts
        if not isinstance(profile_text, str) or profile_text.strip() == "":
            df.loc[user_mask, 'Semantic Alignment'] = 0.0
            continue

        profile_embedding = semantic_model.encode(profile_text, convert_to_tensor=True)

        posts_text = df.loc[user_mask, 'Post Content Clean'].tolist()

        # Encode posts
        post_embeddings = semantic_model.encode(posts_text, convert_to_tensor=True)
        cosine_scores = util.cos_sim(profile_embedding, post_embeddings)[0].tolist()

        # üö´ Force 0 if post content empty
        cosine_scores = [
            score if isinstance(text, str) and text.strip() != "" else 0.0
            for score, text in zip(cosine_scores, posts_text)
        ]

        df.loc[user_mask, 'Semantic Alignment'] = cosine_scores

    return df

def add_sentiment_score(text):
    if pd.isna(text) or str(text).strip() == "":
        return 0.0
    print(f"Calculating sentiment for text: {text[:30]}...")
    scores = sent_pipe(str(text))[0]
    # labels: 1..5 stars ‚Üí map to [-1, +1]
    star = int(max(scores, key=lambda x: x["score"])["label"][0])
    return (star - 3) / 2


In [None]:
# Execution of semantic alignment and sentiment scoring
# Using cleaned text for semantic alignment
# Using original text for sentiment (includes icons etc.)

# Clean Post Content
master_df['Post Content Clean'] = master_df['Post Content'].apply(clean_text)
# Clean Profile Summary
master_df['Profile Summary Clean'] = master_df['Profile Summary'].apply(clean_text)

# Run vectorization
enriched_df = add_semantic_alignment(master_df)

# Add Sentiment Score
print("Calculating sentiment scores for all posts...")
enriched_df["Sentiment Score"] = enriched_df["Post Content"].apply(add_sentiment_score)

# Arrange columns
enriched_df = arrange_columns_strict(enriched_df, COLUMN_ORDER)

# Save the new version
enriched_df = arrange_columns_strict(enriched_df, COLUMN_ORDER)
enriched_df.sort_values(by=['User ID', 'Post Timestamp DT'], ascending=[True, False], inplace=True)
enriched_df.to_csv(OUTPUT_FILE, index=False)
print(f"\nSUCCESS: Enriched dataset saved to {OUTPUT_FILE}")

# Preview the results
print("Preview of sentiment score.")
print(enriched_df[['Post Content', 'Sentiment Score']].head())

print("\nPreview of top Alignment scores:")
print(enriched_df[['Post Content', 'Semantic Alignment']].sort_values(by='Semantic Alignment', ascending=False).head())

Calculating alignment for user: Bernardo Romero
Calculating alignment for user: Valentin Binnendijk
Calculating alignment for user: Michael Wood
Calculating alignment for user: Tigran Arzumanov
Calculating alignment for user: Dr. Martin Feuz
Calculating alignment for user: Robin Setzer
Calculating alignment for user: David Butler
Calculating alignment for user: Tobias Clemens
Calculating alignment for user: Lisa Winter
Calculating alignment for user: BuÃàsÃßra CosÃßkuner
Calculating alignment for user: Andreas Stutz
Calculating alignment for user: Oliver Ganz
Calculating alignment for user: Daniel Graf
Calculating alignment for user: ReneÃÅ Goebels
Calculating alignment for user: Kateryna Osadchuk
Calculating alignment for user: Jonas Kamber
Calculating alignment for user: Oliver Notz
Calculating alignment for user: Michael Scheiwiller
Calculating alignment for user: Alexander RuÃàegg
Calculating alignment for user: Philippe Theis
Calculating alignment for user: Martin Nyffenegger
Calc