# Feature Engineering
Create a Dataframe for model input:
- Feature creation
- Daily aggregation of features
- Vizualization

# Import Libraries and Data

In [73]:
# --- Standard Libraries ---
import os
import re
from collections import Counter

# --- Data Science ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# --- NLP ---
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langdetect import detect, DetectorFactory, LangDetectException

# --- Transformers ---
import torch
from torch.nn.functional import sigmoid
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BertTokenizer,
    BertForSequenceClassification
)
from scipy.special import softmax

# --- Utils ---
from tqdm.auto import tqdm
from IPython.display import display

# --- Setup ---
tqdm.pandas()
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\paull\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\paull\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [74]:
TEST = True

In [None]:
musk_twitter_data_all = pd.read_csv(os.path.join('cleaned', 'musk_twitter_data_all.csv'),parse_dates=["createdAt"])
musk_twitter_data_nlp = pd.read_csv(os.path.join('cleaned', 'musk_twitter_data_nlp.csv'),parse_dates=["createdAt"])

for df in (musk_twitter_data_all, musk_twitter_data_nlp):
    df["isRetweet"] = df["isRetweet"].astype(str).str.lower()
    df["possiblySensitive"] = df["possiblySensitive"].astype(str).str.lower()
    df["fullText"] = df["fullText"].astype(str)

musk_twitter_data_nlp["text_raw"] = musk_twitter_data_nlp["text_raw"].astype(str)
musk_twitter_data_nlp["text_lemmatized"] = musk_twitter_data_nlp["text_lemmatized"].astype(str)

for df in (musk_twitter_data_all, musk_twitter_data_nlp):
    df["date"] = df["createdAt"].dt.date

if TEST:
    start_date = pd.to_datetime("2025-01-01").date()
else:
    start_date = pd.to_datetime("2015-01-01").date()

end_date = musk_twitter_data_all["date"].max()

mask_all = (musk_twitter_data_all["date"] >= start_date) & (musk_twitter_data_all["date"] <= end_date)
musk_twitter_data_all = musk_twitter_data_all.loc[mask_all].reset_index(drop=True)

mask_nlp = (musk_twitter_data_nlp["date"] >= start_date) & (musk_twitter_data_nlp["date"] <= end_date)
musk_twitter_data_nlp = musk_twitter_data_nlp.loc[mask_nlp].reset_index(drop=True)

final_daily_df = pd.DataFrame({
    'date': pd.date_range(start=start_date, end=end_date)
})
final_daily_df["date"] = final_daily_df["date"].dt.date 

print("NLPTweets:", musk_twitter_data_nlp.shape, "AllTweets:", musk_twitter_data_all.shape)
musk_twitter_data_nlp.info()
musk_twitter_data_all.info()

  musk_twitter_data_all = pd.read_csv(os.path.join('processed', 'musk_twitter_data_all.csv'),parse_dates=["createdAt"])
  musk_twitter_data_nlp = pd.read_csv(os.path.join('processed', 'musk_twitter_data_nlp.csv'),parse_dates=["createdAt"])


NLPTweets: (4111, 28) AllTweets: (6327, 25)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4111 entries, 0 to 4110
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        4111 non-null   int64              
 1   url                       4111 non-null   object             
 2   twitterUrl                4111 non-null   object             
 3   fullText                  4111 non-null   object             
 4   retweetCount              4111 non-null   float64            
 5   replyCount                4111 non-null   float64            
 6   likeCount                 4111 non-null   float64            
 7   quoteCount                4111 non-null   float64            
 8   viewCount                 4111 non-null   float64            
 9   createdAt                 4111 non-null   datetime64[ns, UTC]
 10  bookmarkCount             4111 non-null 

# Tweet activity
New features:
- Number of tweets per day

In [76]:
tweet_counts_daily = (
    musk_twitter_data_all
    .groupby("date")
    .size()
    .reset_index(name="tweet_count")
)

# Sentiment Analysis

New Features:
- Poitve, Neutral ans Negative percentage of posts
- Polarization: Tweets with pos/neg > 0,6

In [77]:
# Tweet sentiment analysis
# Model: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def preprocess(text):
    return text.replace("\n", " ").strip()

def get_sentiment_probs(text):
    text = preprocess(text)
    tokens = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        output = model(**tokens)
    probs = softmax(output.logits.cpu().numpy()[0])
    return {
        "sentiment": ['negative', 'neutral', 'positive'][probs.argmax()],
        "neg": probs[0],
        "neu": probs[1],
        "pos": probs[2],
    }

def polarized_label(row):
    return "polarized" if max(row["pos"], row["neg"]) > 0.6 else "not_polarized"

results = musk_twitter_data_nlp["text_raw"].progress_apply(get_sentiment_probs).apply(pd.Series)
musk_twitter_data_nlp = pd.concat([musk_twitter_data_nlp, results], axis=1)
musk_twitter_data_nlp["sentiment_polarity"] = musk_twitter_data_nlp.apply(polarized_label, axis=1)

# Daily aggregation
sentiment_avg = (musk_twitter_data_nlp.groupby("date")[["neg", "neu", "pos"]].mean().reset_index())
nlp_counts = (musk_twitter_data_nlp.groupby("date").size().reset_index(name="nlp_tweet_count"))
polarization = (musk_twitter_data_nlp.groupby("date")["sentiment_polarity"].value_counts(normalize=True).unstack(fill_value=0).reset_index())

# 4. Merge zu tagesbasiertem Datensatz
sentiment_daily = sentiment_avg.merge(nlp_counts, on="date", how="left")
sentiment_daily = sentiment_daily.merge(polarization, on="date", how="left")

  0%|          | 0/4111 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


# Emotions & Personality

New Features:
- Ekman Emotions: anger, disgust, fear, joy, neutral, sadness, surprise
- Big 5 personality traits: Extroversion, Neuroticism, Agreeableness, Conscientiousness, Openness

In [78]:
# Ekman Emotionen
# Model: https://huggingface.co/j-hartmann/emotion-english-distilroberta-base
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

# Helper that returns a dict of probabilities
def get_emotions(text):
    tokens = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        logits = model(**tokens).logits
    probs = softmax(logits.numpy()[0])
    return dict(zip(emotion_labels, probs))

# Apply to every tweet
print("Calculating emotion probabilities...")
emotion_probs = musk_twitter_data_nlp['text_raw'].progress_apply(get_emotions).apply(pd.Series)

# Append those new columns back onto your original DF
musk_twitter_data_nlp = pd.concat([musk_twitter_data_nlp.reset_index(drop=True), emotion_probs],axis=1)

# Aggregate by day (mean probability for each emotion)
print("Aggregating daily emotions...")
emotion_daily = (musk_twitter_data_nlp.groupby('date')[emotion_labels].mean().reset_index())

Calculating emotion probabilities...


  0%|          | 0/4111 [00:00<?, ?it/s]

Aggregating daily emotions...


In [79]:
# Big Five Personality Traits
# Model: https://huggingface.co/Minej/bert-base-personality
tokenizer = BertTokenizer.from_pretrained("Minej/bert-base-personality")
model = BertForSequenceClassification.from_pretrained("Minej/bert-base-personality")

personality_labels = ['Extroversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness']

def get_personality(text):
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    probs = sigmoid(outputs.logits).squeeze().numpy()
    return dict(zip(personality_labels, probs))

# Apply to every tweet
print("Calculating personality traits...")
personality_probs = musk_twitter_data_nlp['text_raw'].progress_apply(get_personality).apply(pd.Series)

# Append those new columns back onto your original DF
musk_twitter_data_nlp = pd.concat([musk_twitter_data_nlp.reset_index(drop=True), personality_probs], axis=1)

# Aggregate by day (mean probability for each emotion)
print("Aggregating daily personality...")
personality_daily = (musk_twitter_data_nlp.groupby('date')[personality_labels].mean().reset_index())

Calculating personality traits...


  0%|          | 0/4111 [00:00<?, ?it/s]

Aggregating daily personality...


# Topic and word counts

New Features: 
- Daily Word counts
    - Rationale of Definition of words:
        - Company/ticker terms (e.g. tesla, tsla, spacex) capture direct references to publicly traded entities.
        - Product names (e.g. model, cybertruck, starship) often precede news that can move stock prices.
        - Crypto tokens (e.g. bitcoin, dogecoin, ethereum, crypto) map to Musk-driven volatility in the digital-asset markets
        - Financial keywords (e.g. stock, market, price, profit, loss, revenue) directly signal earnings or valuation discussions.
        - Macro terms (e.g. inflation, interest) reflect broader economic commentary that can sway sentiment.
        - Action verbs (buy, sell) often presage trading intent or recommendations.
- Topics of posts

In [81]:
# Words
def tokenize(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|@\S+|[^a-z\s]", "", text)
    return text.split()

# All words
all_tokens = musk_twitter_data_nlp["text_lemmatized"].dropna().apply(tokenize)
flat_tokens = [token for sublist in all_tokens for token in sublist]
word_counts = Counter(flat_tokens)
word_counts = pd.DataFrame(word_counts.items(), columns=["word", "count"]).sort_values("count", ascending=False).reset_index(drop=True)

# Top 20 words daily
top20 = [
    'tesla','tsla','stock','market','price','profit','loss','revenue',
    'inflation','interest','bitcoin','dogecoin','crypto','ethereum',
    'spacex','model','cybertruck','starship','buy','sell'
]

top_word_df = musk_twitter_data_nlp.dropna(subset=['text_lemmatized']).copy()
top_word_df['tokens'] = top_word_df['text_lemmatized'].apply(tokenize)
top_word_df = top_word_df.explode('tokens')
top_word_df = top_word_df[top_word_df['tokens'].isin(top20)].copy()

daily_word_counts = (
    top_word_df
    .groupby(['date','tokens'])
    .size()               
    .unstack(fill_value=0)   
    .reindex(columns=top20,    
               fill_value=0)
    .sort_index()
)

In [82]:
# Topics
model_name = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

topic_labels = [
    "arts_&_culture", "business_&_entrepreneurs", "celebrity_&_pop_culture",
    "diaries_&_daily_life", "family", "fashion_&_style", "film_tv_&_video",
    "fitness_&_health", "food_&_dining", "gaming", "learning_&_educational",
    "music", "news_&_social_concern", "other_hobbies", "relationships",
    "science_&_technology", "sports", "travel_&_adventure", "youth_&_student_life"
]

def get_topics(text):
    tokens = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens)
    probs = softmax(output.logits.numpy()[0])
    return dict(zip(topic_labels, probs))

# Apply to every tweet
print("Calculating topic probabilities...")
topic_scores = musk_twitter_data_nlp['text_lemmatized'].progress_apply(get_topics).apply(pd.Series)

# Append those new columns back onto your original DF
musk_twitter_data_nlp = pd.concat([musk_twitter_data_nlp.reset_index(drop=True), topic_scores], axis=1)

# Aggregate by day (mean probability for each topic)
print("Aggregating daily topics...")
topics_daily = (musk_twitter_data_nlp.groupby('date')[topic_labels].mean().reset_index())

Calculating topic probabilities...


  0%|          | 0/4111 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Aggregating daily topics...


# Merge and Create final df
Merge the daily dfs in one dataframe and create csv

In [83]:
# Merge with complete date, fill missing days with zero
final_daily_df = final_daily_df.merge(tweet_counts_daily, on="date", how="left").fillna(0)
final_daily_df["tweet_count"] = final_daily_df["tweet_count"].astype(int)
final_daily_df = final_daily_df.merge(sentiment_daily, on="date", how="left")
final_daily_df = final_daily_df.merge(emotion_daily, on="date", how="left")
final_daily_df = final_daily_df.merge(personality_daily, on="date", how="left")
final_daily_df = final_daily_df.merge(daily_word_counts, on="date", how="left")
final_daily_df = final_daily_df.merge(topics_daily, on="date", how="left")

display(final_daily_df.info())
display(final_daily_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 59 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   date                      103 non-null    object 
 1   tweet_count               103 non-null    int64  
 2   neg                       103 non-null    float32
 3   neu                       103 non-null    float32
 4   pos                       103 non-null    float32
 5   nlp_tweet_count           103 non-null    int64  
 6   not_polarized             103 non-null    float64
 7   polarized                 103 non-null    float64
 8   anger                     103 non-null    float32
 9   disgust                   103 non-null    float32
 10  fear                      103 non-null    float32
 11  joy                       103 non-null    float32
 12  neutral                   103 non-null    float32
 13  sadness                   103 non-null    float32
 14  surprise  

None

Unnamed: 0,date,tweet_count,neg,neu,pos,nlp_tweet_count,not_polarized,polarized,anger,disgust,...,gaming,learning_&_educational,music,news_&_social_concern,other_hobbies,relationships,science_&_technology,sports,travel_&_adventure,youth_&_student_life
0,2025-01-01,71,0.501142,0.275038,0.223821,57,0.368421,0.631579,0.144144,0.16513,...,0.002792,0.004627,0.016108,0.550684,0.032012,0.006912,0.004881,0.076754,0.005526,0.002157
1,2025-01-02,93,0.483342,0.388767,0.127891,71,0.521127,0.478873,0.154978,0.18599,...,0.019952,0.015618,0.00778,0.628945,0.023301,0.004411,0.004784,0.05836,0.003083,0.004201
2,2025-01-03,87,0.426233,0.373209,0.200558,78,0.5,0.5,0.14202,0.235458,...,0.013754,0.017788,0.016134,0.547873,0.033602,0.007094,0.039875,0.077551,0.01295,0.003873
3,2025-01-04,71,0.523883,0.304259,0.171858,58,0.396552,0.603448,0.140071,0.260048,...,0.03134,0.018186,0.00874,0.527992,0.037701,0.006327,0.042403,0.071947,0.003538,0.003296
4,2025-01-05,102,0.472078,0.317763,0.210159,77,0.441558,0.558442,0.120174,0.197293,...,0.014166,0.013848,0.018849,0.449201,0.032162,0.010302,0.007183,0.074956,0.006958,0.005427


In [84]:
final_daily_df.to_csv(os.path.join('processed', 'final_daily_df.csv'), index=False)

# Vizualization

Visualize (daily and rolling window):
- Tweet activity over time
- Sentiment development over time
- Polarisation percentage over time
- Ekman Emotions over time
- Big 5 Personality traits over time
- Word distribution over time
- Tweet topics over time

In [85]:
fig1 = px.line(
    final_daily_df,
    x="date",
    y=["tweet_count", "nlp_tweet_count"],
    labels={"value": "Tweetanzahl", "variable": "Typ"},
    title="Elon Musks Tweet-Aktivität über Zeit (interaktiv)"
)

fig2 = px.line(
    final_daily_df,
    x="date",
    y=["pos", "neu", "neg"],
    labels={"value": "Sentiment-Wahrscheinlichkeit", "variable": "Sentiment"},
    title="Durchschnittliches Sentiment pro Tag (interaktiv)"
)

fig3 = px.line(
    final_daily_df,
    x="date",
    y=["share_polarized", "share_neutral"],
    labels={"value": "Anteil", "variable": "Kategorie"},
    title="Anteil polarisiert vs. neutral pro Tag (interaktiv)"
)

fig1.show()
fig2.show()
fig3.show()

window_size = 365

rolling_sentiment = final_daily_df[['pos', 'neu', 'neg']].rolling(window=window_size, min_periods=1).mean()
rolling_meta = final_daily_df[['tweet_count', 'share_polarized', 'share_neutral']].rolling(window=window_size, min_periods=1).mean()
rolling_emotions = emotion_daily[['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']].rolling(window=window_size, min_periods=1).mean()
rolling_emotions["date"] = emotion_daily["date"]
rolling_personality = personality_daily[['Extroversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness']].rolling(window=window_size, min_periods=1).mean()
rolling_personality["date"] = personality_daily["date"]

# Plot 1: Tweet-Aktivität
fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=final_daily_df["date"], y=rolling_meta['tweet_count'], mode='lines', name='Tweet Count (Rolling)'))
fig1.add_vline(x="2022-10-27", line=dict(color="red", dash="dash"), name="Twitter-Übernahme")
fig1.update_layout(title="Rolling Tweet Count", xaxis_title="Datum", yaxis_title="Tweetanzahl")

# Plot 2: Sentiment-Trends
fig2 = go.Figure()
for col in ['pos', 'neu', 'neg']:
    fig2.add_trace(go.Scatter(x=final_daily_df["date"], y=rolling_sentiment[col], mode='lines', name=col.capitalize()))
fig2.add_vline(x="2022-10-27", line=dict(color="red", dash="dash"))
fig2.update_layout(title="Rolling Sentiment Trends", xaxis_title="Datum", yaxis_title="Anteil (0–1)")

# Plot 3: Polarisiert vs Neutral
fig3 = go.Figure()
fig3.add_trace(go.Scatter(x=final_daily_df["date"], y=rolling_meta['share_polarized'], name='Polarisiert', mode='lines'))
fig3.add_trace(go.Scatter(x=final_daily_df["date"], y=rolling_meta['share_neutral'], name='Neutral', mode='lines'))
fig3.update_layout(title="Rolling Anteil polarisiert vs. neutral", xaxis_title="Datum", yaxis_title="Anteil")

# Plot 4: Emotionen
fig4 = go.Figure()
for col in ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']:
    fig4.add_trace(go.Scatter(x=rolling_emotions["date"], y=rolling_emotions[col], mode='lines', name=col.capitalize()))
fig4.update_layout(title="Rolling Emotion Scores", xaxis_title="Datum", yaxis_title="Score")

# Plot 5: Persönlichkeit
fig5 = go.Figure()
for col in ['Extroversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness']:
    fig5.add_trace(go.Scatter(x=rolling_personality["date"], y=rolling_personality[col], mode='lines', name=col))
fig5.update_layout(title="Rolling Big Five Traits", xaxis_title="Datum", yaxis_title="Score (0–1)")

fig1.show()
fig2.show()
fig3.show()
fig4.show()
fig5.show()

# vizulaize Top 50 words
plt.figure(figsize=(14, 100))
sns.barplot(data=word_counts.head(100), x="count", y="word", palette="viridis")
plt.title("Top 50 Words in Musk's Tweets")
plt.xlabel("Count")
plt.ylabel("Word")

ValueError: All arguments should have the same length. The length of argument `y` is 2, whereas the length of previously-processed arguments ['date'] is 103