In [24]:
# Import libraries for data processing, modeling, and evaluation
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# CHANGE THE PATH TO YOUR DATA
# Only DATA_DIR needs editing: both CSV locations below are derived from it,
# so the machine-specific prefix lives in exactly one place.
DATA_DIR = Path("/Users/marvinchen/Desktop/ECE 473/forecast-pumpdump/data")
PRICE_CSV = DATA_DIR / "crypto_price" / "DOGE_price_data.csv"
SENTIMENT_CSV = DATA_DIR / "reddit_posts" / "post_sentiment" / "dogecoin_sentiment_wo_emoji.csv"

# Load price data (OHLCV) from CSV; "datetime" is parsed as a timestamp
# because it is the merge key used later in the notebook.
price_data = pd.read_csv(PRICE_CSV, parse_dates=["datetime"])

# Load per-post sentiment scores. Judging by the file name ("wo_emoji") this
# is the variant computed WITHOUT emojis. "created_utc" is parsed as a
# timestamp because it is the merge key on the sentiment side.
sentiment_data = pd.read_csv(SENTIMENT_CSV, parse_dates=["created_utc"])

In [15]:
# Merge price and sentiment data: each price row is paired with the single
# sentiment row whose timestamp is closest to it.
# NOTE(review): direction="nearest" may pick a post created AFTER the price
# bar's timestamp, which is a potential look-ahead leak. Consider
# direction="backward" if features must only use information available at
# bar time; confirm against the intended experiment before changing.
merged = pd.merge_asof(
    price_data.sort_values("datetime"),
    sentiment_data.sort_values("created_utc"),
    left_on="datetime",
    right_on="created_utc",
    direction="nearest",
)

# Create target variable: 1 when the next bar closes above the current one.
merged["next_close"] = merged["close"].shift(-1)

# Drop rows whose label is undefined BEFORE building it. A comparison against
# NaN evaluates to False, so casting to int would silently label the final
# row (which has no next bar) as 0, and a later dropna on "price_up" would
# never remove it.
merged = merged.dropna(subset=["close", "next_close"])
merged["price_up"] = (merged["next_close"] > merged["close"]).astype(int)

In [16]:
# Ensure sentiment columns are numeric.
# Non-numeric VADER values become NaN (errors="coerce") and are dropped below.
merged["vader_sentiment"] = pd.to_numeric(merged["vader_sentiment"], errors="coerce")

# FinBERT labels -> signed score. Labels missing from the mapping become NaN.
FINBERT_LABEL_SCORES = {"neutral": 0, "positive": 1, "negative": -1}
finbert_raw = merged["finbert_sliding_sentiment"]
# Only map while the column still holds labels. Without this guard, running
# the cell a second time would look up the already-numeric scores in the
# label dict, turn every value into NaN, and the dropna below would then
# empty the whole frame.
if not pd.api.types.is_numeric_dtype(finbert_raw):
    merged["finbert_sliding_sentiment"] = finbert_raw.map(FINBERT_LABEL_SCORES)

# Keep only rows where both sentiment scores are available.
merged = merged.dropna(subset=["vader_sentiment", "finbert_sliding_sentiment"])

In [17]:
# Feature engineering
# Lagged sentiment: value from `lag` rows earlier. Lags are row-based, so
# they are evenly spaced in time only if no rows were removed in between.
for lag in range(1, 7):
    merged[f"vader_lag{lag}"] = merged["vader_sentiment"].shift(lag)
    merged[f"finbert_lag{lag}"] = merged["finbert_sliding_sentiment"].shift(lag)

# pct_change yields +/-inf when the previous value is 0 (e.g. a zero-volume
# bar). dropna() does not remove inf, and the scaler/model downstream reject
# it, so convert inf to NaN here and let the dropna below discard those rows.
# This is done before the rolling std so one inf cannot contaminate the
# volatility windows.
non_finite = [np.inf, -np.inf]
merged["returns"] = merged["close"].pct_change().replace(non_finite, np.nan)
merged["volatility"] = merged["returns"].rolling(6).std()  # std of the last 6 returns
merged["volume_change"] = merged["volume"].pct_change().replace(non_finite, np.nan)

# Removes the warm-up rows created by shift/rolling/pct_change.
# NOTE(review): this drops a row if ANY column is NaN, including columns that
# are not model features; confirm that is intended.
merged.dropna(inplace=True)

In [25]:
# Model inputs, grouped by origin; the concatenation order fixes the column
# order of X (and therefore the order of the fitted coefficients).
market_features = ["returns", "volatility", "volume_change"]
vader_features = ["vader_lag1", "vader_lag2", "vader_lag3"]
finbert_features = ["finbert_lag1", "finbert_lag2", "finbert_lag3"]
features = market_features + vader_features + finbert_features

# Design matrix and binary target (1 = next close is higher).
X = merged.loc[:, features]
y = merged.loc[:, "price_up"]

# Class balance of the target, as fractions of all rows.
class_balance = y.value_counts(normalize=True)
print(class_balance)

price_up
1    0.530259
0    0.469741
Name: proportion, dtype: float64


In [19]:
# Ordered 80/20 hold-out (no shuffling): the first 80% of rows train the
# model, the remaining rows are kept for evaluation.
split_idx = int(0.8 * len(X))
X_train, y_train = X.iloc[:split_idx], y.iloc[:split_idx]
X_test, y_test = X.iloc[split_idx:], y.iloc[split_idx:]

In [20]:
# Standardize features. The scaler's statistics are learned from the training
# split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

In [21]:
# Fit a logistic-regression classifier on the standardized training split.
# max_iter is raised to 1000 to give the solver more iterations to converge.
logreg_params = {"max_iter": 1000}
model = LogisticRegression(**logreg_params)
model.fit(X_train_scaled, y_train)

In [22]:
# Score the held-out split: hard class predictions feed the accuracy metric,
# the second predict_proba column feeds ROC-AUC.
test_probabilities = model.predict_proba(X_test_scaled)
y_pred = model.predict(X_test_scaled)
y_proba = test_probabilities[:, 1]

test_accuracy = accuracy_score(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)
print(f"Accuracy: {test_accuracy:.2f}")
print(f"AUC-ROC: {test_auc:.2f}")

# Rank features by coefficient magnitude. The sign is discarded in the
# printed ranking, so it shows strength of association only, not direction.
coefs = pd.Series(model.coef_[0], index=features)
coef_magnitudes = coefs.abs().sort_values(ascending=False)
print("Top Predictive Features:")
print(coef_magnitudes.head(5))

Accuracy: 0.54
AUC-ROC: 0.54
Top Predictive Features:
finbert_lag3    0.510584
finbert_lag2    0.410136
volatility      0.288777
returns         0.233386
vader_lag2      0.128560
dtype: float64
