In [7]:
import pandas as pd

# Paths
google_path = "/home/rupam/DataAlpha/data/raw/processed/google_trends_sentiment_2020.csv"
returns_path = "/home/rupam/DataAlpha/data/raw/processed/all_returns.csv"

# Load
df_google = pd.read_csv(google_path, parse_dates=['date'])
df_returns = pd.read_csv(returns_path, parse_dates=['date'])

# Get trading dates
all_dates = pd.date_range(df_returns['date'].min(), df_returns['date'].max(), freq='D')
unique_tickers = df_returns['ticker'].unique()

# Build full date ticker grid
calendar = pd.MultiIndex.from_product([all_dates, unique_tickers], names=['date', 'ticker']).to_frame(index=False)
calendar.sort_values(['ticker', 'date'], inplace=True)

# Merge raw google data
google_full = calendar.merge(df_google, on=['date', 'ticker'], how='left')

# Sort
google_full.sort_values(['ticker', 'date'], inplace=True)

# For each ticker, expand weekend data into next Monday-Saturday
def expand_weekly(series, dates):
    values = series.copy()
    last_value = None
    for i in range(len(values)):
        weekday = dates.iloc[i].weekday()  # Monday=0, Sunday=6
        if weekday in [5,6] and pd.notnull(values.iloc[i]):
            last_value = values.iloc[i]
        elif weekday in [0,1,2,3,4]:  # Mon-Fri
            if last_value is not None:
                values.iloc[i] = last_value
        # Reset last_value on next weekend
        if weekday == 4:  # Friday
            last_value = None
    return values

google_full['trend_score'] = google_full.groupby('ticker').apply(
    lambda g: expand_weekly(g['trend_score'], g['date'])
).reset_index(drop=True)

# Filter only trading dates (weekdays)
google_expanded = google_full[google_full['date'].dt.weekday < 5]

# Save
output_path = "/home/rupam/DataAlpha/data/raw/processed/google_trends_expanded.csv"
google_expanded.to_csv(output_path, index=False)


  google_full['trend_score'] = google_full.groupby('ticker').apply(


In [9]:
import pandas as pd

# Paths
reddit_path = "/home/rupam/DataAlpha/data/raw/processed/reddit_sentiment_2020.csv"
returns_path = "/home/rupam/DataAlpha/data/raw/processed/all_returns.csv"

# Load
df_reddit = pd.read_csv(reddit_path, parse_dates=['date'])
df_returns = pd.read_csv(returns_path, parse_dates=['date'])

# Build calendar
all_dates = pd.date_range(df_returns['date'].min(), df_returns['date'].max(), freq='D')
unique_tickers = df_returns['ticker'].unique()
calendar = pd.MultiIndex.from_product([all_dates, unique_tickers], names=['date', 'ticker']).to_frame(index=False)
calendar.sort_values(['ticker', 'date'], inplace=True)

# Merge reddit data
reddit_full = calendar.merge(df_reddit, on=['date', 'ticker'], how='left')

# Sort
reddit_full.sort_values(['ticker', 'date'], inplace=True)

# Forward fill 5 days limit, preserve weekday data, else 0
def ffill_with_limit(series, limit):
    filled = series.fillna(method='ffill', limit=limit)
    return filled.fillna(0)

reddit_full['sentiment'] = reddit_full.groupby('ticker')['sentiment'].transform(lambda s: ffill_with_limit(s, 5))

# Filter only trading dates (weekdays)
reddit_expanded = reddit_full[reddit_full['date'].dt.weekday < 5]

# Save
output_path = "/home/rupam/DataAlpha/data/raw/processed/reddit_sentiment_expanded.csv"
reddit_expanded.to_csv(output_path, index=False)


  filled = series.fillna(method='ffill', limit=limit)


In [10]:
import pandas as pd

# Paths
news_path = "/home/rupam/DataAlpha/data/raw/processed/news_sentiment_2020.csv"
returns_path = "/home/rupam/DataAlpha/data/raw/processed/all_returns.csv"

# Load
df_news = pd.read_csv(news_path, parse_dates=['date'])
df_returns = pd.read_csv(returns_path, parse_dates=['date'])

# Build calendar
all_dates = pd.date_range(df_returns['date'].min(), df_returns['date'].max(), freq='D')
unique_tickers = df_returns['ticker'].unique()
calendar = pd.MultiIndex.from_product([all_dates, unique_tickers], names=['date', 'ticker']).to_frame(index=False)
calendar.sort_values(['ticker', 'date'], inplace=True)

# Merge news data
news_full = calendar.merge(df_news, on=['date', 'ticker'], how='left')

# Sort
news_full.sort_values(['ticker', 'date'], inplace=True)

# Forward fill 10 days limit, preserve weekday data, else 0
def ffill_with_limit(series, limit):
    filled = series.fillna(method='ffill', limit=limit)
    return filled.fillna(0)

news_full['sentiment'] = news_full.groupby('ticker')['sentiment'].transform(lambda s: ffill_with_limit(s, 10))

# Filter only trading dates (weekdays)
news_expanded = news_full[news_full['date'].dt.weekday < 5]

# Save
output_path = "/home/rupam/DataAlpha/data/raw/processed/news_sentiment_expanded.csv"
news_expanded.to_csv(output_path, index=False)


  filled = series.fillna(method='ffill', limit=limit)


In [11]:
import pandas as pd

# Paths
returns_path = "/home/rupam/DataAlpha/data/raw/processed/all_returns.csv"
reddit_path = "/home/rupam/DataAlpha/data/raw/processed/reddit_sentiment_expanded.csv"
news_path = "/home/rupam/DataAlpha/data/raw/processed/news_sentiment_expanded.csv"
google_path = "/home/rupam/DataAlpha/data/raw/processed/google_trends_expanded.csv"

# Load
df_returns = pd.read_csv(returns_path, parse_dates=['date'])
df_reddit = pd.read_csv(reddit_path, parse_dates=['date'])
df_news = pd.read_csv(news_path, parse_dates=['date'])
df_google = pd.read_csv(google_path, parse_dates=['date'])

# Merge
df = df_returns.merge(df_reddit[['date', 'ticker', 'sentiment']], on=['date', 'ticker'], how='left')
df.rename(columns={'sentiment': 'reddit_sentiment'}, inplace=True)

df = df.merge(df_news[['date', 'ticker', 'sentiment']], on=['date', 'ticker'], how='left')
df.rename(columns={'sentiment': 'news_sentiment'}, inplace=True)

df = df.merge(df_google[['date', 'ticker', 'trend_score']], on=['date', 'ticker'], how='left')

# Save
output_path = "/home/rupam/DataAlpha/data/raw/processed/final_merged_scores.csv"
df.to_csv(output_path, index=False)
