In [None]:
import pandas as pd

# Load the raw analyst-ratings table.
# NOTE(review): the path carries no file extension — confirm the file on disk
# is not actually named 'raw_analyst_ratings.csv'.
df = pd.read_csv('../data/raw_analyst_ratings')

# Jupyter renders only the LAST bare expression of a cell, so each intermediate
# result is handed to display() (provided by the IPython kernel) explicitly;
# otherwise the preview, shape and null counts are computed and thrown away.
display(df.head())
display(df.shape)
df.info()  # prints its own report to stdout and returns None
display(df.isna().sum())

# (number of rows that repeat an earlier row,
#  shape of the sub-frame containing every member of a duplicate group)
df.duplicated().sum(), df[df.duplicated(keep=False)].shape

In [None]:
# Normalise the raw timestamp column. utc=True makes pandas honour any offset
# present in the strings and hand back timezone-aware UTC values.
utc_stamps = pd.to_datetime(df['date'], utc=True)
df['date'] = utc_stamps

# Alternative when the strings are naive local UTC-4 times (no offset given):
# df['date'] = pd.to_datetime(df['date']).dt.tz_localize('Etc/GMT+4').dt.tz_convert('UTC')

# Derived calendar columns, all computed from the UTC timestamps.
df['date_utc'] = utc_stamps                                       # tz-aware, UTC
df['date_local'] = utc_stamps.dt.tz_convert('America/New_York')   # same instants, US Eastern
df['date_only'] = utc_stamps.dt.date                              # calendar date (UTC)
df['hour_utc'] = utc_stamps.dt.hour                               # hour of day (UTC)
df['weekday'] = utc_stamps.dt.day_name()                          # e.g. "Monday" (UTC day)


In [None]:
# Headline length features; a missing headline counts as length 0.
df['headline_len_chars'] = df['headline'].str.len().fillna(0)
# Use .str.len() on the token lists rather than .map(len): for a missing
# headline .str.split() yields NaN, and len(NaN) raises TypeError before
# fillna(0) can run. .str.len() propagates NaN so the fill works as intended.
df['headline_len_words'] = df['headline'].str.split().str.len().fillna(0)

# Summary stats — display() is required because this is not the cell's last
# expression and would otherwise never be rendered.
display(df[['headline_len_chars', 'headline_len_words']].describe())

# Visualize distribution of headline length in characters
import matplotlib.pyplot as plt
plt.hist(df['headline_len_chars'], bins=50)
plt.title("Headline length (chars)")
plt.xlabel("chars"); plt.ylabel("count")


In [None]:
# Tally articles per publisher and expose the result as a two-column frame
# (publisher, count), most prolific publisher first.
articles_per_publisher = df['publisher'].value_counts()
publisher_counts = articles_per_publisher.reset_index()
publisher_counts.columns = ['publisher', 'count']
publisher_counts.head(20)

# If publishers look like emails, extract domains
import re
def extract_domain(p):
    """Pull a host/domain out of a publisher string, if one is present.

    Two patterns are tried in order and the first hit wins: an e-mail style
    ``@domain`` and then an ``http(s)://domain`` prefix. A bare ``site.com``
    embedded in free text (no ``@`` and no scheme) is not detected.

    Parameters
    ----------
    p : str or missing
        Publisher value; anything ``pd.isna`` treats as missing gives ``None``.

    Returns
    -------
    str or None
        The matched run of letters, digits, dots and hyphens, or ``None`` when
        the value is missing or neither pattern matches.
    """
    if pd.isna(p):
        return None
    # E-mail form first, URL form second — order decides which one wins.
    for pattern in (r'@([A-Za-z0-9.-]+)', r'https?://([A-Za-z0-9.-]+)'):
        hit = re.search(pattern, p)
        if hit:
            return hit.group(1)
    return None

# Derive a domain per row (None where no domain can be found), store it as a
# column, then list the 20 most frequent domains.
publisher_domains = df['publisher'].apply(extract_domain)
df['publisher_domain'] = publisher_domains
df['publisher_domain'].value_counts().head(20)


In [None]:
# Horizontal bar chart of the 15 most prolific publishers. Rows are reversed
# before plotting so the largest bar ends up at the top of the axis.
top = publisher_counts.head(15)
top_bottom_up = top.iloc[::-1]
plt.barh(top_bottom_up['publisher'], top_bottom_up['count'])
plt.title("Top 15 publishers by article count")
plt.xlabel("article count")


In [None]:
# Daily article counts: index by the UTC timestamp and count rows per calendar day.
daily = df.set_index('date_utc').resample('D').size()

# Each chart gets its own figure/axes. Series.plot() without ax= draws onto the
# current axes whenever a figure is already open, which would stack the hourly
# bar chart on top of the daily time series inside this one cell.
fig_daily, ax_daily = plt.subplots(figsize=(12, 4))
daily.plot(ax=ax_daily, title="Daily article counts")

# Hourly pattern pooled across all days. Hours are in UTC; swap in
# df['date_local'] to see the pattern in US Eastern time instead.
hourly = df['date_utc'].dt.hour.value_counts().sort_index()
fig_hourly, ax_hourly = plt.subplots()
hourly.plot(kind='bar', ax=ax_hourly, title='Articles by hour (UTC)')

In [None]:
# Smooth the daily series with a trailing 7-observation mean and plot it,
# then list the ten days with the highest raw article counts.
weekly_avg = daily.rolling(window=7).mean()
weekly_avg.plot(title="7-day rolling avg of daily articles")
daily.nlargest(10)

In [None]:
import seaborn as sns

# Chronological weekday order, shared by the bar chart and the heatmap so both
# read Monday -> Sunday.
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# weekday activity (a weekday with no articles is shown as 0 rather than NaN)
weekday_counts = df['weekday'].value_counts().reindex(weekday_order, fill_value=0)
weekday_counts.plot(kind='bar', title="Articles by weekday")

# hour-of-day heatmap: hour (UTC) vs weekday.
# crosstab sorts its columns alphabetically (Friday, Monday, Saturday, ...),
# so they are reindexed into calendar order; absent weekdays become all-zero
# columns instead of disappearing from the axis.
pivot = pd.crosstab(df['date_utc'].dt.hour, df['weekday']).reindex(
    columns=weekday_order, fill_value=0
)
plt.figure(figsize=(12,6))  # new figure so the heatmap does not draw over the bar chart
sns.heatmap(pivot, cmap='viridis')
plt.title("Heatmap of articles: hour vs weekday")


In [None]:
# Dependencies for the headline token analysis.
import re
import nltk
from collections import Counter

# Fetch the stop-word corpus before its reader is imported and used.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words held as a set; used below to filter tokens.
english_stopwords = stopwords.words('english')
stop = set(english_stopwords)

def clean_text(s):
    """Normalise a headline for token counting.

    The text is lower-cased, every character that is not ``a-z``, ``0-9`` or
    whitespace is replaced by a space, runs of whitespace are collapsed to a
    single space, and leading/trailing whitespace is removed.

    Parameters
    ----------
    s : str or missing
        Raw headline; anything ``pd.isna`` treats as missing gives ``""``.

    Returns
    -------
    str
        The cleaned, single-spaced, lower-case text.
    """
    if pd.isna(s):
        return ""
    lowered = s.lower()
    # Punctuation and any non-ASCII characters turn into separators.
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.strip()

# Clean every headline, then flatten the per-headline token lists into a single
# one-token-per-row Series.
df['headline_clean'] = df['headline'].apply(clean_text)
all_tokens = df['headline_clean'].str.split().explode()

# Frequency table of the tokens that are not English stop words; show the top 30.
is_stopword = all_tokens.isin(stop)
freq = all_tokens[~is_stopword].value_counts()
freq.head(30)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigram document-term matrix over the cleaned headlines: two-word phrases
# only, scikit-learn's English stop-word list, and a min_df threshold of 10.
vect = CountVectorizer(
    ngram_range=(2,2),
    min_df=10,
    stop_words='english',
)
X = vect.fit_transform(df['headline_clean'].fillna(''))

# Summing each column gives the corpus-wide total per bigram; .A1 flattens the
# 1 x n_features result to a 1-D array so it can back a Series.
bigram_totals = X.sum(axis=0).A1
bigram_counts = pd.Series(bigram_totals, index=vect.get_feature_names_out()).sort_values(ascending=False)
bigram_counts.head(30)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Unigram document-term matrix for topic modelling (max_df=0.95, min_df=10,
# scikit-learn's English stop-word list). This rebinds `vect` and `X`.
vect = CountVectorizer(
    max_df=0.95,
    min_df=10,
    stop_words='english',
)
X = vect.fit_transform(df['headline_clean'].fillna(''))

# Fit LDA with a fixed seed (random_state=0) and the 'online' learning method.
n_topics = 8
lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    random_state=0,
)
lda.fit(X)

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic in ``model``.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of per-topic weight
        arrays that support ``argsort()``.
    feature_names : sequence of str
        Feature label for each column position in ``components_``.
    n_top_words : int
        How many features to list per topic, heaviest first.

    Returns
    -------
    None
        One line per topic is printed, each followed by a blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it and keep the leading n_top_words.
        heaviest = weights.argsort()[::-1][:n_top_words]
        top_features = list(map(feature_names.__getitem__, heaviest))
        print(f"Topic {topic_idx}: {', '.join(top_features)}\n")

# List the 12 heaviest vocabulary terms of every fitted LDA topic.
vocabulary = vect.get_feature_names_out()
print_top_words(lda, vocabulary, n_top_words=12)

In [None]:
# Get topic per headline (dominant): the column index of the largest topic weight.
topic_dist = lda.transform(X)  # shape (n_articles, n_topics)
df['dominant_topic'] = topic_dist.argmax(axis=1)

# Topic share per stock: counts divided by the row total, so each row sums to 1.
# Written the same way as the publisher mix below.
stock_topic_counts = pd.crosstab(df['stock'], df['dominant_topic'])
topic_by_stock = stock_topic_counts.div(stock_topic_counts.sum(axis=1), axis=0)
# topic_by_stock.head()

# Publisher-topic mix, also row-normalised.
topic_by_publisher = pd.crosstab(df['publisher'], df['dominant_topic'])
topic_by_publisher_norm = topic_by_publisher.div(topic_by_publisher.sum(axis=1), axis=0)

# The 15 most prolific publishers. This name was referenced but never defined
# in the notebook, which raised NameError on a fresh kernel. value_counts()
# skips missing publishers, so every label exists in the crosstab index and
# .loc cannot hit a missing key.
top_publishers_list = df['publisher'].value_counts().head(15).index.tolist()
topic_by_publisher_norm.loc[top_publishers_list]  # inspect only top publishers

In [None]:

# Per-day article counts for each dominant topic, drawn as one subplot per
# topic on a shared x-axis. date_day is the UTC calendar day as a (naive)
# datetime column.
df['date_day'] = pd.to_datetime(df['date_utc'].dt.date)
daily_topic_sizes = df.groupby(['date_day','dominant_topic']).size()
topic_ts = daily_topic_sizes.unstack(fill_value=0)  # days x topics, 0 where a topic is absent
topic_ts.plot(subplots=True, sharex=True, figsize=(12, 2*n_topics))

In [None]:
# Pairwise correlation between the two headline-length features.
length_columns = ['headline_len_chars', 'headline_len_words']
num = df[length_columns]
num.corr()



In [None]:
# Canonicalise ticker symbols: upper-case them, then trim surrounding whitespace.
# NOTE(review): this runs after the per-stock topic crosstab in an earlier cell,
# so that table was built from the un-normalised symbols — consider moving this
# step ahead of it.
upper_symbols = df['stock'].str.upper()
df['stock'] = upper_symbols.str.strip()