In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from dateutil import parser
import torch
from transformers import AutoTokenizer, AutoModel
import os, glob, pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pip install -q transformers torch pandas tqdm python-dateutil joblib

In [16]:
# Folder containing the raw input CSV files that the processing loop reads.
# NOTE(review): absolute, machine-specific path — the notebook will not run on
# another machine without editing it; consider making it configurable.
RAW_FOLDER = "/home/sunkari/Stock_price_predictor/Dataset"
# Destination for the processed "*_merged.csv" files; created if missing.
OUTPUT_FOLDER = "./processed_datasets"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Tokenizer and model are both loaded from the "ProsusAI/finbert" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
# AutoModel loads the bare encoder (no classification head); get_emb reads its
# last_hidden_state.
model = AutoModel.from_pretrained(
    "ProsusAI/finbert",
    use_safetensors=True
).to(DEVICE)
# Put the model in evaluation mode: it is only used for feature extraction here.
model.eval()

def normalize_date_column(df):
    """Parse the ``Date`` column, drop rows whose date is unusable, and sort by date.

    Every value is stringified and parsed with ``dateutil``: month-first is
    tried first and day-first is used as a fallback. Missing values and values
    that fail both attempts become ``NaT`` so that the subsequent ``dropna``
    removes those rows instead of the whole call raising.

    Args:
        df: DataFrame that contains a ``Date`` column.

    Returns:
        A new DataFrame sorted ascending by ``Date`` with a fresh 0..n-1
        index. The frame passed in is left unmodified.

    Raises:
        KeyError: if ``df`` has no ``Date`` column.
    """
    def parse_date_safe(x):
        # Missing cells would otherwise be stringified to "nan" and fail to parse.
        if pd.isna(x):
            return pd.NaT
        text = str(x)
        for dayfirst in (False, True):
            try:
                return parser.parse(text, dayfirst=dayfirst)
            except (ValueError, OverflowError):
                # dateutil's ParserError subclasses ValueError; try the next
                # interpretation, or fall through to NaT below.
                continue
        return pd.NaT

    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    df = df.assign(Date=df["Date"].apply(parse_date_safe))
    df = df.dropna(subset=["Date"])
    df = df.sort_values("Date").reset_index(drop=True)
    return df

def split_headlines(text):
    """Split a ``'|'``-separated headline string into a list of headlines.

    Args:
        text: The raw cell value. It is stringified before splitting, so
            non-string scalars are accepted.

    Returns:
        The non-empty pieces with surrounding whitespace removed, in their
        original order. A missing value (``None``/``NaN``) yields ``[]``.
    """
    if pd.isna(text):
        return []
    # Trim every piece first, then discard the ones that end up empty.
    trimmed = (piece.strip() for piece in str(text).split('|'))
    return [piece for piece in trimmed if piece]

def get_emb(headlines):
    """Embed a list of headlines and average the result.

    Each non-blank headline is tokenized and passed through the module-level
    ``model`` on its own; the hidden state at position 0 (the CLS position) is
    taken as that headline's vector. The per-headline "score" is simply the
    mean of that vector's components.

    Args:
        headlines: List of headline strings. Entries that are NaN or contain
            only whitespace are ignored.

    Returns:
        A ``(embedding, score)`` tuple: the element-wise mean of the headline
        vectors and the mean of the per-headline scores. When ``headlines`` is
        empty/None, or every entry is blank, a neutral result of a 768-dim
        float32 zero vector and ``0.0`` is returned.
    """
    if not headlines:
        return np.zeros(768, dtype=np.float32), 0.0  # neutral embedding & score

    cls_vectors = []
    cls_means = []
    for headline in headlines:
        is_blank = pd.isna(headline) or headline.strip() == ""
        if not is_blank:
            encoded = tokenizer(
                headline, truncation=True, padding=True, return_tensors="pt"
            ).to(DEVICE)
            # Inference only: no gradients are needed.
            with torch.no_grad():
                result = model(**encoded)
            # Position 0 of the sequence; the mean over dim 0 collapses the batch axis.
            cls_hidden = result.last_hidden_state[:, 0, :]
            vector = cls_hidden.mean(dim=0).cpu().numpy()
            cls_vectors.append(vector)
            cls_means.append(vector.mean())

    # Every entry was skipped, so fall back to the neutral result.
    if len(cls_vectors) == 0:
        return np.zeros(768, dtype=np.float32), 0.0

    mean_embedding = np.mean(cls_vectors, axis=0)
    mean_score = np.mean(cls_means)
    return mean_embedding, mean_score


# Process every CSV in RAW_FOLDER: normalize its dates, embed each row's
# headlines, and write the result to OUTPUT_FOLDER as "<name>_merged.csv".
files = [f for f in os.listdir(RAW_FOLDER) if f.endswith(".csv")]
for f in tqdm(files):
    df = pd.read_csv(os.path.join(RAW_FOLDER, f))
    df.columns = df.columns.str.strip().str.replace('\ufeff','')  # trim whitespace and remove BOM characters from column names
    df = normalize_date_column(df)
    # Turn each '|'-separated "Headlines" cell into a list of headline strings.
    df["Headline_List"] = df["Headlines"].apply(split_headlines)

    # Collect one (embedding, score) pair per row, in row order.
    embs, scores = [], []
    for hl in tqdm(df["Headline_List"], leave=False):
        e, s = get_emb(hl)
        embs.append(e)
        scores.append(s)

    # Expand the 768-dim vectors into columns emb_0 .. emb_767.
    emb_df = pd.DataFrame(embs, columns=[f"emb_{i}" for i in range(768)])
    df["sentiment_score"] = scores
    # Column-wise concat aligns on the index: normalize_date_column reset df's
    # index to 0..n-1, which matches emb_df's default index.
    df_out = pd.concat([df.drop(columns=["Headline_List"]), emb_df], axis=1)
    # NOTE(review): str.replace substitutes every ".csv" occurrence in the
    # file name, not only the extension.
    df_out.to_csv(os.path.join(OUTPUT_FOLDER, f.replace(".csv", "_merged.csv")), index=False)
print("✅ Saved processed CSVs with neutral embeddings")


100%|██████████| 10/10 [04:23<00:00, 26.40s/it]

✅ Saved processed CSVs with neutral embeddings





In [1]:
import os, glob, pickle, random
import numpy as np
import pandas as pd
from tqdm import tqdm

# Build sliding-window samples from every processed CSV and save an 80/20
# train/test split as pickles.
WINDOW_SIZE = 8   # rows per input window; the target is the Close of the row right after it
SEED = 42         # seeds the shuffle so the train/test split is identical on every run
INPUT_FOLDER = "./processed_datasets"
SAVE_FOLDER = "./windows"
os.makedirs(SAVE_FOLDER, exist_ok=True)

# glob returns paths in filesystem order; sort them so the seeded shuffle below
# always sees the windows in the same sequence.
files = sorted(glob.glob(os.path.join(INPUT_FOLDER, "*_merged.csv")))
all_windows, companies = [], []

for f in tqdm(files):
    df = pd.read_csv(f, parse_dates=["Date"])

    # Close is the prediction target, so check for it before doing any other work.
    if "Close" not in df.columns:
        print(f"⚠️ Skipping {f}: 'Close' column missing.")
        continue
    # A header-only file has no first row to read the ticker from.
    if df.empty:
        print(f"⚠️ Skipping {f}: file has no rows.")
        continue

    # Prefer the Ticker column; otherwise use the file-name prefix before the first "_".
    ticker = df["Ticker"].iloc[0] if "Ticker" in df.columns else os.path.basename(f).split("_")[0]

    # ✅ Keep only numeric columns (drops text like Headlines, Company, etc.)
    numeric_df = df.select_dtypes(include=[np.number])
    feat_cols = numeric_df.columns.tolist()

    # Convert to arrays once per file instead of once per window.
    feature_matrix = numeric_df.to_numpy(dtype=np.float32)
    close_values = df["Close"].to_numpy()

    for i in range(len(df) - WINDOW_SIZE):
        # copy() gives each window its own buffer rather than a view into the
        # shared per-file matrix.
        X = feature_matrix[i:i+WINDOW_SIZE].copy()
        y = float(close_values[i+WINDOW_SIZE])
        all_windows.append((X, y, ticker))

    companies.append(ticker)

# NOTE(review): consecutive windows overlap (stride 1), so shuffling before the
# split puts windows that share rows on both sides of the train/test boundary.
# Consider a chronological split if this is used to estimate forecast accuracy.
random.Random(SEED).shuffle(all_windows)
split = int(0.8 * len(all_windows))
train_windows = all_windows[:split]
test_windows = all_windows[split:]

# Context managers make sure each pickle file is flushed and closed.
with open(f"{SAVE_FOLDER}/train_windows.pkl", "wb") as fh:
    pickle.dump(train_windows, fh)
with open(f"{SAVE_FOLDER}/test_windows.pkl", "wb") as fh:
    pickle.dump(test_windows, fh)
with open(f"{SAVE_FOLDER}/company_list.pkl", "wb") as fh:
    pickle.dump(sorted(list(set(companies))), fh)
print("✅ Saved train/test windows in", SAVE_FOLDER)

100%|██████████| 10/10 [00:07<00:00,  1.39it/s]


✅ Saved train/test windows in ./windows
