In [1]:
import os
import time
import logging
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from dateutil import parser

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
print(torch.__version__)

2.6.0+cu124


In [3]:
import os
import time
import logging
import torch
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from dateutil import parser

# =====================================================
LOG_FILE = "processing_log.txt"
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

def log(msg, level="info"):
    tqdm.write(msg)
    if level == "error":
        logging.error(msg)
    elif level == "warning":
        logging.warning(msg)
    else:
        logging.info(msg)


# =====================================================
def normalize_date_column(df):
    def parse_date_safe(x):
        try:
            return parser.parse(str(x), dayfirst=False)
        except Exception:
            try:
                return parser.parse(str(x), dayfirst=True)
            except Exception:
                return None

    df["Date"] = df["Date"].apply(parse_date_safe)
    df = df.dropna(subset=["Date"])
    df = df.sort_values("Date").reset_index(drop=True)
    return df


def split_headlines(text):
    if pd.isna(text):
        return []
    return [t.strip() for t in str(text).split('|') if t.strip()]


# =====================================================
DATA_DIR = "/home/sunkari/Stock_price_predictor/Dataset"
OUTPUT_DIR = "./Processed"
os.makedirs(OUTPUT_DIR, exist_ok=True)

TRAIN_RATIO = 0.8  # 80% train, 20% test

# =====================================================
MODEL_NAME = "yiyanghkust/finbert-tone"
log(f"üîπ Loading model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()
log("‚úÖ Model loaded successfully.")


# =====================================================
def get_sentiment_scores(texts):
    if len(texts) == 0:
        return [0.0, 1.0, 0.0]  # Neutral default
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)
    return probs.mean(dim=0).numpy().tolist()


# =====================================================
start_time_total = time.time()

for file in os.listdir(DATA_DIR):
    if not file.endswith(".csv"):
        continue
    start_time = time.time()
    company_path = os.path.join(DATA_DIR, file)
    log(f"\nüîç Processing {file} ...")

    try:
        df = pd.read_csv(company_path)
        df.columns = df.columns.str.strip()
        log(f"üìÇ Loaded file with {len(df)} rows.")

        df = normalize_date_column(df)
        log(f"üóìÔ∏è Normalized dates, {len(df)} rows remain after cleaning.")

        # Split headlines into lists
        df["Headline_List"] = df["Headlines"].apply(split_headlines)

        # Compute daily sentiment
        sentiments = []
        for headlines in tqdm(df["Headline_List"].tolist(), desc=f"Sentiment {file}"):
            try:
                probs = get_sentiment_scores(headlines)
                sentiments.append(probs)
            except Exception as e:
                log(f"‚ö†Ô∏è Error processing headlines: {headlines[:3]}... | {e}", "warning")
                sentiments.append([0.0, 1.0, 0.0])  # default neutral

        sentiments = pd.DataFrame(sentiments, columns=["negative", "neutral", "positive"])
        df = pd.concat([df, sentiments], axis=1)

        # =====================================================
        # ‚úÇÔ∏è Split into Train and Test
        # =====================================================
        split_idx = int(len(df) * TRAIN_RATIO)
        train_df = df.iloc[:split_idx].reset_index(drop=True)
        test_df = df.iloc[split_idx:].reset_index(drop=True)

        # Save processed versions
        base_name = os.path.splitext(file)[0]
        train_out = os.path.join(OUTPUT_DIR, f"{base_name}_train.csv")
        test_out = os.path.join(OUTPUT_DIR, f"{base_name}_test.csv")

        train_df.to_csv(train_out, index=False)
        test_df.to_csv(test_out, index=False)

        elapsed = time.time() - start_time
        log(f"‚úÖ Saved train/test split for {file} | Train={len(train_df)}, Test={len(test_df)} | ‚è±Ô∏è {elapsed:.2f}s")

    except Exception as e:
        log(f"‚ùå Error processing {file}: {e}", "error")

total_time = time.time() - start_time_total
log(f"\nüèÅ All files processed successfully in {total_time/60:.2f} minutes.")

üîπ Loading model: yiyanghkust/finbert-tone
‚úÖ Model loaded successfully.

üîç Processing XOM_stock_gdelt_final.csv ...
üìÇ Loaded file with 1003 rows.
üóìÔ∏è Normalized dates, 1003 rows remain after cleaning.


Sentiment XOM_stock_gdelt_final.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1003/1003 [03:05<00:00,  5.40it/s]


‚úÖ Saved train/test split for XOM_stock_gdelt_final.csv | Train=802, Test=201 | ‚è±Ô∏è 185.98s

üîç Processing MSFT_stock_gdelt_final.csv ...
üìÇ Loaded file with 1003 rows.
üóìÔ∏è Normalized dates, 1003 rows remain after cleaning.


Sentiment MSFT_stock_gdelt_final.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1003/1003 [03:40<00:00,  4.56it/s]


‚úÖ Saved train/test split for MSFT_stock_gdelt_final.csv | Train=802, Test=201 | ‚è±Ô∏è 220.27s

üîç Processing V_stock_gdelt_final.csv ...
üìÇ Loaded file with 1003 rows.
üóìÔ∏è Normalized dates, 1003 rows remain after cleaning.


Sentiment V_stock_gdelt_final.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1003/1003 [02:36<00:00,  6.42it/s]


‚úÖ Saved train/test split for V_stock_gdelt_final.csv | Train=802, Test=201 | ‚è±Ô∏è 156.33s

üîç Processing PFE_stock_gdelt_final.csv ...
üìÇ Loaded file with 1003 rows.
üóìÔ∏è Normalized dates, 1003 rows remain after cleaning.


Sentiment PFE_stock_gdelt_final.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1003/1003 [04:06<00:00,  4.07it/s]


‚úÖ Saved train/test split for PFE_stock_gdelt_final.csv | Train=802, Test=201 | ‚è±Ô∏è 246.90s

üîç Processing NVDA_stock_gdelt_final.csv ...
üìÇ Loaded file with 1003 rows.
üóìÔ∏è Normalized dates, 1003 rows remain after cleaning.


Sentiment NVDA_stock_gdelt_final.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1003/1003 [03:21<00:00,  4.99it/s]


‚úÖ Saved train/test split for NVDA_stock_gdelt_final.csv | Train=802, Test=201 | ‚è±Ô∏è 201.23s

üîç Processing AMZN_stock_gdelt_final.csv ...
üìÇ Loaded file with 1003 rows.
üóìÔ∏è Normalized dates, 1003 rows remain after cleaning.


Sentiment AMZN_stock_gdelt_final.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1003/1003 [02:48<00:00,  5.95it/s]


‚úÖ Saved train/test split for AMZN_stock_gdelt_final.csv | Train=802, Test=201 | ‚è±Ô∏è 168.71s

üîç Processing GOOG_stock_gdelt_final.csv ...
üìÇ Loaded file with 1003 rows.
üóìÔ∏è Normalized dates, 1003 rows remain after cleaning.


Sentiment GOOG_stock_gdelt_final.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1003/1003 [03:42<00:00,  4.50it/s]


‚úÖ Saved train/test split for GOOG_stock_gdelt_final.csv | Train=802, Test=201 | ‚è±Ô∏è 222.84s

üîç Processing TSLA_stock_gdelt_final.csv ...
üìÇ Loaded file with 1003 rows.
üóìÔ∏è Normalized dates, 1003 rows remain after cleaning.


Sentiment TSLA_stock_gdelt_final.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1003/1003 [04:32<00:00,  3.68it/s]


‚úÖ Saved train/test split for TSLA_stock_gdelt_final.csv | Train=802, Test=201 | ‚è±Ô∏è 273.02s

üîç Processing JPM_stock_gdelt_final.csv ...
üìÇ Loaded file with 1003 rows.
üóìÔ∏è Normalized dates, 1003 rows remain after cleaning.


Sentiment JPM_stock_gdelt_final.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1003/1003 [03:39<00:00,  4.57it/s]


‚úÖ Saved train/test split for JPM_stock_gdelt_final.csv | Train=802, Test=201 | ‚è±Ô∏è 219.45s

üîç Processing AAPL_stock_gdelt_final.csv ...
üìÇ Loaded file with 1003 rows.
üóìÔ∏è Normalized dates, 1003 rows remain after cleaning.


Sentiment AAPL_stock_gdelt_final.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1003/1003 [03:02<00:00,  5.51it/s]


‚úÖ Saved train/test split for AAPL_stock_gdelt_final.csv | Train=802, Test=201 | ‚è±Ô∏è 182.31s

üèÅ All files processed successfully in 34.62 minutes.
