In [15]:
!pip install  numpy yfinance pandas pandas-ta matplotlib xgboost nltk  --upgrade scipy scikit-learn 

Defaulting to user installation because normal site-packages is not writeable
Collecting numpy
  Using cached numpy-2.4.0-cp313-cp313-win_amd64.whl.metadata (6.6 kB)
Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting click (from nltk)
  Downloading click-8.3.1-py3-none-any.whl.metadata (2.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.11.3-cp313-cp313-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 13.1 MB/s  0:00:00
Downloading regex-2025.11.3-cp313-cp313-win_amd64.whl (277 kB)
Downloading click-8.3.1-py3-none-any.whl (108 kB)
Installing collected packages: regex, click, nltk

   -------------------------- ------------- 2/3 [nltk]
   -------------------------- ------------- 2/3 [nltk]
   -------------------------- ------------- 2/3 [nltk]
   -------------------------- -----


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\prana\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [16]:
import os
import requests
import time
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from pathlib import Path
import numpy as np
import joblib
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cdist
from sklearn.metrics import accuracy_score
import xgboost as xgb
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [6]:
TOP_50_TICKERS = [
    "NVDA", "AAPL", "GOOG", "MSFT", "AMZN", "META", "TSLA", "AVGO", "2222.SR", "TSM",
    "BRK-B", "LLY", "WMT", "JPM", "TCEHY", "V", "ORCL", "MA", "005930.KS", "XOM",
    "JNJ", "PLTR", "BAC", "ASML", "ABBV", "NFLX", "601288.SS", "COST", "MC.PA", "BABA",
    "1398.HK", "AMD", "HD", "601939.SS", "ROG.SW", "PG", "GE", "MU", "CSCO", "KO",
    "WFC", "CVX", "UNH", "MS", "SAP", "TM", "AZN", "IBM", "CAT", "000660.KS"
]

current_dir = Path.cwd()
project_root = current_dir if current_dir.name != 'Notebooks' else current_dir.parent
raw_data_path = project_root / "Data" 
raw_data_path.mkdir(parents=True, exist_ok=True)

def fetch_tickers_in_batches(tickers, batch_size=10, period="6mo"):
    """Downloads tickers in small batches to prevent connection timeouts."""
    all_data = []
    
    for i in range(0, len(tickers), batch_size):
        batch = tickers[i:i + batch_size]
        print(f"Downloading batch: {batch}")
        
        # Download batch with auto_adjust as per report 
        data = yf.download(batch, period=period, group_by='ticker', auto_adjust=True, threads=True)
        
        if not data.empty:
            all_data.append(data)
        
        # Brief pause to respect API limits and server load
        time.sleep(1)
        
    return pd.concat(all_data, axis=1) if all_data else pd.DataFrame()

def robust_downloader(tickers, period="6mo", max_retries=3):
    print(f"Initiating resilient download for {len(tickers)} tickers...")
    
    # 1. Initial Batch Download
    df = fetch_tickers_in_batches(tickers, batch_size=15, period=period)
    
    # 2. Identify and Retry Failures
    for attempt in range(max_retries):
        existing_tickers = df.columns.get_level_values(0).unique()
        failed_tickers = [t for t in tickers if t not in existing_tickers or df[t].isnull().all().all()]
        
        if not failed_tickers:
            print("All tickers downloaded successfully.")
            break
            
        # Exponential backoff: sleep longer on each failure 
        wait_time = (attempt + 1) * 5
        print(f"Attempt {attempt + 1}/{max_retries}: {len(failed_tickers)} failures. Retrying in {wait_time}s...")
        time.sleep(wait_time)
        
        # Retry failures one by one for maximum stability
        for ticker in failed_tickers:
            try:
                retry_data = yf.download(ticker, period=period, auto_adjust=True)
                if not retry_data.empty:
                    # Align columns with multi-index structure
                    retry_data.columns = pd.MultiIndex.from_product([[ticker], retry_data.columns])
                    df = pd.concat([df, retry_data], axis=1)
            except Exception as e:
                print(f"Failed again for {ticker}: {e}")

    # 3. Quality Assurance: Automated Interpolation [cite: 384, 480]
    # This fills small gaps (holidays, glitches) ensuring indicator stability [cite: 386]
    df = df.sort_index().interpolate(method='time').ffill().bfill()
    
    # Final check
    missing_final = [t for t in tickers if t not in df.columns.get_level_values(0).unique()]
    if missing_final:
        print(f"CRITICAL: Data missing for {missing_final}")
    
    # Save the synchronized raw data [cite: 371]
    file_path = raw_data_path / "market_data_raw.csv"
    df.to_csv(file_path)
    print(f"Data ingestion complete. File saved: {file_path}")
    return df

# Run the updated downloader
raw_df = robust_downloader(TOP_50_TICKERS)

Initiating resilient download for 50 tickers...
Downloading batch: ['NVDA', 'AAPL', 'GOOG', 'MSFT', 'AMZN', 'META', 'TSLA', 'AVGO', '2222.SR', 'TSM', 'BRK-B', 'LLY', 'WMT', 'JPM', 'TCEHY']


[**********************80%*************          ]  12 of 15 completedFailed to get ticker 'BRK-B' reason: Failed to perform, curl: (28) Connection timed out after 10001 milliseconds. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.
Failed to get ticker 'AMZN' reason: Failed to perform, curl: (28) Connection timed out after 10005 milliseconds. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.
[*********************100%***********************]  14 of 15 completed

1 Failed download:
['AMZN']: SSLError('Failed to perform, curl: (35) TLS connect error: error:00000000:invalid library (0):OPENSSL_internal:invalid library (0). See https://curl.se/libcurl/c/libcurl-errors.html first for more details.')
[*********************100%***********************]  14 of 15 completed

Downloading batch: ['V', 'ORCL', 'MA', '005930.KS', 'XOM', 'JNJ', 'PLTR', 'BAC', 'ASML', 'ABBV', 'NFLX', '601288.SS', 'COST', 'MC.PA', 'BABA']


[**********************87%*****************      ]  13 of 15 completedFailed to get ticker 'MA' reason: Failed to perform, curl: (28) Connection timed out after 10005 milliseconds. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.
[*********************100%***********************]  15 of 15 completed


Downloading batch: ['1398.HK', 'AMD', 'HD', '601939.SS', 'ROG.SW', 'PG', 'GE', 'MU', 'CSCO', 'KO', 'WFC', 'CVX', 'UNH', 'MS', 'SAP']


[*********************100%***********************]  15 of 15 completed


Downloading batch: ['TM', 'AZN', 'IBM', 'CAT', '000660.KS']


[*********************100%***********************]  5 of 5 completed


Attempt 1/3: 1 failures. Retrying in 5s...


[*********************100%***********************]  1 of 1 completed


Failed again for AMZN: isna is not defined for MultiIndex
Attempt 2/3: 1 failures. Retrying in 10s...


[*********************100%***********************]  1 of 1 completed


Failed again for AMZN: isna is not defined for MultiIndex
Attempt 3/3: 1 failures. Retrying in 15s...


[*********************100%***********************]  1 of 1 completed

Failed again for AMZN: isna is not defined for MultiIndex
Data ingestion complete. File saved: d:\VSCode\Projects\Stock-Market-Trend-Analysis\Data\market_data_raw.csv





In [7]:
# Feature engineering 
current_dir = Path.cwd()
project_root = current_dir if current_dir.name != 'Notebooks' else current_dir.parent
raw_file = project_root / "Data" / "market_data_raw.csv"
proc_dir = project_root / "Data" 
proc_dir.mkdir(parents=True, exist_ok=True)

def calculate_indicators(df):
    """Manually calculates the Top 5 indicators from Appendix 11.1"""
    # Rank 1: RSI_14 (Momentum) [cite: 513]
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI_14'] = 100 - (100 / (1 + rs))

    # Rank 2: SMA Ratio 5/20 (Trend) [cite: 513]
    df['SMA_5'] = df['Close'].rolling(window=5).mean()
    df['SMA_20'] = df['Close'].rolling(window=20).mean()
    df['SMA_Ratio_5_20'] = df['SMA_5'] / df['SMA_20']

    # Rank 3: MACD Histogram (Trend) [cite: 513]
    ema12 = df['Close'].ewm(span=12, adjust=False).mean()
    ema26 = df['Close'].ewm(span=26, adjust=False).mean()
    df['MACD'] = ema12 - ema26
    df['Signal'] = df['MACD'].ewm(span=9, adjust=False).mean()
    df['MACD_Histogram'] = df['MACD'] - df['Signal']

    # Rank 4: Bollinger Bands Position (Volatility) [cite: 513]
    std = df['Close'].rolling(window=20).std()
    df['BB_Upper'] = df['SMA_20'] + (std * 2)
    df['BB_Lower'] = df['SMA_20'] - (std * 2)
    df['BB_Position'] = (df['Close'] - df['BB_Lower']) / (df['BB_Upper'] - df['BB_Lower'])

    # Rank 5: Volume Ratio to 20-day SMA (Volume) [cite: 513]
    df['Vol_SMA_20'] = df['Volume'].rolling(window=20).mean()
    df['Volume_Ratio'] = df['Volume'] / df['Vol_SMA_20']

    return df.dropna()

print("Loading raw data and calculating features...")
df_raw = pd.read_csv(raw_file, header=[0, 1], index_col=0, parse_dates=True)
tickers = df_raw.columns.get_level_values(0).unique()

all_processed = []
for ticker in tickers:
    try:
        stock_data = df_raw[ticker].copy()
        if stock_data.empty or len(stock_data) < 50:
            continue 

        processed_stock = calculate_indicators(stock_data)
        processed_stock.loc[:, 'Ticker'] = ticker
        all_processed.append(processed_stock)
    except Exception as e:
        print(f"Skipping {ticker}: Not enough valid data found.")

final_df = pd.concat(all_processed)
final_df.to_csv(proc_dir / "market_data_features.csv")
print(f"Processed file location: {proc_dir / 'market_data_features.csv'}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_stock.loc[:, 'Ticker'] = ticker
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_stock.loc[:, 'Ticker'] = ticker
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_stock.loc[:, 'Ticker'] = ticker
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

Loading raw data and calculating features...
Skipping AMZN: Not enough valid data found.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_stock.loc[:, 'Ticker'] = ticker
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_stock.loc[:, 'Ticker'] = ticker
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_stock.loc[:, 'Ticker'] = ticker
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

Processed file location: d:\VSCode\Projects\Stock-Market-Trend-Analysis\Data\market_data_features.csv


In [8]:
anomaly_df = pd.read_csv(proc_dir / "market_data_features.csv")
model_root = project_root / "Models"
model_root.mkdir(parents=True, exist_ok=True)

features = [
    'RSI_14',
    'SMA_Ratio_5_20',
    'MACD_Histogram',
    'BB_Position',
    'Volume_Ratio'
]

clean_anomaly_df = anomaly_df.dropna(subset=features).copy()

iso_model = IsolationForest(
    n_estimators=100, 
    max_samples='auto', 
    contamination=0.05, 
    random_state=42, 
    max_features=len(features), 
    n_jobs=-1
)

clean_anomaly_df['Anomaly_Flag'] = iso_model.fit_predict(clean_anomaly_df[features])
clean_anomaly_df['Is_Anomaly'] = clean_anomaly_df['Anomaly_Flag'].apply(lambda x: 1 if x == -1 else 0)

joblib.dump(iso_model, model_root / "isolation_forest_model.pkl")
clean_anomaly_df.to_csv(proc_dir / "market_data_anomalies.csv", index=False)

print(f"Detected {clean_anomaly_df['Is_Anomaly'].sum()} anomalies across all tickers.")

Detected 348 anomalies across all tickers.


In [9]:
clustering_df = pd.read_csv(proc_dir / "market_data_anomalies.csv")
clean_df = clustering_df.dropna(subset=features).copy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(clean_df[features])

optimal_k = 5
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clean_df['Cluster'] = kmeans.fit_predict(X_scaled)

model_root = project_root / "Models"
model_root.mkdir(parents=True, exist_ok=True)
joblib.dump(kmeans, model_root / "kmeans_model.pkl")
joblib.dump(scaler, model_root / "scaler.pkl")
clean_df.to_csv(proc_dir / "market_data_clusters.csv", index=False)

def find_similar_tickers(query_features_dict, df, kmeans_model, scaler_model, top_n=5):
    query_df = pd.DataFrame([query_features_dict])
    query_scaled = scaler_model.transform(query_df[features])
    
    query_cluster = kmeans_model.predict(query_scaled)[0]
    
    latest_states = df.sort_values('Date').groupby('Ticker').last().reset_index()
    cluster_peers = latest_states[latest_states['Cluster'] == query_cluster].copy()
    
    if cluster_peers.empty:
        return pd.DataFrame()
        
    peer_features_scaled = scaler_model.transform(cluster_peers[features])
    distances = cdist(query_scaled, peer_features_scaled, metric='euclidean')[0]
    
    cluster_peers['Distance'] = distances
    return cluster_peers.nsmallest(top_n, 'Distance')[['Ticker', 'Cluster', 'Distance'] + features]

query_features = {
    'RSI_14': 65.5, 
    'SMA_Ratio_5_20': 1.02, 
    'MACD_Histogram': 0.5,
    'BB_Position': 0.7, 
    'Volume_Ratio': 1.3
}

similar = find_similar_tickers(query_features, clean_df, kmeans, scaler, top_n=3)
print(similar)

   Ticker  Cluster  Distance     RSI_14  SMA_Ratio_5_20  MACD_Histogram  \
28  MC.PA        2  0.599815  63.760157        1.012284        0.800818   
15    CAT        2  1.223403  81.320726        0.996986        0.015378   
34   NVDA        2  1.294737  82.959137        1.024854        0.608714   

    BB_Position  Volume_Ratio  
28     0.867614      1.233367  
15     0.763900      1.078673  
34     0.725977      0.938105  


In [14]:
df = pd.read_csv(proc_dir / "market_data_clusters.csv")
df['Target'] = (df.groupby('Ticker')['Close'].shift(-1) > df['Close']).astype(int)
features = [
    'RSI_14', 
    'MACD_Histogram',
    'BB_Position', 
    'SMA_Ratio_5_20',
    'Volume_Ratio', 
    'Is_Anomaly', 
    'Cluster'
]
clean_df = df.dropna(subset=['Target'] + features).copy()
split_idx = int(len(clean_df) * 0.8)
train_df = clean_df.iloc[:split_idx]
test_df = clean_df.iloc[split_idx:]

X_train, y_train = train_df[features], train_df['Target']
X_test, y_test = test_df[features], test_df['Target']

xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    eval_metric='logloss'
)

xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

y_pred = xgb_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Directional Accuracy: {acc:.2%}")

joblib.dump(xgb_model, model_root / "trend_model.pkl")

Directional Accuracy: 53.30%


['d:\\VSCode\\Projects\\Stock-Market-Trend-Analysis\\Models\\trend_model.pkl']

In [19]:
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

def get_live_sentiment(ticker):
    stock = yf.Ticker(ticker)
    news = stock.news
    opinions = []
    total_score = 0
    
    for article in news[:5]:
        title = article['title']
        score = sia.polarity_scores(title)['compound']
        
        label = "Bullish" if score > 0.05 else "Bearish" if score < -0.05 else "Neutral"
        
        opinions.append({
            'headline': title,
            'sentiment': label,
            'score': score
        })
        total_score += score
        
    avg_score = total_score / len(news) if news else 0
    return avg_score, opinions

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
