In [1]:
!pip install numpy yfinance pandas matplotlib xgboost nltk --upgrade scipy scikit-learn



In [2]:
import os
import requests
import time
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import yfinance as yf
from pathlib import Path
import numpy as np
import joblib
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cdist
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [3]:
TOP_50_TICKERS = [
    "NVDA", "AAPL", "GOOG", "MSFT", "AMZN", "META", "TSLA", "AVGO", "2222.SR", "TSM",
    "BRK-B", "LLY", "WMT", "JPM", "TCEHY", "V", "ORCL", "MA", "005930.KS", "XOM",
    "JNJ", "PLTR", "BAC", "ASML", "ABBV", "NFLX", "601288.SS", "COST", "MC.PA", "BABA",
    "1398.HK", "AMD", "HD", "601939.SS", "ROG.SW", "PG", "GE", "MU", "CSCO", "KO",
    "WFC", "CVX", "UNH", "MS", "SAP", "TM", "AZN", "IBM", "CAT", "000660.KS"
]

current_dir = Path.cwd()
project_root = current_dir if current_dir.name != 'Notebooks' else current_dir.parent
raw_data_path = project_root / "Data" 
raw_data_path.mkdir(parents=True, exist_ok=True)

def fetch_tickers_in_batches(tickers, batch_size=10, period="6mo"):
    all_data = []
    for i in range(0, len(tickers), batch_size):
        batch = tickers[i:i + batch_size]
        print(f"Downloading batch: {batch}")
        try:
            data = yf.download(batch, period=period, group_by='ticker', auto_adjust=True, threads=True)
            if not data.empty:
                all_data.append(data)
        except Exception as e:
            print(f"Batch download error: {e}")
        time.sleep(1)
    return pd.concat(all_data, axis=1) if all_data else pd.DataFrame()

def robust_downloader(tickers, period="6mo", max_retries=3):
    print(f"Initiating resilient download for {len(tickers)} tickers...")
    df = fetch_tickers_in_batches(tickers, batch_size=15, period=period)
    
    for attempt in range(max_retries):
        existing_tickers = df.columns.get_level_values(0).unique().tolist()
        failed_tickers = [t for t in tickers if t not in existing_tickers]
        
        if not failed_tickers:
            print("All tickers downloaded successfully.")
            break
            
        wait_time = (attempt + 1) * 5
        print(f"Attempt {attempt + 1}/{max_retries}: {len(failed_tickers)} failures. Retrying in {wait_time}s...")
        time.sleep(wait_time)
        
        for ticker in failed_tickers:
            try:
                retry_data = yf.download(ticker, period=period, auto_adjust=True, progress=False)
                if not retry_data.empty and len(retry_data) > 20:
                    if not isinstance(retry_data.columns, pd.MultiIndex):
                        retry_data.columns = pd.MultiIndex.from_product([[ticker], retry_data.columns])
                    df = pd.concat([df, retry_data], axis=1)
                    print(f"✓ Successfully retrieved {ticker}")
            except Exception as e:
                print(f"✗ Failed for {ticker}: {str(e)[:50]}")

    df = df.sort_index().interpolate(method='time').ffill().bfill()
    
    missing_final = [t for t in tickers if t not in df.columns.get_level_values(0).unique()]
    if missing_final:
        print(f"⚠ CRITICAL: Still missing {len(missing_final)} tickers: {missing_final}")
    
    file_path = raw_data_path / "market_data_raw.csv"
    df.to_csv(file_path)
    print(f"✓ Data ingestion complete. File saved: {file_path}")
    print(f"✓ Successfully collected {df.columns.get_level_values(0).nunique()} tickers")
    return df

raw_df = robust_downloader(TOP_50_TICKERS)

Initiating resilient download for 50 tickers...
Downloading batch: ['NVDA', 'AAPL', 'GOOG', 'MSFT', 'AMZN', 'META', 'TSLA', 'AVGO', '2222.SR', 'TSM', 'BRK-B', 'LLY', 'WMT', 'JPM', 'TCEHY']


[*********************100%***********************]  15 of 15 completed


Downloading batch: ['V', 'ORCL', 'MA', '005930.KS', 'XOM', 'JNJ', 'PLTR', 'BAC', 'ASML', 'ABBV', 'NFLX', '601288.SS', 'COST', 'MC.PA', 'BABA']


[*********************100%***********************]  15 of 15 completed
[******                13%                       ]  2 of 15 completed

Downloading batch: ['1398.HK', 'AMD', 'HD', '601939.SS', 'ROG.SW', 'PG', 'GE', 'MU', 'CSCO', 'KO', 'WFC', 'CVX', 'UNH', 'MS', 'SAP']


[*********************100%***********************]  15 of 15 completed


Downloading batch: ['TM', 'AZN', 'IBM', 'CAT', '000660.KS']


[*********************100%***********************]  5 of 5 completed


All tickers downloaded successfully.
✓ Data ingestion complete. File saved: d:\VSCode\Projects\Stock-Market-Trend-Analysis\Data\market_data_raw.csv
✓ Successfully collected 50 tickers


In [4]:
current_dir = Path.cwd()
project_root = current_dir if current_dir.name != 'Notebooks' else current_dir.parent
raw_file = project_root / "Data" / "market_data_raw.csv"
proc_dir = project_root / "Data" 
proc_dir.mkdir(parents=True, exist_ok=True)

def calculate_indicators(df):
    """Calculates technical indicators"""
    df = df.copy()  # Avoid SettingWithCopyWarning
    
    # RSI_14
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI_14'] = 100 - (100 / (1 + rs))

    # SMA Ratio 5/20
    df['SMA_5'] = df['Close'].rolling(window=5).mean()
    df['SMA_20'] = df['Close'].rolling(window=20).mean()
    df['SMA_Ratio_5_20'] = df['SMA_5'] / df['SMA_20']

    # MACD Histogram
    ema12 = df['Close'].ewm(span=12, adjust=False).mean()
    ema26 = df['Close'].ewm(span=26, adjust=False).mean()
    df['MACD'] = ema12 - ema26
    df['Signal'] = df['MACD'].ewm(span=9, adjust=False).mean()
    df['MACD_Histogram'] = df['MACD'] - df['Signal']

    # Bollinger Bands Position
    std = df['Close'].rolling(window=20).std()
    df['BB_Upper'] = df['SMA_20'] + (std * 2)
    df['BB_Lower'] = df['SMA_20'] - (std * 2)
    df['BB_Position'] = (df['Close'] - df['BB_Lower']) / (df['BB_Upper'] - df['BB_Lower'])

    # Volume Ratio
    df['Vol_SMA_20'] = df['Volume'].rolling(window=20).mean()
    df['Volume_Ratio'] = df['Volume'] / df['Vol_SMA_20']

    return df.dropna()

print("Loading raw data and calculating features...")
df_raw = pd.read_csv(raw_file, header=[0, 1], index_col=0, parse_dates=True)
tickers = df_raw.columns.get_level_values(0).unique()

all_processed = []
for ticker in tickers:
    try:
        stock_data = df_raw[ticker].copy()
        if stock_data.empty or len(stock_data) < 50:
            print(f"⚠ Skipping {ticker}: Insufficient data ({len(stock_data)} days)")
            continue 

        processed_stock = calculate_indicators(stock_data)
        processed_stock['Ticker'] = ticker  # No more SettingWithCopyWarning
        all_processed.append(processed_stock)
        print(f"✓ Processed {ticker}: {len(processed_stock)} days")
    except Exception as e:
        print(f"✗ Error processing {ticker}: {str(e)[:50]}")

final_df = pd.concat(all_processed)
final_df.to_csv(proc_dir / "market_data_features.csv")
print(f"\n✓ Total: {len(final_df)} rows across {final_df['Ticker'].nunique()} tickers")
print(f"✓ File saved: {proc_dir / 'market_data_features.csv'}")

Loading raw data and calculating features...
✓ Processed AMZN: 141 days
✓ Processed GOOG: 141 days
✓ Processed TSLA: 141 days
✓ Processed MSFT: 141 days
✓ Processed TCEHY: 141 days
✓ Processed META: 141 days
✓ Processed NVDA: 141 days
✓ Processed AVGO: 141 days
✓ Processed BRK-B: 141 days
✓ Processed LLY: 141 days
✓ Processed 2222.SR: 141 days
✓ Processed AAPL: 141 days
✓ Processed WMT: 141 days
✓ Processed JPM: 141 days
✓ Processed TSM: 141 days
✓ Processed ORCL: 141 days
✓ Processed 601288.SS: 141 days
✓ Processed BABA: 141 days
✓ Processed NFLX: 141 days
✓ Processed V: 141 days
✓ Processed MA: 141 days
✓ Processed 005930.KS: 141 days
✓ Processed PLTR: 141 days
✓ Processed ABBV: 141 days
✓ Processed MC.PA: 141 days
✓ Processed BAC: 141 days
✓ Processed ASML: 141 days
✓ Processed XOM: 141 days
✓ Processed COST: 141 days
✓ Processed JNJ: 141 days
✓ Processed 1398.HK: 141 days
✓ Processed 601939.SS: 141 days
✓ Processed HD: 141 days
✓ Processed CSCO: 141 days
✓ Processed AMD: 141 days
✓

In [5]:
anomaly_df = pd.read_csv(proc_dir / "market_data_features.csv")
model_root = project_root / "Models"
model_root.mkdir(parents=True, exist_ok=True)

features = [
    'RSI_14',
    'SMA_Ratio_5_20',
    'MACD_Histogram',
    'BB_Position',
    'Volume_Ratio'
]

clean_anomaly_df = anomaly_df.dropna(subset=features).copy()

iso_model = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.10, random_state=42, max_features=len(features), n_jobs=-1)

clean_anomaly_df['Anomaly_Flag'] = iso_model.fit_predict(clean_anomaly_df[features])
clean_anomaly_df['Is_Anomaly'] = clean_anomaly_df['Anomaly_Flag'].apply(lambda x: 1 if x == -1 else 0)

joblib.dump(iso_model, model_root / "isolation_forest_model.pkl")
clean_anomaly_df.to_csv(proc_dir / "market_data_anomalies.csv", index=False)

print(f"Detected {clean_anomaly_df['Is_Anomaly'].sum()} anomalies across all tickers.")

Detected 705 anomalies across all tickers.


In [6]:
clustering_df = pd.read_csv(proc_dir / "market_data_anomalies.csv")
clean_df = clustering_df.dropna(subset=features).copy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(clean_df[features])

optimal_k = 5
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clean_df['Cluster'] = kmeans.fit_predict(X_scaled)

model_root = project_root / "Models"
model_root.mkdir(parents=True, exist_ok=True)
joblib.dump(kmeans, model_root / "kmeans_model.pkl")
joblib.dump(scaler, model_root / "scaler.pkl")
clean_df.to_csv(proc_dir / "market_data_clusters.csv", index=False)

def find_similar_tickers(query_features_dict, df, kmeans_model, scaler_model, top_n=5):
    query_df = pd.DataFrame([query_features_dict])
    query_scaled = scaler_model.transform(query_df[features])
    
    query_cluster = kmeans_model.predict(query_scaled)[0]
    
    latest_states = df.sort_values('Date').groupby('Ticker').last().reset_index()
    cluster_peers = latest_states[latest_states['Cluster'] == query_cluster].copy()
    
    if cluster_peers.empty:
        return pd.DataFrame()
        
    peer_features_scaled = scaler_model.transform(cluster_peers[features])
    distances = cdist(query_scaled, peer_features_scaled, metric='euclidean')[0]
    
    cluster_peers['Distance'] = distances
    return cluster_peers.nsmallest(top_n, 'Distance')[['Ticker', 'Cluster', 'Distance'] + features]

query_features = {
    'RSI_14': 65.5, 
    'SMA_Ratio_5_20': 1.02, 
    'MACD_Histogram': 0.5,
    'BB_Position': 0.7, 
    'Volume_Ratio': 1.3
}

similar = find_similar_tickers(query_features, clean_df, kmeans, scaler, top_n=3)
print(similar)

   Ticker  Cluster  Distance     RSI_14  SMA_Ratio_5_20  MACD_Histogram  \
9    AMZN        2  0.609341  59.823787        1.015117       -0.159489   
20     GE        2  0.807588  60.864803        1.009872       -0.832476   
45    UNH        2  0.819458  58.988426        1.000641       -0.506044   

    BB_Position  Volume_Ratio  
9      0.550899      1.208331  
20     0.546844      1.075569  
45     0.571502      1.175841  


In [7]:
df = pd.read_csv(proc_dir / "market_data_clusters.csv")
df['Target'] = (df.groupby('Ticker')['Close'].shift(-1) > df['Close']).astype(int)
features = [
    'RSI_14', 
    'MACD_Histogram',
    'BB_Position', 
    'SMA_Ratio_5_20',
    'Volume_Ratio', 
    'Is_Anomaly', 
    'Cluster'
]
clean_df = df.dropna(subset=['Target'] + features).copy()
split_idx = int(len(clean_df) * 0.8)
train_df = clean_df.iloc[:split_idx]
test_df = clean_df.iloc[split_idx:]
X_train, y_train = train_df[features], train_df['Target']
X_test, y_test = test_df[features], test_df['Target']

xgb_model = xgb.XGBClassifier(n_estimators=500,learning_rate=0.05,max_depth=5,random_state=42,eval_metric='logloss')
xgb_model.fit(X_train, y_train,eval_set=[(X_test, y_test)],verbose=False)

y_pred = xgb_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Directional Accuracy: {acc:.2%}")

joblib.dump(xgb_model, model_root / "trend_model.pkl")

Directional Accuracy: 52.48%


['d:\\VSCode\\Projects\\Stock-Market-Trend-Analysis\\Models\\trend_model.pkl']