### Import statements

In [None]:
# Part 1: Imports and Setup
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.api import SimpleExpSmoothing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.fft import fft
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import os

warnings.filterwarnings('ignore')

The data is organized into two separate folders: stocks and etfs. Additionally, there is a metadata file, symbols_valid_meta.csv, which contains crucial information such as stock symbols, security names, market categories, and whether a symbol represents an ETF.

The dataset is structured as follows:

Metadata file (symbols_valid_meta.csv):
Columns include Symbol, Security Name, ETF, and Market Category. This file provides descriptive context about each symbol, enabling me to differentiate between stocks and ETFs and link symbols to their respective markets.

Stocks and ETF folders:
These folders contain time-series CSV files for individual stock/ETF symbols, including columns such as Date, Open, High, Low, Close, Volume, and Adjusted Close.

In [None]:


# Absolute locations of the metadata CSV and of the folders holding the per-symbol stock / ETF CSV files.
# NOTE(review): these are machine-specific Windows paths; adjust them before running on another machine.
META_PATH = r"D:\Master Things\Fall Sem Classes\Intro to Machine Learning\Homework\Project Submission\Project Notebooks\symbols_valid_meta.csv"
STOCKS_PATH = r"D:\Master Things\Fall Sem Classes\Intro to Machine Learning\Homework\Project Submission\Project Notebooks\stocks"
ETFS_PATH = r"D:\Master Things\Fall Sem Classes\Intro to Machine Learning\Homework\Project Submission\Project Notebooks\etfs"


### Exploratory Data Analysis

To analyze the data structure of the dataset, I begin by importing a single sample file from the collection of multiple files in the dataset. This approach allows me to examine its structure and content in detail. Using functions like `.head()`, `.info()`, and `.describe()`, I gain a preliminary understanding of the dataset, including its columns, data types, summary statistics, and any potential data quality issues such as missing values or irregularities. This exploration helps establish a baseline for subsequent preprocessing and analysis steps. I then use the entire dataset to create a PDF report with the analysis of all stocks and ETFs.

To analyze the data structure of the dataset, I begin by importing a single file, `AAPL.csv`, from the collection of multiple files in the dataset. Focusing on this specific file allows me to examine its structure and content in detail before scaling up to the entire dataset. Using functions like `.head()`, `.info()`, and `.describe()`, I gain a preliminary understanding of the data, including its columns, data types, summary statistics, and any potential data quality issues such as missing values or irregularities. This focused exploration of the `AAPL.csv` file serves as a representative example, helping establish a baseline for subsequent preprocessing and analysis steps. Once the methodology is refined using this single file, it can be extended to analyze all files in the dataset comprehensively.

In [None]:

# Load the sample file (AAPL) and parse its 'Date' column as datetimes
file_path = "D:/Master Things/Fall Sem Classes/Intro to Machine Learning/Homework/Project Submission/Project Notebooks/stocks/AAPL.csv"
df = pd.read_csv(file_path, parse_dates=['Date'])

# Preview the first rows of the sample
df.head()

In [None]:
# Column names, dtypes and non-null counts of the sample
df.info()

In [None]:
# Summary statistics of the sample's columns
df.describe()

In [None]:
# Missing values: print the number of null entries in every column
print("\nMissing Values:")
print(df.isnull().sum())

# Plot missing values as a heatmap of the boolean null mask
# (one row per record, one column per field)
plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
plt.title("Missing Values Heatmap")
plt.show()


In [None]:
# Plot histograms for numerical columns (30 bins each) to inspect their distributions
df.hist(figsize=(12, 10), bins=30)
plt.suptitle("Distribution of Numerical Features", fontsize=16)
plt.show()


In [None]:
# Line plots for key features: closing price and traded volume over time.
# Volume is divided by 1e6 (shown in millions) because both lines share one y-axis.
plt.figure(figsize=(12, 6))
plt.plot(df['Date'], df['Close'], label='Close Price')
plt.plot(df['Date'], df['Volume'] / 1e6, label='Volume (in millions)')
plt.legend()
plt.title("Time-Series Trends of Close Price and Volume")
plt.xlabel("Date")
plt.ylabel("Value")
plt.show()


In [None]:
# Correlation matrix of the price columns and the volume
correlation = df[['Open', 'Close', 'High', 'Low', 'Volume']].corr()

# Heatmap of the correlation matrix, annotated with the coefficients (two decimals)
plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()


In [None]:
# Pairplot to show pairwise relationships between the price columns and the volume
sns.pairplot(df[['Open', 'Close', 'High', 'Low', 'Volume']])
# y=1.02 places the title slightly above the top of the figure
plt.suptitle("Pairwise Relationships", y=1.02)
plt.show()


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

# Redefine the paths of the input folders and of the output folder that stores the generated report
stocks_path = r"D:\Master Things\Fall Sem Classes\Intro to Machine Learning\Homework\Project Submission\Project Notebooks\stocks"
etfs_path = r"D:\Master Things\Fall Sem Classes\Intro to Machine Learning\Homework\Project Submission\Project Notebooks\etfs"
output_folder = r"D:\Master Things\Fall Sem Classes\Intro to Machine Learning\Homework\Project Submission\Processed_Data"
output_pdf_path = os.path.join(output_folder, "Stock_ETF_Summary.pdf")

# Ensure the output directory exists (no error if it is already there)
os.makedirs(output_folder, exist_ok=True)

# Function to process a single file and return DataFrame and summary
def process_file(file_path):
    """Read one price CSV and compute its descriptive statistics.

    Args:
        file_path: Path of the CSV file. It must contain a ``Date`` column,
            which is parsed as datetimes.

    Returns:
        A ``(frame, stats)`` tuple: the loaded DataFrame and its transposed
        ``describe()`` table (one row per described column).
    """
    frame = pd.read_csv(file_path, parse_dates=['Date'])
    stats = frame.describe().T
    return frame, stats

# Function to iterate through files in a folder and generate PDF content
def iterate_folder(folder_path, label, pdf_writer):
    """Append exploratory-analysis pages for every CSV file of a folder to a PDF.

    For each ``*.csv`` file the following pages are written, in this order:
    missing-value percentages, summary-statistics table, per-column
    histograms, time-series trends, correlation heatmap and pairwise
    relationships. Pages that need numeric columns are skipped for a file
    that has none.

    Args:
        folder_path: Directory that contains the CSV files. Every file must
            have the ``Date``, ``Close``, ``High`` and ``Volume`` columns.
        label: Name of the group being processed. It is accepted for call
            compatibility but is not used.
        pdf_writer: Open ``PdfPages`` object that receives the figures.
    """
    # Sorted so that the page order of the PDF is reproducible; the order
    # returned by os.listdir is arbitrary.
    all_files = sorted(
        os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')
    )

    for file_path in all_files:
        file_name = os.path.basename(file_path)
        print(f"Processing file: {file_name}")
        df, summary = process_file(file_path)

        # Select the numeric columns once; several pages below rely on them.
        numeric_df = df.select_dtypes(include=['float64', 'int64'])
        num_columns = numeric_df.columns
        has_numeric = len(num_columns) > 0

        # Missing Value Analysis
        fig, ax = plt.subplots(figsize=(12, 6))
        missing_values = df.isnull().sum()
        missing_values_percentage = (missing_values / len(df)) * 100
        ax.bar(missing_values.index, missing_values_percentage)
        ax.set_title(f"Missing Value Analysis: {file_name}", fontsize=14)
        ax.set_ylabel("Percentage of Missing Values")
        ax.set_xlabel("Columns")
        pdf_writer.savefig(fig)
        plt.close(fig)

        # Summary Statistics Table
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.axis('off')
        ax.table(cellText=summary.values, colLabels=summary.columns, rowLabels=summary.index, loc='center')
        ax.set_title(f"Summary Statistics: {file_name}", fontsize=14)
        pdf_writer.savefig(fig)
        plt.close(fig)

        # Histograms for Each Feature
        if has_numeric:
            # squeeze=False always yields a 2-D array of axes, so a file with a
            # single numeric column is indexed the same way as any other.
            fig, axes = plt.subplots(len(num_columns), 1, figsize=(12, 18), squeeze=False)
            for col, hist_ax in zip(num_columns, axes[:, 0]):
                sns.histplot(numeric_df[col], bins=30, kde=True, ax=hist_ax)
                hist_ax.set_title(f"Histogram: {col}")
            fig.tight_layout()
            pdf_writer.savefig(fig)
            plt.close(fig)

        # Time-Series Trends Analysis
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(df['Date'], df['Close'], label='Close Price')
        ax.plot(df['Date'], df['High'], label='High Price')
        # Volume is shown in millions because it shares the y-axis with the prices.
        ax.plot(df['Date'], df['Volume'] / 1e6, label='Volume (in millions)')
        ax.set_title(f"Time-Series Trends: {file_name}", fontsize=14)
        ax.set_xlabel("Date")
        ax.set_ylabel("Value")
        ax.legend()
        pdf_writer.savefig(fig)
        plt.close(fig)

        if not has_numeric:
            # Nothing to correlate or to pair up for this file.
            continue

        # Correlation Analysis
        fig, ax = plt.subplots(figsize=(10, 8))
        correlation = numeric_df.corr()
        sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
        ax.set_title(f"Correlation Heatmap: {file_name}", fontsize=14)
        pdf_writer.savefig(fig)
        plt.close(fig)

        # Pairwise Relationships
        grid = sns.pairplot(numeric_df, diag_kind='kde', corner=True)
        grid.fig.suptitle(f"Pairwise Relationships: {file_name}", y=1.02, fontsize=14)
        pdf_writer.savefig(grid.fig)
        plt.close(grid.fig)

# Create the output PDF
# Create the output PDF; the with-block closes the file once both folders are processed
with PdfPages(output_pdf_path) as pdf_writer:
    # Process stocks folder
    iterate_folder(stocks_path, "Stocks", pdf_writer)

    # Process etfs folder (its pages follow the stock pages in the same PDF)
    iterate_folder(etfs_path, "ETFs", pdf_writer)

print(f"Processing complete. Summary PDF saved at: {output_pdf_path}")


In [None]:
# Part 2a: Data Loading and Preprocessing

def load_metadata(meta_path):
    """Build a per-symbol lookup table from the symbol metadata CSV.

    Args:
        meta_path: Path of the metadata CSV. It must provide the columns
            ``Symbol``, ``Security Name``, ``ETF`` and ``Market Category``.

    Returns:
        dict mapping every symbol to a dict with the keys
        ``'Security Name'``, ``'ETF'`` and ``'Market Category'``.
    """
    wanted_columns = ['Security Name', 'ETF', 'Market Category']
    by_symbol = pd.read_csv(meta_path).set_index('Symbol')
    return by_symbol[wanted_columns].to_dict('index')




def calculate_rsi(prices, period=14):
    """Compute a Relative Strength Index from simple rolling averages.

    Args:
        prices: pandas Series of prices.
        period: Length of the rolling window used to average gains and losses.

    Returns:
        Series of ``100 - 100 / (1 + avg_gain / avg_loss)``. The first
        ``period - 1`` entries are NaN because the window is incomplete.
    """
    change = prices.diff()
    # Anything that is not a strict gain (including the leading NaN) counts
    # as a zero gain; losses are handled the same way and made positive.
    up_moves = change.where(change > 0, 0)
    down_moves = -change.where(change < 0, 0)
    avg_gain = up_moves.rolling(window=period).mean()
    avg_loss = down_moves.rolling(window=period).mean()
    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

def calculate_macd(prices, fast=12, slow=26, signal=9):
    """Compute the difference between the MACD line and its signal line.

    The MACD line is the fast exponential moving average minus the slow one;
    the signal line is an exponential moving average of the MACD line. The
    returned value is ``MACD - signal`` (not the MACD line itself).

    Args:
        prices: pandas Series of prices.
        fast: Span of the fast exponential moving average.
        slow: Span of the slow exponential moving average.
        signal: Span of the moving average applied to the MACD line.

    Returns:
        Series with the same index as ``prices``.
    """
    def _ema(series, span):
        # Recursive (adjust=False) exponential moving average.
        return series.ewm(span=span, adjust=False).mean()

    macd_line = _ema(prices, fast) - _ema(prices, slow)
    return macd_line - _ema(macd_line, signal)

def create_features(df):
    """Create technical and statistical features.

    The input frame is copied first, so the caller's DataFrame is left
    untouched.

    Added columns: ``Returns``, ``Log_Returns``, ``Price_Range``,
    ``SMA_<w>`` / ``STD_<w>`` for w in 5, 10, 20 and 50, ``RSI``, ``MACD``,
    ``Volume_MA5``, ``Volume_MA20``, ``Volume_Ratio``, ``Next_Day_Return``
    and the binary ``Trend`` target (1 when the next day's return is
    positive, else 0).

    Args:
        df: DataFrame with at least the ``Close``, ``High``, ``Low`` and
            ``Volume`` columns, ordered in time.

    Returns:
        A new DataFrame with the feature columns added. Rows holding a
        missing or infinite value in any column are removed; this includes
        the warm-up rows of the rolling windows and the last row, which has
        no next-day return.
    """
    # Work on a copy so the caller's frame is not modified in place.
    df = df.copy()

    df['Returns'] = df['Close'].pct_change()
    df['Log_Returns'] = np.log1p(df['Returns'])
    df['Price_Range'] = df['High'] - df['Low']

    # Moving averages and technical indicators
    for window in [5, 10, 20, 50]:
        rolling_close = df['Close'].rolling(window=window)
        df[f'SMA_{window}'] = rolling_close.mean()
        df[f'STD_{window}'] = rolling_close.std()

    df['RSI'] = calculate_rsi(df['Close'])
    df['MACD'] = calculate_macd(df['Close'])

    df['Volume_MA5'] = df['Volume'].rolling(window=5).mean()
    df['Volume_MA20'] = df['Volume'].rolling(window=20).mean()
    df['Volume_Ratio'] = df['Volume'] / df['Volume_MA20']

    df['Next_Day_Return'] = df['Returns'].shift(-1)
    df['Trend'] = np.where(df['Next_Day_Return'] > 0, 1, 0)

    # A zero Close or a zero 20-day volume average turns the ratios above into
    # +/-inf. dropna() keeps infinities, and they would later be rejected by
    # the scaler/classifier, so treat them as missing before dropping rows.
    return df.replace([np.inf, -np.inf], np.nan).dropna()


In [None]:
# Part 2b: Forecasting and Trend Classification

def basic_forecasting(df, forecast_days=5):
    """Fit four simple forecasting methods on the close price and score them.

    The first 80% of the rows form the training part and the remainder the
    test part. ``SMA`` and ``EMA`` are 20-period moving averages computed
    over the whole series (test part included); ``SES`` and ``DES`` are
    simple and additive-trend exponential smoothing models fitted on the
    training part and forecast over the length of the test part.

    Args:
        df: DataFrame with a ``Close`` column.
        forecast_days: Accepted for call compatibility; not used.

    Returns:
        ``(forecasts, errors)``: two dicts keyed by ``'SMA'``, ``'EMA'``,
        ``'SES'`` and ``'DES'``. ``forecasts`` holds one Series per method
        and ``errors`` the RMSE of each method on the test part.
    """
    close = df['Close']
    train_size = int(len(df) * 0.8)
    train_data = close[:train_size]
    test_data = close[train_size:]
    horizon = len(test_data)

    # Forecasting Methods
    ses_forecast = SimpleExpSmoothing(train_data).fit().forecast(horizon)
    des_forecast = ExponentialSmoothing(train_data, trend='add').fit().forecast(horizon)

    forecasts = {
        'SMA': close.rolling(window=20).mean(),
        'EMA': close.ewm(span=20, adjust=False).mean(),
        # The model forecasts are appended to the training data so that every
        # series covers the same number of positions as the price series.
        'SES': pd.concat([train_data, ses_forecast]),
        'DES': pd.concat([train_data, des_forecast]),
    }

    # Calculate Errors: RMSE between the test data and the matching slice of each series
    errors = {
        method: np.sqrt(mean_squared_error(test_data, series[train_size:train_size+horizon]))
        for method, series in forecasts.items()
    }

    return forecasts, errors



In [None]:
def trend_classification(df):
    """Train and evaluate a Random Forest that predicts the ``Trend`` column.

    The rows are split in their existing order: the first 80% train the model
    and the remaining rows test it. Features are standardised with a scaler
    fitted on the training part only.

    Args:
        df: DataFrame holding the feature columns listed in ``feature_cols``
            below and the ``Trend`` target column.

    Returns:
        dict with the keys ``'accuracy'``, ``'report'`` (text classification
        report), ``'importance'`` (DataFrame of feature importances, sorted
        in descending order), ``'predictions'`` and ``'true_values'``.
    """
    feature_cols = ['Returns', 'Price_Range', 'SMA_5', 'SMA_20', 'STD_20', 'RSI', 'MACD', 'Volume_Ratio']
    features, target = df[feature_cols], df['Trend']

    split_at = int(len(df) * 0.8)
    X_train, X_test = features[:split_at], features[split_at:]
    y_train, y_test = target[:split_at], target[split_at:]

    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
    rf_model.fit(X_train_scaled, y_train)
    y_pred = rf_model.predict(X_test_scaled)

    importance_df = pd.DataFrame(
        {'feature': feature_cols, 'importance': rf_model.feature_importances_}
    ).sort_values('importance', ascending=False)

    outcome = {}
    outcome['accuracy'] = accuracy_score(y_test, y_pred)
    outcome['report'] = classification_report(y_test, y_pred)
    outcome['importance'] = importance_df
    outcome['predictions'] = y_pred
    outcome['true_values'] = y_test
    return outcome


In [None]:
# Part 2c: Visualization Functions

def plot_price_and_forecasts(df, forecasts, symbol):
    """Show the close price together with the series of each forecasting method.

    Args:
        df: DataFrame with a ``Close`` column; its index is used as x-axis.
        forecasts: dict mapping a method name to its series. Only the first
            four entries are drawn, one per available colour.
        symbol: Ticker symbol shown in the figure title.
    """
    palette = ['blue', 'red', 'green', 'purple']

    traces = [go.Scatter(x=df.index, y=df['Close'], name='Actual Price', line=dict(color='black'))]
    for color, (method, forecast) in zip(palette, forecasts.items()):
        dashed_line = dict(color=color, dash='dash')
        traces.append(go.Scatter(x=df.index, y=forecast, name=f'{method} Forecast', line=dashed_line))

    fig = go.Figure(data=traces)
    fig.update_layout(
        title=f'Price Forecasts - {symbol}',
        xaxis_title='Date',
        yaxis_title='Price',
        height=600,
        showlegend=True,
    )
    fig.show()

def plot_trend_analysis(results, symbol):
    """Plot trend classification results as a 2x2 dashboard.

    The panels are: feature importance (horizontal bars), share of predicted
    down/up trends (pie), confusion matrix (heatmap) and the predictions in
    test order (scatter).

    Args:
        results: dict with the keys ``'importance'`` (DataFrame with the
            ``feature`` and ``importance`` columns), ``'predictions'`` and
            ``'true_values'`` (class labels 0 = down, 1 = up).
        symbol: Ticker symbol shown in the figure title.
    """
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Feature Importance', 'Prediction Distribution', 'Confusion Matrix', 'Prediction Timeline'),
        specs=[[{"type": "bar"}, {"type": "domain"}], [{"type": "heatmap"}, {"type": "scatter"}]]
    )

    importance = results['importance']
    fig.add_trace(go.Bar(x=importance['importance'], y=importance['feature'], orientation='h'), row=1, col=1)

    # Plain arrays: predictions and true values are paired by position.
    predictions = np.asarray(results['predictions'])
    true_values = np.asarray(results['true_values'])

    pred_dist = pd.Series(predictions).value_counts()
    fig.add_trace(go.Pie(labels=['Downtrend', 'Uptrend'], values=[pred_dist.get(0, 0), pred_dist.get(1, 0)]), row=1, col=2)

    # crosstab only lists the classes that actually occur. Force the full 2x2
    # layout so the fixed axis labels below stay correct when the model
    # predicts a single class or the test part contains a single class.
    cm = pd.crosstab(true_values, predictions).reindex(index=[0, 1], columns=[0, 1], fill_value=0)
    fig.add_trace(go.Heatmap(z=cm.values, x=['Pred Down', 'Pred Up'], y=['True Down', 'True Up']), row=2, col=1)

    fig.add_trace(go.Scatter(x=np.arange(len(predictions)), y=predictions, mode='markers', marker=dict(color=predictions, colorscale='RdYlGn')), row=2, col=2)

    fig.update_layout(height=800, title_text=f"Trend Analysis - {symbol}", showlegend=False)
    fig.show()


In [None]:
# Part 3: Main Analysis Function

def analyze_stock(symbol, is_etf=False):
    """Run the complete analysis pipeline for one symbol.

    Loads the symbol's CSV, attaches its metadata, builds the features, runs
    the forecasting methods and the trend classifier, shows both plots and
    prints the forecasting errors and the classification report.

    Args:
        symbol: Ticker symbol; ``<symbol>.csv`` is read from the data folder.
        is_etf: When true the file is read from ``ETFS_PATH``, otherwise from
            ``STOCKS_PATH``. Also used as the ``Is_ETF`` value when the
            symbol is missing from the metadata.

    Returns:
        dict with the keys ``'symbol'``, ``'forecast_errors'`` (RMSE per
        forecasting method) and ``'trend_accuracy'``.
    """
    symbol_info = load_metadata(META_PATH)

    data_folder = ETFS_PATH if is_etf else STOCKS_PATH
    df = pd.read_csv(os.path.join(data_folder, f"{symbol}.csv"))
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('Date')

    # Prefer the metadata entry; fall back to the arguments when there is none.
    meta = symbol_info.get(symbol)
    if meta is None:
        security_name, etf_flag = symbol, ('Y' if is_etf else 'N')
    else:
        security_name, etf_flag = meta['Security Name'], meta['ETF']
    df['Security_Name'] = security_name
    df['Is_ETF'] = etf_flag

    df = create_features(df)
    forecasts, errors = basic_forecasting(df)
    trend_results = trend_classification(df)

    plot_price_and_forecasts(df, forecasts, symbol)
    plot_trend_analysis(trend_results, symbol)

    print(f"\nResults for {symbol}")
    print("="*50)
    print("\nForecasting RMSE:")
    for method in errors:
        print(f"{method}: {errors[method]:.2f}")

    print("\nTrend Classification Report:")
    print(trend_results['report'])

    summary = {
        'symbol': symbol,
        'forecast_errors': errors,
        'trend_accuracy': trend_results['accuracy'],
    }
    return summary

# Example Usage
# Analyze a stock (e.g., 'AAPL'): runs the whole pipeline, shows the plots and prints the report
results = analyze_stock('AAPL', is_etf=False)

# Analyze an ETF (e.g., 'SPY')
# results = analyze_stock('SPY', is_etf=True)


In [None]:


# Load the symbol metadata (symbols are in the 'Symbol' column, ETF flags in the 'ETF' column)
metadata_path = r"D:\Master Things\Fall Sem Classes\Intro to Machine Learning\Homework\Project Submission\Project Notebooks\symbols_valid_meta.csv"
metadata_df = pd.read_csv(metadata_path)
random_symbols = metadata_df['Symbol'].sample(n=5, random_state=42)  # Setting random_state for reproducibility

all_results = []

# Loop through each symbol in the random sample and run the analysis.
# sample() keeps the original row labels, so each label can be used to look up
# the ETF flag of the sampled symbol.
for row_label, symbol in random_symbols.items():
    # ETF files live in the etfs folder, so the flag decides where the CSV is read from.
    # NOTE(review): assumes the 'ETF' column is encoded as 'Y'/'N', matching the
    # fallback values used in analyze_stock - confirm against the metadata file.
    is_etf = metadata_df.at[row_label, 'ETF'] == 'Y'
    try:
        result = analyze_stock(symbol, is_etf=is_etf)
        all_results.append(result)
    except Exception as e:
        # Best effort: report the failing symbol and continue with the next one.
        print(f"Error analyzing {symbol}: {e}")

# Convert results into a DataFrame for easier access
results_df = pd.DataFrame(all_results)

# Display or save the results DataFrame
print(results_df)
# Optionally, save to CSV
results_df.to_csv("all_stock_analysis_results.csv", index=False)

