# Data Exploration Notebook

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Plot configuration: use matplotlib's 'default' style sheet and select
# seaborn's "husl" palette for subsequent seaborn/matplotlib plots.
plt.style.use('default')
sns.set_palette("husl")

# Confirmation banner printed once the setup above has run.
print(
    "Libraries imported successfully!",
    "Ready to explore stock market data!",
    sep="\n",
)

## Load the Data

First, let's load the raw stock data collected by `data_collector.py` from `../data/raw/stock_data.csv`.

In [None]:
# Read the collected prices from disk and print a short summary of what was loaded.
# NOTE(review): when the file is missing this cell only prints guidance; `df` is
# then never assigned, so any later cell that uses `df` will fail.
try:
    df = pd.read_csv('../data/raw/stock_data.csv')
except FileNotFoundError:
    print("Data file not found! Please run data_collector.py first.")
    print("Run this command in terminal: python data_collector.py")
else:
    # Parse the Date column so min()/max() below operate on timestamps.
    df['Date'] = pd.to_datetime(df['Date'])
    print("Data loaded successfully!")
    print(f"Shape: {df.shape}")
    dates = df['Date']
    print(f"Date range: {dates.min().date()} to {dates.max().date()}")
    loaded_symbols = df['Symbol'].unique()
    print(f"Stocks: {', '.join(loaded_symbols)}")

## Basic Data Overview

In [None]:
# Schema overview: df.info() prints column names, dtypes and non-null counts.
print("Dataset Info:", "-" * 40, sep="\n")
df.info()

# Row preview: the bare expression on the last line is what the cell displays.
print("\nFirst few rows:", "-" * 40, sep="\n")
df.head()

In [None]:
# Descriptive statistics; the bare describe() call on the last line is the
# cell's displayed output.
print("Statistical Summary:", "-" * 40, sep="\n")
df.describe()

## Price Visualization

Let's create some beautiful charts to understand price movements.

In [None]:
# Closing-price comparison: one plotly line per symbol on a shared time axis.

# Palette shared with the later plotting cells (they read `colors` too).
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

fig = go.Figure()

tickers = df['Symbol'].unique()
for idx, ticker in enumerate(tickers):
    # Rows for this symbol, in chronological order so the line is drawn left to right.
    history = df.loc[df['Symbol'] == ticker].sort_values('Date')

    # Wrap around the palette when there are more symbols than colors.
    line_style = dict(color=colors[idx % len(colors)], width=2)

    price_trace = go.Scatter(
        x=history['Date'],
        y=history['Close'],
        mode='lines',
        name=ticker,
        line=line_style,
    )
    fig.add_trace(price_trace)

layout_options = dict(
    title='Stock Price Comparison Over Time',
    xaxis_title='Date',
    yaxis_title='Price ($)',
    hovermode='x unified',
    height=600,
)
fig.update_layout(**layout_options)

fig.show()

## Volume Analysis

In [None]:
# Volume analysis
# One histogram of traded volume per symbol, laid out on a fixed 2x3 grid.
# Uses the `colors` palette defined in the price-comparison cell.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Trading Volume Analysis', fontsize=16, fontweight='bold')

symbols = df['Symbol'].unique()
panels = axes.ravel()  # row-major: top row left-to-right, then bottom row

# zip() stops at the shorter input, so symbols beyond the six available
# panels are skipped instead of indexing outside the grid.
for i, (symbol, ax) in enumerate(zip(symbols, panels)):
    symbol_data = df[df['Symbol'] == symbol]

    # Cycle through the palette (as the plotly cells do); indexing with the
    # raw `i` would raise IndexError for a sixth symbol because the palette
    # only has five entries.
    ax.hist(symbol_data['Volume'], bins=50, alpha=0.7, color=colors[i % len(colors)])
    ax.set_title(f'{symbol} Volume Distribution')
    ax.set_xlabel('Volume')
    ax.set_ylabel('Frequency')
    ax.ticklabel_format(style='scientific', axis='x', scilimits=(0, 0))

# Remove every panel that received no histogram, not only the bottom-right
# one, so fewer than five symbols does not leave empty axes in the figure.
for unused_ax in panels[len(symbols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## Daily Returns Analysis

In [None]:
# Calculate daily returns
# pct_change() compares each row with the previous row of the same symbol, so
# the rows must be in chronological order. The calculation runs on a
# date-sorted copy; the result keeps the original row labels and is aligned
# back onto df by index when assigned, so df's own row order is untouched.
# Assumes df has unique index labels (true for a frame fresh from read_csv).
df['Daily_Return'] = (
    df.sort_values('Date')
      .groupby('Symbol')['Close']
      .pct_change()
)

# Create returns distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# One Series of non-null returns per symbol, in order of first appearance.
labels = list(df['Symbol'].unique())
returns_data = [
    df.loc[df['Symbol'] == symbol, 'Daily_Return'].dropna()
    for symbol in labels
]

# Left panel: per-symbol box plots. Tick labels are set on the axis rather
# than through boxplot(labels=...), which newer Matplotlib releases deprecate
# in favor of tick_labels; set_xticklabels works on both old and new versions.
axes[0].boxplot(returns_data)
axes[0].set_xticklabels(labels)
axes[0].set_title('Daily Returns Distribution by Stock')
axes[0].set_ylabel('Daily Return')
axes[0].grid(True, alpha=0.3)

# Right panel: pooled histogram across all symbols with the mean marked.
all_returns = df['Daily_Return'].dropna()
mean_return = all_returns.mean()
axes[1].hist(all_returns, bins=100, alpha=0.7, color='skyblue', edgecolor='black')
axes[1].axvline(mean_return, color='red', linestyle='--', label=f'Mean: {mean_return:.4f}')
axes[1].set_title('Overall Daily Returns Distribution')
axes[1].set_xlabel('Daily Return')
axes[1].set_ylabel('Frequency')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Per-symbol summary table; the standard deviation of daily returns is
# reported under the 'Volatility' heading.
print("Daily Returns Statistics:")
print("-" * 40)
returns_stats = df.groupby('Symbol')['Daily_Return'].agg(['mean', 'std', 'min', 'max'])
returns_stats.columns = ['Mean Return', 'Volatility', 'Min Return', 'Max Return']
print(returns_stats.round(4))

## Volatility Analysis

In [None]:
# Calculate rolling volatility
# Requires the 'Daily_Return' column created in the daily-returns cell.

# Make sure the frame has a unique 0..n-1 index so the per-symbol result
# below can be aligned back onto df by row label.
df.reset_index(drop=True, inplace=True)

# groupby(...).rolling(...) returns a Series indexed by (Symbol, row label),
# which cannot be aligned with df's flat index when assigned as a column.
# transform() instead returns one value per input row under the same row
# labels, so the assignment lines up. The window runs over a date-sorted copy
# so each 30-row window is chronological within its symbol.
# sqrt(252) scales the daily standard deviation to the annualized figure named
# in the chart title (252 is presumably trading days per year).
df['Volatility_30'] = (
    df.sort_values('Date')
      .groupby('Symbol')['Daily_Return']
      .transform(lambda returns: returns.rolling(window=30).std())
    * np.sqrt(252)
)

# Plot volatility over time
fig = go.Figure()

for i, symbol in enumerate(df['Symbol'].unique()):
    symbol_data = df[df['Symbol'] == symbol].sort_values('Date')

    fig.add_trace(go.Scatter(
        x=symbol_data['Date'],
        y=symbol_data['Volatility_30'],
        mode='lines',
        name=f'{symbol} Volatility',
        # Wrap around the palette when there are more symbols than colors.
        line=dict(color=colors[i % len(colors)], width=2)
    ))

fig.update_layout(
    title='30-Day Rolling Volatility (Annualized)',
    xaxis_title='Date',
    yaxis_title='Volatility',
    hovermode='x unified',
    height=500
)

fig.show()

## Key Insights and Next Steps

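The cell below derives a few headline figures from the data explored above: overall data quality, the most and least volatile stocks, and the best and worst performers by total return.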

In [None]:
# Generate insights
# Requires the 'Daily_Return' column created in the daily-returns cell.
print("KEY INSIGHTS FROM YOUR DATA:")
print("=" * 50)

# Data quality
# Count gaps in the loaded columns only. The derived columns always contain
# NaNs by construction (the first pct_change value of each symbol, and the
# warm-up rows of the 30-row rolling window), which would otherwise be
# reported as missing data. errors='ignore' keeps this working when a derived
# column has not been created.
derived_columns = ['Daily_Return', 'Volatility_30']
missing_data = (
    df.drop(columns=derived_columns, errors='ignore')
      .isnull()
      .sum()
      .sum()
)
print(f"Data Quality: {len(df):,} total rows, {missing_data} missing values")

# Volatility ranking (standard deviation of daily returns, highest first)
volatility_ranking = df.groupby('Symbol')['Daily_Return'].std().sort_values(ascending=False)
print(f"\nMost Volatile Stock: {volatility_ranking.index[0]} ({volatility_ranking.iloc[0]:.4f})")
print(f"Least Volatile Stock: {volatility_ranking.index[-1]} ({volatility_ranking.iloc[-1]:.4f})")

# Best performer
# Total return = last close / first close - 1 per symbol. Rows are sorted by
# Date first so "first" and "last" mean earliest and latest, not file order.
# Aggregating the Close column directly avoids running apply() over the whole
# grouped frame.
total_returns = (
    df.sort_values('Date')
      .groupby('Symbol')['Close']
      .agg(lambda close: close.iloc[-1] / close.iloc[0] - 1)
)
total_returns = total_returns.sort_values(ascending=False)
print(f"\nBest Performer: {total_returns.index[0]} ({total_returns.iloc[0]:.2%} total return)")
print(f"Worst Performer: {total_returns.index[-1]} ({total_returns.iloc[-1]:.2%} total return)")