# 🔍 CryptoSense - Data Exploration Notebook

This notebook explores the cryptocurrency data before modeling.

**Why explore first?**
- Understand data distributions
- Identify missing values and outliers
- Discover patterns and relationships
- Make informed feature engineering decisions

Real data scientists always explore before modeling!

In [None]:
# Imports
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from scipy import stats
# Keep rendered cell output free of library warnings.
# NOTE(review): this filter ignores every warning category, deprecations included.
warnings.filterwarnings('ignore')

# Set style: defaults shared by the matplotlib/seaborn figures in this notebook
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

# Status glyph repaired: it was mis-decoded UTF-8 and printed as garbage.
print("✅ Imports successful!")

In [None]:
# Load configuration
# Import the settings by name rather than with a wildcard, so it is clear
# where each constant comes from and nothing from `config` can silently
# shadow a name defined in an earlier cell.
from config import (
    DATA_DIR,
    LOOKBACK_WINDOW,
    PROCESSED_DATA_DIR,
    RAW_DATA_DIR,
    TEST_SPLIT,
    TRAIN_SPLIT,
    VAL_SPLIT,
)

print(f"Data directory: {DATA_DIR}")
print(f"Lookback window: {LOOKBACK_WINDOW} days")
print(f"Train/Val/Test split: {TRAIN_SPLIT}/{VAL_SPLIT}/{TEST_SPLIT}")

## 1. Load Raw Data

Let's load the raw Bitcoin data and see what we're working with.

In [None]:
# Load BTC raw data
# Leaves `df_raw` as a DataFrame when the CSV exists, otherwise as None so
# that the cells below can skip themselves.
btc_file = os.path.join(RAW_DATA_DIR, 'btc_usd_raw.csv')

if os.path.exists(btc_file):
    df_raw = pd.read_csv(btc_file)
    df_raw['date'] = pd.to_datetime(df_raw['date'])
    # The return and rolling-window calculations in later cells compare each
    # row with the previous one, so enforce chronological order up front.
    # A stable sort keeps the file order for rows sharing a date.
    df_raw = df_raw.sort_values('date', kind='stable').reset_index(drop=True)
    print(f"✅ Loaded {len(df_raw)} rows of BTC data")
    print(f"Date range: {df_raw['date'].min()} to {df_raw['date'].max()}")
else:
    print("❌ Raw data not found. Run data_collection.py first!")
    df_raw = None

In [None]:
# Preview both ends of the frame: the first ten rows, then the last ten
if df_raw is not None:
    for preview in (df_raw.head(10), df_raw.tail(10)):
        display(preview)

In [None]:
# Column dtypes / non-null counts, followed by descriptive statistics
if df_raw is not None:
    rule = "=" * 60  # horizontal separator reused under each heading

    print("\nDataset Info:")
    print(rule)
    df_raw.info()
    print(f"\n{rule}")
    print("\nBasic Statistics:")
    print(rule)
    display(df_raw.describe())

## 2. Missing Values Analysis

Check for missing data and understand gaps.

In [None]:
if df_raw is not None:
    # Null count per column and the share of rows it represents
    missing = df_raw.isnull().sum()
    missing_pct = (missing / len(df_raw)) * 100

    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Percentage': missing_pct
    }).sort_values('Missing Count', ascending=False)

    if missing.sum() > 0:
        # Only show the table and chart when there is something to report;
        # otherwise an empty frame would be rendered above the all-clear message.
        print("Missing Values:")
        display(missing_df[missing_df['Missing Count'] > 0])

        # Visualize the affected columns only
        fig, ax = plt.subplots(figsize=(10, 6))
        missing[missing > 0].plot(kind='barh', ax=ax)
        ax.set_title('Missing Values by Column')
        ax.set_xlabel('Count')
        plt.show()
    else:
        print("✅ No missing values in raw price data!")

## 3. Price Visualization

Visualize the price history with volume.

In [None]:
if df_raw is not None:
    # Interactive chart: candlesticks on top (70% of the height), volume bars
    # underneath (30%), both panels sharing one x-axis.
    fig = make_subplots(
        rows=2,
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.03,
        subplot_titles=('BTC Price', 'Volume'),
        row_heights=[0.7, 0.3],
    )

    price_trace = go.Candlestick(
        x=df_raw['date'],
        open=df_raw['open'],
        high=df_raw['high'],
        low=df_raw['low'],
        close=df_raw['close'],
        name='Price',
    )
    volume_trace = go.Bar(
        x=df_raw['date'],
        y=df_raw['volume'],
        name='Volume',
        marker_color='lightblue',
    )

    fig.add_trace(price_trace, row=1, col=1)
    fig.add_trace(volume_trace, row=2, col=1)

    fig.update_layout(
        title='Bitcoin Price History and Volume',
        xaxis_rangeslider_visible=False,  # hide the range slider on the first x-axis
        height=700,
    )

    fig.show()

## 4. Returns Analysis

Calculate and analyze daily returns.

In [None]:
if df_raw is not None:
    # Calculate returns
    # fill_method=None leaves gaps in `close` as NaN instead of forward-filling
    # them (the deprecated pct_change default, which turns a gap into a 0%
    # return). This keeps the simple and log returns missing on the same rows.
    df_raw['daily_return'] = df_raw['close'].pct_change(fill_method=None)
    df_raw['log_return'] = np.log(df_raw['close'] / df_raw['close'].shift(1))

    # Compute the summary statistics once and reuse them in the report
    mean_return = df_raw['daily_return'].mean()
    std_return = df_raw['daily_return'].std()
    # sqrt(365) scales the daily mean/std ratio to a yearly figure. The ratio
    # is undefined when the returns have no dispersion (or fewer than two rows).
    sharpe = (mean_return / std_return) * np.sqrt(365) if std_return > 0 else float('nan')

    print("Returns Statistics:")
    print("="*60)
    print(f"Mean daily return: {mean_return*100:.4f}%")
    print(f"Std daily return: {std_return*100:.4f}%")
    print(f"Max daily return: {df_raw['daily_return'].max()*100:.2f}%")
    print(f"Min daily return: {df_raw['daily_return'].min()*100:.2f}%")
    print(f"Sharpe Ratio (approx): {sharpe:.2f}")

In [None]:
if df_raw is not None:
    # Distribution of returns: histogram (left) and normal Q-Q plot (right).
    # `stats` (scipy.stats) comes from the notebook's top import cell rather
    # than being imported mid-notebook.
    returns = df_raw['daily_return'].dropna()  # drop NaNs once, reuse for both panels

    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Histogram, with a dashed marker at a return of zero
    hist_ax.hist(returns, bins=100, edgecolor='black', alpha=0.7)
    hist_ax.axvline(0, color='red', linestyle='--', linewidth=2)
    hist_ax.set_xlabel('Daily Return')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Distribution of Daily Returns')
    hist_ax.grid(True, alpha=0.3)

    # Q-Q plot of the sample against a normal distribution
    stats.probplot(returns, dist="norm", plot=qq_ax)
    qq_ax.set_title('Q-Q Plot (Normality Check)')
    qq_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 5. Fear & Greed Index Analysis

Explore the sentiment indicator.

In [None]:
if df_raw is not None and 'fear_greed_value' in df_raw.columns:
    # Summary statistics of the numeric sentiment score
    print("Fear & Greed Index Statistics:")
    print("="*60)
    print(df_raw['fear_greed_value'].describe())

    # The guard above only proves the numeric column exists, so check for the
    # label column separately instead of raising KeyError when it is absent.
    if 'fear_greed_classification' in df_raw.columns:
        print("\nValue Classification Distribution:")
        print(df_raw['fear_greed_classification'].value_counts())

In [None]:
if df_raw is not None and 'fear_greed_value' in df_raw.columns:
    # Line chart of the Fear & Greed value over time
    sentiment_line = go.Scatter(
        x=df_raw['date'],
        y=df_raw['fear_greed_value'],
        mode='lines',
        name='Fear & Greed',
        line={'color': 'purple', 'width': 2},
    )

    fig = go.Figure()
    fig.add_trace(sentiment_line)

    # Dashed reference lines at the 25 and 75 levels, in that order
    for level, color, label in (
        (25, "red", "Extreme Fear"),
        (75, "green", "Extreme Greed"),
    ):
        fig.add_hline(y=level, line_dash="dash", line_color=color, annotation_text=label)

    fig.update_layout(
        title='Fear & Greed Index Over Time',
        xaxis_title='Date',
        yaxis_title='Fear & Greed Value',
        height=500,
    )

    fig.show()

## 6. Correlation Analysis

Understand relationships between variables.

In [None]:
if df_raw is not None:
    # Pairwise correlations across every numeric column currently in df_raw
    numeric_cols = list(df_raw.select_dtypes(include=[np.number]).columns)
    corr = df_raw.loc[:, numeric_cols].corr()

    # Annotated heatmap with a diverging palette centred on zero
    heatmap_options = dict(
        annot=True,
        fmt='.2f',
        cmap='RdBu_r',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, **heatmap_options)
    plt.title('Correlation Matrix of Raw Features')
    plt.tight_layout()
    plt.show()

## 7. Volatility Analysis

Crypto is known for volatility. Let's analyze it!

In [None]:
if df_raw is not None:
    # 30-row rolling standard deviation of daily returns, scaled by sqrt(365)
    # and expressed in percent
    rolling_std = df_raw['daily_return'].rolling(window=30).std()
    df_raw['volatility_30d'] = rolling_std * np.sqrt(365) * 100

    # Close price (top) and rolling volatility (bottom) on a shared x-axis
    fig = make_subplots(
        rows=2,
        cols=1,
        shared_xaxes=True,
        subplot_titles=('BTC Close Price', '30-Day Rolling Volatility (Annualized)'),
        row_heights=[0.5, 0.5],
    )

    close_line = go.Scatter(
        x=df_raw['date'],
        y=df_raw['close'],
        name='Close Price',
        line={'color': 'blue'},
    )
    volatility_line = go.Scatter(
        x=df_raw['date'],
        y=df_raw['volatility_30d'],
        name='Volatility',
        line={'color': 'red'},
    )

    fig.add_trace(close_line, row=1, col=1)
    fig.add_trace(volatility_line, row=2, col=1)

    fig.update_layout(height=700, title_text='Price vs Volatility')
    fig.show()

    # Headline figures for the rolling series (NaN warm-up rows are skipped
    # by pandas' default reductions)
    vol = df_raw['volatility_30d']
    print(f"\nAverage annualized volatility: {vol.mean():.2f}%")
    print(f"Max volatility: {vol.max():.2f}%")
    print(f"Min volatility: {vol.min():.2f}%")

## 8. Feature Engineering Preview

Load and explore engineered features.

In [None]:
# Load featured data
# Leaves `df_featured` as a DataFrame when the processed CSV exists, otherwise
# as None so the following cell can skip itself.
featured_file = os.path.join(PROCESSED_DATA_DIR, 'btc_usd_featured.csv')

if os.path.exists(featured_file):
    df_featured = pd.read_csv(featured_file)
    df_featured['date'] = pd.to_datetime(df_featured['date'])
    # Status glyphs repaired: they were mis-decoded UTF-8 and printed as garbage.
    print(f"✅ Loaded featured data: {len(df_featured)} rows, {len(df_featured.columns)} features")
    display(df_featured.head())
else:
    print("⚠️ Featured data not found. Run feature_engineering.py first!")
    df_featured = None

In [None]:
if df_featured is not None:
    # Rank features by the absolute Pearson correlation with the target column
    target = 'close'

    # Restrict to numeric/bool columns first: DataFrame.corr no longer drops
    # string columns silently (pandas >= 2.0 raises on them), and the datetime
    # 'date' column is excluded by the same filter.
    numeric_featured = df_featured.select_dtypes(include=['number', 'bool'])
    feature_cols = [col for col in numeric_featured.columns if col != target]

    correlations = (
        numeric_featured[feature_cols + [target]]
        .corr()[target]
        .drop(target)
        .abs()
        .sort_values(ascending=False)
    )

    print("Top 20 features correlated with closing price:")
    print("="*60)
    display(correlations.head(20))

    # Plot the same top-20 slice as horizontal bars
    fig, ax = plt.subplots(figsize=(10, 8))
    correlations.head(20).plot(kind='barh', ax=ax)
    ax.set_xlabel('Absolute Correlation with Close Price')
    ax.set_title('Top 20 Features by Correlation')
    plt.tight_layout()
    plt.show()

## 9. Key Insights

**Summary of findings:**

1. **Data Quality**: [Your observations here]
2. **Price Trends**: [Your observations here]
3. **Volatility Patterns**: [Your observations here]
4. **Sentiment Relationship**: [Your observations here]
5. **Feature Importance**: [Your observations here]

**Next Steps:**
- Complete feature engineering
- Train LSTM model
- Evaluate predictions
- Build dashboard