# Notebook 1: Data Loading, Cleaning & Exploratory Data Analysis (EDA)

**Objective:** Load and clean the historical trader data and Fear/Greed sentiment data, merge them, and perform initial exploratory analysis.

**Datasets:**
1. Fear/Greed Index Data
2. Historical Trader Data from Hyperliquid

---

## Step 1: Import Required Libraries

In [None]:
# Import necessary libraries
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Silence library warnings so the notebook output stays readable.
# NOTE(review): this hides every warning category for the whole session,
# including pandas deprecation notices; narrow the filter if those matter.
warnings.filterwarnings('ignore')

# Set plotting style
sns.set_style('whitegrid')  # seaborn grid theme for all figures below
plt.rcParams['figure.figsize'] = (12, 6)  # default (width, height) for new figures

# The status emoji was stored mis-encoded; emit the intended check mark.
print("✅ Libraries imported successfully!")

## Step 2: Load Datasets

In [None]:
# Load Fear/Greed Index Data (path is relative to the notebook's working directory)
fear_greed_df = pd.read_csv('../fear_greed_index.csv')

# Load Historical Trader Data
trader_df = pd.read_csv('../historical_data.csv')

# Report (rows, columns) of each frame as a quick sanity check on the load
print("Fear/Greed Dataset Shape:", fear_greed_df.shape)
print("Trader Dataset Shape:", trader_df.shape)
# The status emoji was stored mis-encoded; emit the intended check mark.
print("\n✅ Data loaded successfully!")

## Step 3: Inspect Fear/Greed Dataset

In [None]:
# Quick look at the sentiment data: sample rows, dtypes, summary stats, null counts
print("=== Fear/Greed Index - First 5 Rows ===")
fg_preview = fear_greed_df.head()
display(fg_preview)

# One print with sep="\n" puts the heading and the value on separate lines
print("\n=== Data Types ===", fear_greed_df.dtypes, sep="\n")

print("\n=== Basic Statistics ===")
fg_summary = fear_greed_df.describe()
display(fg_summary)

# Per-column count of missing entries
print("\n=== Missing Values ===", fear_greed_df.isna().sum(), sep="\n")

In [None]:
# Count how many rows carry each sentiment classification label
print("=== Unique Classifications ===")
classification_counts = fear_greed_df['classification'].value_counts()
print(classification_counts)

## Step 4: Inspect Trader Dataset

In [None]:
# Quick look at the trader data: sample rows, dtypes, and the column list
print("=== Historical Trader Data - First 5 Rows ===")
trader_preview = trader_df.head()
display(trader_preview)

# One print with sep="\n" puts the heading and the value on separate lines
print("\n=== Data Types ===", trader_df.dtypes, sep="\n")

column_names = trader_df.columns.tolist()
print("\n=== Column Names ===", column_names, sep="\n")

In [None]:
# Summary statistics for the numeric trader columns
print("=== Basic Statistics ===")
trader_summary = trader_df.describe()
display(trader_summary)

# Per-column count of missing entries
print("\n=== Missing Values ===", trader_df.isna().sum(), sep="\n")

## Step 5: Data Cleaning - Fear/Greed Dataset

In [None]:
# Convert date column to datetime so it can be joined against trade dates
fear_greed_df['date'] = pd.to_datetime(fear_greed_df['date'])

# Drop duplicate dates BEFORE sorting: sort_values() defaults to a
# non-stable algorithm, so de-duplicating afterwards would keep an
# arbitrary row. Doing it first keeps the first occurrence in file order.
# The index is reset last so it has no gaps left by the removed rows.
fear_greed_df = (
    fear_greed_df
    .drop_duplicates(subset=['date'], keep='first')
    .sort_values('date')
    .reset_index(drop=True)
)

# The status emoji was stored mis-encoded; emit the intended check mark.
print("✅ Fear/Greed dataset cleaned!")
print(f"Date range: {fear_greed_df['date'].min()} to {fear_greed_df['date'].max()}")
print(f"Total days: {len(fear_greed_df)}")

## Step 6: Data Cleaning - Trader Dataset

In [None]:
# Convert Timestamp IST to datetime; values that do not match the
# day-month-year format become NaT (errors='coerce') and are dropped below
trader_df['Timestamp IST'] = pd.to_datetime(
    trader_df['Timestamp IST'], format='%d-%m-%Y %H:%M', errors='coerce'
)

# Keep only the calendar day (time set to midnight) for merging with sentiment data.
# NOTE(review): the day is taken from the IST timestamp as-is — confirm the
# sentiment index uses the same day boundary.
trader_df['date'] = trader_df['Timestamp IST'].dt.normalize()

# Convert Closed PnL to numeric BEFORE dropping missing values, so entries
# that cannot be parsed (coerced to NaN) are removed together with the rest
trader_df['Closed PnL'] = pd.to_numeric(trader_df['Closed PnL'], errors='coerce')

# Remove rows with missing critical values
print(f"Original rows: {len(trader_df)}")
trader_df = trader_df.dropna(subset=['date', 'Closed PnL'])
print(f"After removing NaN: {len(trader_df)}")

# The status emoji was stored mis-encoded; emit the intended check mark.
print("\n✅ Trader dataset cleaned!")
print(f"Date range: {trader_df['date'].min()} to {trader_df['date'].max()}")
print(f"Total trades: {len(trader_df)}")

## Step 7: Merge Datasets on Date

In [None]:
# Attach the daily sentiment label and score to every trade.
# Only the columns needed downstream are taken from the sentiment frame.
daily_sentiment = fear_greed_df[['date', 'classification', 'value']]

# Inner join on the calendar date: trades whose date has no sentiment row are dropped
merged_df = pd.merge(trader_df, daily_sentiment, how='inner', on='date')

print(f"Merged dataset shape: {merged_df.shape}")
merged_dates = merged_df['date']
print(f"Date range: {merged_dates.min()} to {merged_dates.max()}")
print(f"Total trades with sentiment: {len(merged_df)}")

# Display sample
merged_preview = merged_df.head()
display(merged_preview)

## Step 8: Basic Exploratory Data Analysis (EDA)

In [None]:
# Distribution of sentiment classifications
# Count trades per sentiment label once; reuse it for printing and plotting
sentiment_counts = merged_df['classification'].value_counts()

print("=== Sentiment Distribution ===")
print(sentiment_counts)

# Colour each bar by what its label means rather than by its position:
# value_counts() orders labels by frequency, so a positional colour list
# attaches e.g. red to whichever label happens to be the most common.
# NOTE(review): the label spellings below are assumed — confirm against the
# value_counts() output. Unknown labels fall back to a neutral colour.
sentiment_colors = {
    'Extreme Fear': 'red',
    'Fear': 'orange',
    'Neutral': 'gray',
    'Greed': 'green',
    'Extreme Greed': 'darkgreen',
}
bar_colors = [sentiment_colors.get(label, 'steelblue') for label in sentiment_counts.index]

plt.figure(figsize=(10, 6))
plt.bar(sentiment_counts.index, sentiment_counts.values, color=bar_colors)
plt.title('Distribution of Market Sentiment in Trading Data', fontsize=14, fontweight='bold')
plt.xlabel('Sentiment Classification')
plt.ylabel('Number of Trades')
plt.xticks(rotation=45)
plt.tight_layout()

# savefig() does not create missing directories, so make sure the target exists
Path('outputs').mkdir(parents=True, exist_ok=True)
plt.savefig('outputs/sentiment_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# The status emoji was stored mis-encoded; emit the intended check mark.
print("\n✅ Chart saved to outputs/sentiment_distribution.png")

In [None]:
# Trading side distribution
# Count trades per side once; reuse it for printing and plotting
side_counts = merged_df['Side'].value_counts()

print("=== Trading Side Distribution ===")
print(side_counts)

plt.figure(figsize=(8, 6))
plt.pie(
    side_counts.values,
    labels=side_counts.index,
    autopct='%1.1f%%',  # show each wedge's share with one decimal place
    startangle=90,
    colors=['#66b3ff', '#ff9999'],
)
plt.title('Buy vs Sell Trades', fontsize=14, fontweight='bold')
plt.tight_layout()

# savefig() does not create missing directories, so make sure the target exists
Path('outputs').mkdir(parents=True, exist_ok=True)
plt.savefig('outputs/buy_sell_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# The status emoji was stored mis-encoded; emit the intended check mark.
print("\n✅ Chart saved to outputs/buy_sell_distribution.png")

In [None]:
# Distribution of Closed PnL
# Plot only real numbers: a NaN in the data makes the histogram's automatic
# bin range non-finite, so missing values are excluded up front.
pnl_values = merged_df['Closed PnL'].dropna()

plt.figure(figsize=(12, 6))

# Left panel: histogram with a break-even marker at zero
plt.subplot(1, 2, 1)
plt.hist(pnl_values, bins=50, color='skyblue', edgecolor='black')
plt.title('Distribution of Closed PnL', fontsize=12, fontweight='bold')
plt.xlabel('Closed PnL')
plt.ylabel('Frequency')
plt.axvline(0, color='red', linestyle='--', label='Break-even')
plt.legend()

# Right panel: boxplot of the same values
plt.subplot(1, 2, 2)
plt.boxplot(pnl_values, vert=True)
plt.title('Boxplot of Closed PnL', fontsize=12, fontweight='bold')
plt.ylabel('Closed PnL')
plt.axhline(0, color='red', linestyle='--')

plt.tight_layout()

# savefig() does not create missing directories, so make sure the target exists
Path('outputs').mkdir(parents=True, exist_ok=True)
plt.savefig('outputs/pnl_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# The status emoji was stored mis-encoded; emit the intended check mark.
print("\n✅ Chart saved to outputs/pnl_distribution.png")

In [None]:
# Headline numbers for the merged trades: volume, PnL totals, win/loss split
print("=== Summary Statistics ===")

closed_pnl = merged_df['Closed PnL']
n_trades = len(merged_df)
print(f"Total Trades: {n_trades:,}")
print(f"Total PnL: ${closed_pnl.sum():,.2f}")
print(f"Average PnL per Trade: ${closed_pnl.mean():,.2f}")
print(f"Median PnL: ${closed_pnl.median():,.2f}")

# A trade counts as profitable when PnL > 0 and losing when PnL < 0;
# trades with exactly zero PnL fall into neither bucket
n_winners = (closed_pnl > 0).sum()
n_losers = (closed_pnl < 0).sum()
print(f"Profitable Trades: {n_winners:,} ({n_winners/n_trades*100:.2f}%)")
print(f"Losing Trades: {n_losers:,} ({n_losers/n_trades*100:.2f}%)")

## Step 9: Save Cleaned and Merged Dataset

In [None]:
# Save the merged dataset for use in notebook_2, plus the cleaned individual datasets.
# to_csv() does not create missing directories, so make sure the target exists.
output_dir = Path('csv_files')
output_dir.mkdir(parents=True, exist_ok=True)

# File name -> frame to write; the same mapping drives the report below so
# the printed list always matches what was actually written
datasets_to_save = {
    'merged_trader_sentiment.csv': merged_df,
    'cleaned_fear_greed.csv': fear_greed_df,
    'cleaned_trader_data.csv': trader_df,
}
for file_name, frame in datasets_to_save.items():
    frame.to_csv(output_dir / file_name, index=False)

# The status emoji were stored mis-encoded; emit the intended symbols.
print("✅ All cleaned datasets saved to csv_files/ folder!")
print("\n📋 Files created:")
for file_name in datasets_to_save:
    print(f"   - csv_files/{file_name}")

## Step 10: Data Summary Report

In [None]:
# Final recap of what this notebook produced.
n_merged = len(merged_df)

if n_merged:
    first_day = merged_df['date'].min().strftime('%Y-%m-%d')
    last_day = merged_df['date'].max().strftime('%Y-%m-%d')
    date_range = f"{first_day} to {last_day}"
    # Win rate = share of trades with strictly positive closed PnL
    win_rate = f"{(merged_df['Closed PnL'] > 0).sum()/n_merged*100:.2f}%"
else:
    # An empty merge has no dates to format (min() is NaT, which cannot be
    # passed through strftime) and no meaningful win rate
    date_range = "n/a"
    win_rate = "n/a"

# The emoji in these messages were stored mis-encoded; emit the intended symbols.
print("=" * 60)
print("📊 NOTEBOOK 1 SUMMARY - DATA CLEANING & EDA COMPLETE")
print("=" * 60)
print(f"\n✅ Fear/Greed Dataset: {len(fear_greed_df)} days of sentiment data")
print(f"✅ Trader Dataset: {len(trader_df):,} total trades")
print(f"✅ Merged Dataset: {n_merged:,} trades with sentiment labels")
print(f"\n📅 Date Range: {date_range}")
print("\n💰 Overall Trading Performance:")
print(f"   Total PnL: ${merged_df['Closed PnL'].sum():,.2f}")
print(f"   Win Rate: {win_rate}")
print("\n🎯 Next Step: Proceed to notebook_2.ipynb for sentiment analysis")
print("=" * 60)