# Data Analysis Project

This notebook provides a structured approach to data analysis including:
- Data loading and preprocessing
- Exploratory data analysis
- Data visualization
- Statistical analysis
- Results and insights

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Silence only deprecation-style noise. A blanket filterwarnings('ignore')
# would also hide warnings that can point at real problems in an analysis
# (for example a NumPy RuntimeWarning about invalid values).
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Set up plotting style: matplotlib's default style sheet plus seaborn's
# "husl" color palette for all subsequent plots.
plt.style.use('default')
sns.set_palette("husl")

# Report the library versions so the results can be traced to an environment.
print("Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

## Data Loading and Preparation

Load your own dataset in the cell below (for example with `pd.read_csv`), or keep the default, which generates a reproducible sample dataset for demonstration.

In [None]:
# Load or create your dataset here
# Example: df = pd.read_csv('your_data.csv')

# Demonstration data: seed the global NumPy generator so that every run of
# this cell produces the same synthetic values.
np.random.seed(42)
n_samples = 1000

# Draw the random columns one at a time, in a fixed order
# (value -> category -> metric). They all consume the same global random
# stream, so the order of the draws determines the generated numbers.
sample_dates = pd.date_range('2024-01-01', periods=n_samples, freq='D')
sample_values = np.random.normal(100, 15, n_samples)
sample_categories = np.random.choice(['A', 'B', 'C', 'D'], n_samples)
sample_metrics = np.random.exponential(50, n_samples)

# Assemble the columns into the frame used by the rest of the notebook.
data = dict(
    date=sample_dates,
    value=sample_values,
    category=sample_categories,
    metric=sample_metrics,
)
df = pd.DataFrame(data)

print(f"Dataset created with {len(df)} rows and {len(df.columns)} columns")
print("\nFirst few rows:")
df.head()

## Exploratory Data Analysis

Examine the structure and characteristics of the data.

In [None]:
# Data overview: column structure, summary statistics, and missing values.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Summary statistics as produced by DataFrame.describe().
print("\nDataset Description:")
print(df.describe())

# Count of null entries per column.
print("\nMissing Values:")
print(df.isnull().sum())

## Data Visualization

Create visualizations to understand patterns and relationships in the data.

In [None]:
# Create visualizations: a 2x2 grid with the time series, the category counts,
# the value histogram, and the per-category box plot.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Time series plot
axes[0, 0].plot(df['date'], df['value'])
axes[0, 0].set_title('Value Over Time')
axes[0, 0].set_xlabel('Date')
axes[0, 0].set_ylabel('Value')
# Rotate the date tick labels so they do not overlap.
axes[0, 0].tick_params(axis='x', rotation=45)

# Category distribution (bars are ordered by count, as value_counts returns them)
df['category'].value_counts().plot(kind='bar', ax=axes[0, 1])
axes[0, 1].set_title('Category Distribution')
axes[0, 1].set_xlabel('Category')
axes[0, 1].set_ylabel('Count')

# Value distribution
axes[1, 0].hist(df['value'], bins=30, alpha=0.7)
axes[1, 0].set_title('Value Distribution')
axes[1, 0].set_xlabel('Value')
axes[1, 0].set_ylabel('Frequency')

# Box plot by category
df.boxplot(column='value', by='category', ax=axes[1, 1])
# pandas' grouped boxplot sets a figure-level suptitle ("Boxplot grouped by
# category"). On a multi-panel figure that heading would sit above all four
# plots, so clear it and rely on the per-panel titles instead.
fig.suptitle('')
axes[1, 1].set_title('Value by Category')
axes[1, 1].set_xlabel('Category')
axes[1, 1].set_ylabel('Value')

plt.tight_layout()
plt.show()

## Statistical Analysis

Compute the correlations between the numeric columns and summary statistics of `value` for each category.

In [None]:
# Statistical analysis: pairwise correlations of the numeric columns,
# followed by summary statistics of `value` within each category.
print("Correlation Matrix:")
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df.loc[:, numeric_cols].corr()
print(correlation_matrix)

# Draw the correlation heatmap on an explicitly created figure and axes.
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix')
plt.show()

# Per-category mean, standard deviation, minimum and maximum of `value`.
print("\nStatistics by Category:")
summary_functions = ['mean', 'std', 'min', 'max']
category_stats = df.groupby('category')['value'].agg(summary_functions)
print(category_stats)

## Results and Insights

Summarize key findings and insights from the analysis.

In [None]:
# Summary and insights: headline numbers, one bullet per finding.
print("Key Findings:")

# Collect the bullet lines first, then emit them one per line.
key_findings = (
    f"• Total records analyzed: {len(df):,}",
    f"• Date range: {df['date'].min()} to {df['date'].max()}",
    f"• Average value: {df['value'].mean():.2f}",
    f"• Value standard deviation: {df['value'].std():.2f}",
    f"• Number of categories: {df['category'].nunique()}",
    # mode() returns a Series of the most frequent values; take the first.
    f"• Most common category: {df['category'].mode()[0]}",
)
print(*key_findings, sep="\n")

# Additional analysis can be added here
print("\nAnalysis complete!")