# Exploratory Data Analysis

This notebook performs exploratory data analysis (EDA) on the processed dataset. The goal is to understand the data through summary statistics and visualizations of its distributions and the relationships between variables.

In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
# Apply the 'whitegrid' theme to the figures drawn below. set_theme() is the
# preferred interface; sns.set() is only an alias for it and may be removed.
sns.set_theme(style='whitegrid')

In [2]:
# Read the processed CSV into a DataFrame (path is relative to this notebook)
processed_csv = '../data/processed/your_processed_data.csv'
data = pd.read_csv(processed_csv)

# Preview the leading rows as the cell output
data.head()

In [3]:
# Compute descriptive statistics, then show the table as the cell output
summary_stats = data.describe()
summary_stats

In [4]:
# Histogram of one column (30 bins, KDE curve overlaid) on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['your_column_name'], bins=30, kde=True, ax=ax)

# Label the figure so it can be read on its own
ax.set_title('Distribution of Your Column')
ax.set_xlabel('Your Column Name')
ax.set_ylabel('Frequency')
plt.show()

In [5]:
# Scatter plot of two columns against each other, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='column_x', y='column_y', ax=ax)

# Title and axis labels applied in a single call
ax.set(
    title='Scatter Plot between Column X and Column Y',
    xlabel='Column X',
    ylabel='Column Y',
)
plt.show()

In [6]:
# Correlation matrix
# Keep only numeric and boolean columns before correlating: DataFrame.corr()
# raises on text/categorical columns in pandas >= 2.0, where numeric_only
# defaults to False. select_dtypes() also works on older pandas releases.
numeric_data = data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_data.corr()

# Compute first, draw second, so a failure above leaves no empty figure behind.
plt.figure(figsize=(12, 8))
# annot/fmt print each coefficient in its cell, rounded to two decimals.
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()