# Data Exploration Notebook

This notebook explores the transaction dataset to understand its structure, identify patterns, and prepare a cleaned copy of the data for model training.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style to every plot drawn after this cell.
# set_theme() is the preferred interface; sns.set() is only an alias for it
# that the seaborn documentation says may be removed in the future.
sns.set_theme(style='whitegrid')

# Load the transaction dataset (the path is resolved relative to the
# kernel's working directory)
data = pd.read_csv('../data/transactions.csv')

# Display the first few rows of the dataset
data.head()

In [2]:
# Descriptive statistics of the dataset, computed with describe()'s default
# settings; the bare name on the last line renders the resulting table.
summary_stats = data.describe()
summary_stats

In [3]:
# Count the missing entries in each column ...
missing_values = data.isna().sum()
# ... and show only the columns that actually have at least one
missing_values.loc[missing_values.gt(0)]

In [4]:
# Histogram of the 'amount' column (50 bins) with a KDE curve overlaid,
# drawn on an explicitly created 10x6 figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['amount'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Transaction Amounts')
ax.set_xlabel('Amount')
ax.set_ylabel('Frequency')
plt.show()

In [5]:
# Correlation matrix
# Correlation is only defined for numeric data. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# when it meets one (e.g. a string column), so restrict the frame to
# numeric and boolean columns explicitly. Selecting by dtype works on every
# pandas version, unlike the newer `numeric_only=` keyword.
numeric_data = data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_data.corr()

# Compute first, plot second: if the step above fails, no blank figure is
# left behind in the cell output.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [6]:
# Example cleaning step: discard every row that has at least one missing
# value, then write the result to CSV (without the index column) for
# model training.
cleaned_data = data.dropna(axis=0, how='any')
cleaned_data.to_csv(
    path_or_buf='../data/cleaned_transactions.csv',
    index=False,
)