In [None]:
# Cell 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set seaborn style
# Applies the 'whitegrid' theme globally, so every plot drawn in later cells
# picks it up without further configuration.
sns.set(style='whitegrid')


In [None]:
# Cell 2: Load Data
# Location of the processed SEC filings table, relative to the notebook's
# working directory.
processed_data_path = '../data/processed/sec_filings_processed.csv'

# Fail fast when the file is missing: every later cell reads `df`, so carrying
# on without it would only resurface as a confusing NameError further down.
if not os.path.exists(processed_data_path):
    raise FileNotFoundError(f"Data file not found at {processed_data_path}")

df = pd.read_csv(processed_data_path)
print(f"Data loaded successfully from {processed_data_path}")


In [None]:
# Cell 3: Preview Data
# Display the first rows of `df` as the cell output (head() with no argument
# returns 5 rows).
df.head()


In [None]:
# Cell 4: Summary Statistics
# include='all' summarises every column rather than only the numeric ones;
# statistics that do not apply to a column's dtype are reported as NaN.
df.describe(include='all')


In [None]:
# Cell 5: Check for Missing Values
# Tally the null entries per column, then print the heading and the tally on
# separate lines.
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")


In [None]:
# Cell 6: Data Types and Info
# Prints each column's dtype and non-null count, plus the frame's memory usage.
df.info()


In [None]:
# Cell 7: Distribution of Numerical Features
# Columns stored as 64-bit floats/ints; this selection is reused by the
# outlier box plot in Cell 13.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

if len(numerical_columns) == 0:
    # DataFrame.hist raises when there is nothing numeric to plot.
    print("No numerical columns found in the dataset.")
else:
    # Size the grid from the column count: a fixed 5x3 layout raises as soon
    # as there are more than 15 numerical columns.
    hist_cols = 3
    hist_rows = int(np.ceil(len(numerical_columns) / hist_cols))

    # DataFrame.hist creates its own figure, so no plt.figure() call is made
    # beforehand (it would only leave an empty figure in the output).
    df[numerical_columns].hist(
        bins=20,
        figsize=(14, max(4, 2.5 * hist_rows)),
        layout=(hist_rows, hist_cols),
    )
    plt.suptitle('Distribution of Numerical Features', fontsize=16)
    plt.show()


In [None]:
# Cell 8: Distribution of Categorical Features
# Object-dtype columns are treated as categorical.
categorical_columns = df.select_dtypes(include=['object']).columns

# Upper bound on bars per chart; columns with more distinct values than this
# are truncated to their most frequent values so the chart stays readable.
MAX_CATEGORIES_SHOWN = 20

if len(categorical_columns) == 0:
    print("No categorical columns found in the dataset.")
else:
    # Size the grid from the column count: a fixed 3x3 grid raises on the
    # tenth categorical column.
    grid_cols = 3
    grid_rows = int(np.ceil(len(categorical_columns) / grid_cols))
    fig, axes = plt.subplots(
        grid_rows, grid_cols, figsize=(14, 4 * grid_rows), squeeze=False
    )
    flat_axes = axes.ravel()

    for ax, column in zip(flat_axes, categorical_columns):
        counts = df[column].value_counts()
        title = f'Distribution of {column}'

        if counts.empty:
            # An all-null column has nothing to draw; pandas would raise.
            ax.set_title(f'{title} (no values)')
            continue

        if len(counts) > MAX_CATEGORIES_SHOWN:
            # value_counts() is sorted by frequency, so head() keeps the
            # most common values.
            counts = counts.head(MAX_CATEGORIES_SHOWN)
            title = f'{title} (top {MAX_CATEGORIES_SHOWN})'

        counts.plot(kind='bar', ax=ax, rot=45)
        ax.set_title(title)

    # Hide the grid cells left over when the column count is not a multiple
    # of the grid width.
    for ax in flat_axes[len(categorical_columns):]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()


In [None]:
# Cell 9: Correlation Analysis
# Correlate numeric columns only. Since pandas 2.0, DataFrame.corr() no longer
# drops non-numeric columns by default and raises on string data, so the
# selection is made explicitly here (this also works on older pandas).
correlation_matrix = df.select_dtypes(include='number').corr()

if correlation_matrix.empty:
    # A heatmap of an empty matrix raises; report the situation instead.
    print("No numerical columns available for correlation analysis.")
else:
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
    plt.title('Correlation Matrix of Numerical Features')
    plt.show()


In [None]:
# Cell 10: Feature Analysis Based on Target Variable
# Example: Analyzing a specific feature based on target variable
target_column = 'target'
# Placeholder name: replace with a real numerical column from the dataset.
feature_column = 'some_numerical_feature'

# Check both columns before plotting; testing only for the target would let
# the plot call fail on the missing feature column.
if target_column not in df.columns:
    print("Target variable not found in the dataset.")
elif feature_column not in df.columns:
    print(f"Feature '{feature_column}' not found in the dataset.")
else:
    sns.boxplot(x=target_column, y=feature_column, data=df)
    plt.title(f'Distribution of {feature_column} by {target_column}')
    plt.show()


In [None]:
# Cell 11: Term Frequency Visualization
term_columns = df.columns[1:]  # Assuming the first column is CIK and others are terms

# Sum numeric columns only: if any remaining column holds text, a plain sum()
# would concatenate or reject the strings and the sort below would fail.
# NOTE(review): any non-term numeric column (e.g. a target) is still included
# here - confirm the column layout.
term_counts = df[term_columns].sum(numeric_only=True).sort_values(ascending=False)

if term_counts.empty:
    print("No numeric term columns found in the dataset.")
else:
    plt.figure(figsize=(14, 6))
    sns.barplot(x=term_counts.index, y=term_counts.values)
    plt.title('Frequency of Key Terms in SEC Filings')
    plt.xticks(rotation=90)
    plt.show()


In [None]:
# Cell 12: Advanced Text Analysis (Optional)
# Example: Word Cloud of Terms
# Imported here rather than in the first cell because this cell is optional
# and `wordcloud` is only needed when it is run.
from wordcloud import WordCloud

# Weight each term by its total count across filings. Joining the column
# names into one string would give every term the same weight, so the cloud
# would carry no frequency information.
term_totals = df[term_columns].sum(numeric_only=True)
term_frequencies = {
    str(term): float(total)
    for term, total in term_totals.items()
    if total > 0  # drops zero, negative and NaN totals, which cannot be sized
}

if not term_frequencies:
    # generate_from_frequencies cannot build a cloud from an empty mapping.
    print("No term frequencies available for the word cloud.")
else:
    wordcloud = WordCloud(
        width=800, height=400, background_color='white'
    ).generate_from_frequencies(term_frequencies)

    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Key Terms in SEC Filings')
    plt.show()


In [None]:
# Cell 13: Outliers Detection
# One box per numerical column on a shared axis; points drawn beyond the
# whiskers are the candidate outliers.
outlier_fig, outlier_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df[numerical_columns], ax=outlier_ax)
outlier_ax.set_title('Box Plot for Outliers Detection')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Cell 14: Conclusion and Next Steps
# The notes below are a bare string expression, not executable analysis. As the
# last expression of a code cell, it is echoed back as the cell's output.
# TODO: the bracketed placeholders ([describe insights], [describe findings],
# [describe correlations]) still need to be filled in with actual results.
"""
**Key Findings from EDA**:
- Summary statistics reveal that [describe insights].
- Visualizations show that [describe findings].
- Correlation analysis indicates that [describe correlations].

**Next Steps**:
- Feature engineering based on identified patterns and correlations.
- Further refinement of text processing and pattern detection.
- Model training and evaluation using processed data.
"""
