In [None]:
# 1. Setup and Data Loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io

# Location of the training table, kept in one place so it is easy to repoint.
TRAINING_DATA_PATH = 'data_directories/output/training_data.csv'

# Load the full training table; every later cell reads from `df`.
df = pd.read_csv(TRAINING_DATA_PATH)
print("Shape of the dataset:", df.shape)

# Print the preview on its own line: passing the frame as a second argument
# to print() inserts a separator space in front of it, which shifts the
# header row out of alignment with the data rows.
print("\nFirst 5 rows of the dataset:")
print(df.head())

# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare rather than wrapped in print().
print("\nData types and non-null values:\n")
df.info()

In [None]:
# 2. Summary Statistics
# Summary table produced by DataFrame.describe() for the loaded frame.
summary_stats = df.describe()
print("Descriptive Statistics:\n", summary_stats, sep="\n")

In [None]:
# 3. Class Distribution
# Bar chart of how many rows carry each value of `class_label`.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='class_label', ax=ax)
ax.set_title('Distribution of Class Label')
plt.show()


# The same information as numbers: absolute counts, then relative shares.
labels = df['class_label']
label_counts = labels.value_counts()
label_shares = labels.value_counts(normalize=True)

print("Class Value Counts:\n")
print(label_counts)
print("\nClass Proportions:\n")
print(label_shares)

In [None]:
# 4. Correlation Analysis
# Restrict the correlation to numeric/boolean columns. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# if the frame contains any (e.g. a string id or file-name column).
# Selecting the columns up front behaves the same on older and newer pandas,
# and is a no-op when every column is already numeric.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix of Features')
plt.show()


## Key Observations from Correlation Matrix:
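* **`class_label` vs. Features:** Look for features that correlate strongly with the target variable (`class_label`). A high absolute correlation indicates a strong linear relationship with the class, which makes the feature a promising candidate for classification; a low correlation does not rule out a non-linear relationship.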
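* **Inter-Feature Correlation (Multicollinearity):** High correlation between independent features (e.g., `num_points` and `surface_area`) might indicate multicollinearity. This matters for models that estimate one coefficient per feature (e.g., linear or logistic regression), where strongly correlated inputs make the coefficients unstable and hard to interpret.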


In [None]:
# 5. Feature Distribution Visualization (Histograms)
# Every column other than the target is treated as a feature; the name
# `numerical_features` is reused by the box-plot cell further down.
numerical_features = df.drop(columns='class_label').columns
feature_frame = df[numerical_features]

# Histogram grid for the feature columns, drawn on a single figure.
feature_frame.hist(bins=15, figsize=(15, 12))
plt.suptitle('Histograms of Numerical Features', y=1.02)
plt.tight_layout()
plt.show()


## Observations from Histograms:
* **Skewness/Normality:** Check the shape of each feature's distribution. Features like `num_points`, `surface_area`, and `bbox_volume` appear to be highly right-skewed, which suggests the presence of large outliers and the need for a transformation (such as a log transform) before modeling.


In [None]:
# 6. Box Plots by Class
# Size the subplot grid from the number of features instead of assuming
# exactly nine: a fixed 3x3 grid raises IndexError on a tenth feature and
# leaves empty panels when there are fewer than nine.
n_cols = 3
n_features = len(numerical_features)
n_rows = max(1, -(-n_features // n_cols))  # ceiling division, at least one row

# squeeze=False keeps `axes` two-dimensional even for a single row, so the
# flattening below works for any feature count. Five inches of height per row
# reproduces the original 18x15 figure when there are three rows.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False)
fig.suptitle('Box Plots of Features by Class Label', y=1.02)

flat_axes = axes.ravel()
for ax, col in zip(flat_axes, numerical_features):
    sns.boxplot(x='class_label', y=col, data=df, ax=ax)
    ax.set_title(col)
    ax.set_xlabel('Class Label')

# Hide panels left over when the feature count is not a multiple of n_cols.
for ax in flat_axes[n_features:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


# Stand-alone, larger view of one candidate discriminating feature: mean_dist
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(data=df, x='class_label', y='mean_dist', ax=ax)
ax.set_title('mean_dist by Class Label')
plt.show()


## Observations from Box Plots:
* **Feature Separation:** Features like **`mean_dist`** and **`density`** show a clear difference in their median and overall distribution between `class_label` 0 and 1, suggesting they are good discriminators for the classification task.
* **Outliers:** Features such as `max_dist`, `num_points`, `surface_area`, and `bbox_volume` have extreme outliers, especially in one of the classes, which is consistent with the right-skewed distributions seen in the histograms.