In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme once, up front, so the plots in the
# cells below share the same look.
sns.set(style="whitegrid")


In [None]:
# Read the training CSV into `df`, the frame that the cells below analyse.
# NOTE(review): this is an absolute, machine-specific path -- consider moving
# it to a configurable data directory so the notebook runs elsewhere.
df = pd.read_csv(filepath_or_buffer='/mnt/data/train (3).csv')

# Structural overview: column names, dtypes and non-null counts.
print("\nDataset Information:")
df.info()


In [None]:
# Descriptive statistics from DataFrame.describe(), printed under a heading.
# `sep="\n"` puts the heading and the table on separate lines, exactly as two
# consecutive print() calls would.
print("\nStatistical Description:", df.describe(), sep="\n")


In [None]:
# Frequency table for every object-dtype (string-like) column.
print("\nValue Counts for Categorical Variables:")
categorical_names = df.select_dtypes(include=['object']).columns
for name in categorical_names:
    print(f"\nColumn: {name}")
    counts = df[name].value_counts()
    print(counts)


In [None]:
# Distribution overview: DataFrame.hist draws one histogram panel per column
# on a single figure, which then gets a shared title.
hist_options = dict(bins=30, edgecolor='black', figsize=(15, 10))
df.hist(**hist_options)
plt.suptitle('Histograms of Numerical Features', fontsize=16)
plt.show()


In [None]:
# Boxplots to check for outliers: one panel per numeric column.
num_cols = df.select_dtypes(include=[np.number]).columns

# Size the subplot grid from the number of numeric columns. A fixed 3x3 grid
# makes plt.subplot raise ValueError as soon as there are more than 9 columns.
# Keeping a minimum of 3 rows preserves the original 3x3 layout (and the
# 15x10 figure) for datasets with up to 9 numeric columns.
n_grid_cols = 3
n_grid_rows = max(3, (len(num_cols) + n_grid_cols - 1) // n_grid_cols)

# Grow the figure height with the row count so panels keep their size.
plt.figure(figsize=(15, 10 * n_grid_rows / 3))
for i, col in enumerate(num_cols, 1):  # subplot indices are 1-based
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()


In [None]:
# Scatter-matrix view of the dataset, built by seaborn's pairplot.
sns.pairplot(data=df)

# y slightly above 1 lifts the title clear of the top row of panels.
plt.suptitle('Pairplot of Features', y=1.02)
plt.show()


In [None]:
# Correlation heatmap of the numeric features.
plt.figure(figsize=(12, 8))

# Keep only numeric/bool columns before correlating. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# string data; selecting the columns explicitly works on every pandas version
# and matches what older versions correlated by default.
correlation_matrix = df.select_dtypes(include=[np.number, 'bool']).corr()

# annot/fmt print each coefficient in its cell, rounded to two decimals.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix Heatmap')
plt.show()


# Final Summary
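- Data types were verified with `df.info()`; missing values, if any, appear there as non-null counts below the total row count.
- The distributions of the numerical features were explored with histograms.
- Potential outliers were checked with a boxplot for each numerical feature.
- Pairwise relationships between features were visualized with a pairplot.
- Correlation among the features was analyzed with a correlation-matrix heatmap.
- Key patterns and potential issues were noted from the summaries and plots above.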
