In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's white-grid theme to every figure drawn in this notebook
PLOT_STYLE = "whitegrid"
sns.set(style=PLOT_STYLE)


In [None]:
# Load the original dataset
df = pd.read_csv('train (3).csv')  # Update path if needed

# Check missing values
print("Missing values before cleaning:")
print(df.isnull().sum())

# Handle missing values
# For numeric columns -> fill with median
# (a column with no observed values has a NaN median, so it is left unchanged
# and will be reported by the check at the end of this cell)
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].apply(lambda x: x.fillna(x.median()))

# For categorical columns -> fill with mode
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() returns an empty Series when every value in the column is missing;
    # indexing it would raise, so skip such columns instead of crashing.
    if modes.empty:
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# Confirm no missing values remain
remaining_missing = df.isnull().sum()
print("\nMissing values after cleaning:")
print(remaining_missing)

# Explicitly flag anything the two fill strategies above could not handle
# (entirely-empty columns, or dtypes that are neither numeric nor object).
still_missing = remaining_missing[remaining_missing > 0]
if not still_missing.empty:
    print("\nWarning: columns that still contain missing values:")
    print(still_missing.index.tolist())


In [None]:
# Structural overview of the cleaned frame (df.info() writes its report directly to stdout)
info_heading = "\nDataset Information:"
print(info_heading)
df.info()


In [None]:
# Summary statistics produced by DataFrame.describe()
summary_stats = df.describe()
print("\nStatistical Description:", summary_stats, sep="\n")


In [None]:
# Frequency table for every object-dtype (categorical) column
print("\nValue Counts for Categorical Variables:")
categorical_frame = df.select_dtypes(include=['object'])
for column_name, column_values in categorical_frame.items():
    print(f"\nColumn: {column_name}")
    print(column_values.value_counts())


In [None]:
# One histogram per numeric column, drawn on a shared figure
hist_options = {"figsize": (15, 10), "bins": 30, "edgecolor": 'black'}
df.hist(**hist_options)
plt.suptitle('Histograms of Numerical Features', fontsize=16)
plt.show()


In [None]:
# Boxplots for Outliers
num_cols = df.select_dtypes(include=[np.number]).columns

# Size the subplot grid from the number of numeric columns. A fixed 3x3 grid
# makes plt.subplot raise as soon as there are more than 9 columns; here the
# grid keeps the original 3 rows as a minimum and grows only when needed.
grid_cols = 3
grid_rows = max(3, -(-len(num_cols) // grid_cols))  # ceiling division

# Keep the original 15x10 canvas for a 3-row grid and scale the height
# proportionally when extra rows are added.
plt.figure(figsize=(15, 10 * grid_rows / 3))
for i, col in enumerate(num_cols, 1):
    plt.subplot(grid_rows, grid_cols, i)
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()


In [None]:
# Pairwise relationship grid across the frame's columns
sns.pairplot(data=df)
# Title is placed slightly above the grid (y > 1)
plt.gcf().suptitle('Pairplot of Features', y=1.02)
plt.show()


In [None]:
# Correlation Matrix Heatmap
# Correlation is only defined for numeric data. Since pandas 2.0,
# DataFrame.corr() defaults to numeric_only=False and raises when the frame
# holds string columns, so select the numeric columns explicitly (the same
# np.number selection the other cells in this notebook use).
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix Heatmap')
plt.show()


# Final Summary
- Handled missing values without dropping data (numeric: filled with median, categorical: filled with mode).
- Verified data types and missing values.
- Explored distributions with histograms.
- Detected outliers using boxplots.
- Analyzed feature relationships with pairplots.
- Visualized feature correlations using a heatmap.


In [None]:

# Outlier check: one boxplot per numeric column on a grid sized to fit them all
num_cols = df.select_dtypes(include=[np.number]).columns
n_cols = 3
n_rows = int(np.ceil(len(num_cols) / n_cols))

# Each grid cell gets a 5x4 inch share of the figure
fig_width, fig_height = n_cols * 5, n_rows * 4
plt.figure(figsize=(fig_width, fig_height))

for position, column_name in enumerate(num_cols, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.boxplot(x=df[column_name])
    plt.title(f'Boxplot of {column_name}')

plt.tight_layout()
plt.show()


In [None]:

# Scatterplots to check relationships between important numeric columns
important_cols = ['magnitude', 'depth', 'latitude', 'longitude']  # Example columns

# The names above are only examples and may not exist in the loaded dataset.
# Plot the ones that are present and report the rest, rather than failing
# with a KeyError on the first missing column.
available_cols = [name for name in important_cols if name in df.columns]
missing_cols = [name for name in important_cols if name not in df.columns]
if missing_cols:
    print(f"Skipping columns not found in the dataset: {missing_cols}")

# One figure per unordered pair of available columns
for i, x_col in enumerate(available_cols):
    for y_col in available_cols[i + 1:]:
        plt.figure(figsize=(6, 4))
        sns.scatterplot(x=df[x_col], y=df[y_col])
        plt.title(f'Scatterplot: {x_col} vs {y_col}')
        plt.xlabel(x_col)
        plt.ylabel(y_col)
        plt.show()
