In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' theme to all plots drawn after this cell.
# set_theme() is the preferred interface; sns.set() is only an alias for it.
sns.set_theme(style='whitegrid')

In [None]:
# Variable-definitions workbook, loaded only as a reference table.
# The bare name on the last line makes the table the cell's displayed output.
var_defs = pd.read_excel(io='../Xente_Variable_Definitions (1).xlsx')
var_defs

In [None]:
## Data Overview

Load the dataset and display its structure.

In [None]:
# Read the dataset from the parent directory into `df`, which the rest of
# the notebook works on.
df = pd.read_csv(filepath_or_buffer='../data.csv')

# info() prints its summary of the frame; the trailing head() is the cell's
# displayed value (first five rows).
df.info()
df.head(5)

In [None]:
## Summary Statistics

Summarize the central tendency, dispersion, and shape of each column's distribution.

In [None]:
# include='all' summarizes every column, not only the numeric ones.
summary_stats = df.describe(include='all')
summary_stats

In [None]:
## Distribution of Numerical Features

Visualize the distribution of numerical features.

In [None]:
# Names of the numeric columns; the correlation and boxplot cells further
# down reuse `num_cols`.
num_cols = df.select_dtypes(include='number').columns

# One 30-bin histogram per numeric column, all on a single 15x10 figure.
df.loc[:, num_cols].hist(bins=30, figsize=(15, 10))
plt.gcf().suptitle('Numerical Feature Distributions')
plt.show()

In [None]:
## Distribution of Categorical Features

Analyze the distribution of categorical features.

In [None]:
# Bar chart of value frequencies for each object-dtype (text) column.
#
# Object columns may include identifier-like fields with a very large number
# of distinct values; drawing one bar per value is slow and unreadable.
# Each chart is therefore limited to the MAX_CATEGORIES most frequent values,
# and the title says so whenever a column was truncated.
MAX_CATEGORIES = 20

cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    # value_counts() sorts by frequency (descending), so head() keeps the
    # most common values.
    counts = df[col].value_counts()
    n_unique = len(counts)
    top_counts = counts.head(MAX_CATEGORIES)

    fig, ax = plt.subplots(figsize=(8, 4))
    top_counts.plot(kind='bar', ax=ax)
    if n_unique > MAX_CATEGORIES:
        ax.set_title(
            f'Distribution of {col} (top {MAX_CATEGORIES} of {n_unique} values)'
        )
    else:
        ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    plt.show()
    # Release the figure so a long list of columns does not accumulate
    # open figures in memory.
    plt.close(fig)

In [None]:
## Correlation Analysis

Visualize correlations between numerical features.

In [None]:
# Pairwise correlations between the numeric columns (DataFrame.corr with its
# default settings), drawn as an annotated heatmap with two-decimal labels.
corr_matrix = df[num_cols].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
## Identifying Missing Values

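Count the missing values in each column and list the columns that contain any. Note: the code cell for this check appears after the "Outlier Detection" heading below.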

In [None]:
## Outlier Detection

Box plots for numerical features to identify outliers.

In [None]:
# Missing-value check (belongs to the "Identifying Missing Values" section):
# count null entries per column, then show only the columns that have at
# least one, largest count first.
missing = df.isna().sum()
missing.loc[missing.gt(0)].sort_values(ascending=False)

In [None]:
# Outlier check: one box plot per numeric column, each on its own 8x4 figure.
for feature in num_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(f'Boxplot of {feature}')
    plt.show()