# Liver Disease Dataset EDA
This notebook performs exploratory data analysis (EDA) on the liver disease dataset.

In [None]:
# Import Required Libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress all warnings so library deprecation notices do not clutter the
# notebook output. NOTE: this also hides warnings that may matter while
# debugging; comment it out when investigating unexpected results.
warnings.filterwarnings('ignore')

# Set plotting style
# The 'seaborn-v0_8' style name only exists on Matplotlib >= 3.6; older
# releases ship the same sheet as 'seaborn'. Use whichever is installed
# instead of raising on an unknown style name.
plot_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    None,
)
if plot_style is not None:
    plt.style.use(plot_style)
sns.set_palette("husl")

## Load the Dataset
Load the liver disease dataset for analysis.

In [None]:
# Show the contents of the working directory so the dataset filename used
# in the load cell can be checked by eye. `os` is imported in the import
# cell at the top of the notebook. The listing is sorted because
# os.listdir() returns entries in arbitrary order.
print("Files in current directory:")
print(sorted(os.listdir()))

In [None]:
# Load the dataset from the workspace.
# The path is relative to the notebook's working directory.
DATA_PATH = 'Liver Patient Dataset (LPD)_train.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print("Dataset file not found. Please check the file path.")
    # Re-raise: every later cell needs `df`. Swallowing the error here would
    # surface later as a NameError, or silently reuse a stale `df` left in
    # the kernel by an earlier run.
    raise
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise
else:
    # Only report success and preview the data once the read has succeeded.
    print("Dataset loaded successfully!")
    print(f"Dataset shape: {df.shape}")
    display(df.head())

## Dataset Overview
Check the shape, columns, and basic info of the dataset.

In [None]:
# Structural overview: dimensions first, then the column labels, then the
# per-column dtype / non-null report produced by DataFrame.info().
print(f"Dataset Shape: {df.shape}")
print("\nColumn Names:", list(df.columns), sep="\n")
print("\nDataset Info:")
df.info()

In [None]:
# Data types and quality check
print("Data Types:", df.dtypes, sep="\n")

# Cardinality of every column (NaN is not counted as a value).
print("\nUnique values per column:")
for column_name in df.columns:
    distinct_count = df[column_name].nunique()
    print(f"{column_name}: {distinct_count} unique values")

# Headline quality figures: size, total missing cells, fully duplicated rows.
print("\nData Quality Check:")
print(f"Total rows: {df.shape[0]}")
print(f"Total columns: {df.shape[1]}")
print(f"Missing values: {df.isna().sum().sum()}")
print(f"Duplicate rows: {df.duplicated().sum()}")

# Class balance of the 'Gender' column, shown only when that column exists.
if 'Gender' in df.columns:
    print("\nGender distribution:", df['Gender'].value_counts(), sep="\n")

# Descriptive statistics restricted to the numeric columns, as a first look
# at ranges and possible outliers.
print("\nBasic statistics for numeric columns:")
numeric_cols = df.select_dtypes(include='number').columns
if not numeric_cols.empty:
    print(df.loc[:, numeric_cols].describe())

## Statistical Summary
Get a statistical summary of the dataset.

In [None]:
# Descriptive statistics rendered as a rich table.
print("Statistical Summary:")
display(df.describe())

# For object-dtype (categorical) columns, show the frequency of each value.
categorical_cols = df.select_dtypes(include='object').columns
if not categorical_cols.empty:
    print("\nCategorical columns summary:")
    for cat_col in categorical_cols:
        print(f"\n{cat_col}:", df[cat_col].value_counts(), sep="\n")

## Check for Missing Values
Identify missing values in the dataset.

In [None]:
# Missing values analysis
# Per-column count of missing cells and the share of rows they represent.
missing_data = df.isnull().sum()
# max(..., 1) avoids a 0/0 division (NaN percentages) on an empty frame.
missing_percent = (missing_data / max(len(df), 1)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Percentage': missing_percent
}).sort_values('Missing Count', ascending=False)

# Keep only the columns that actually have gaps, most affected first.
columns_with_missing = missing_df[missing_df['Missing Count'] > 0]

if columns_with_missing.empty:
    # Nothing to tabulate or plot; say so instead of printing an empty frame.
    print("No missing values found in the dataset!")
else:
    print("Missing Values Summary:")
    print(columns_with_missing)

    # Bar chart drawn from the sorted frame so it matches the table order.
    plt.figure(figsize=(10, 6))
    columns_with_missing['Missing Count'].plot(kind='bar')
    plt.title('Missing Values by Column')
    plt.xlabel('Columns')
    plt.ylabel('Missing Value Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## Visualize Feature Distributions
Visualize the distributions of two key features: Age and Total Bilirubin.

In [None]:
# Histogram with a KDE overlay for the 'Age' column; if the column is not
# present, report that instead of plotting.
if 'Age' not in df.columns:
    print("Age column not found in the dataset.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['Age'], kde=True, bins=30, ax=ax)
    ax.set(title='Age Distribution', xlabel='Age', ylabel='Frequency')
    ax.grid(True, alpha=0.3)
    plt.show()

In [None]:
# Histogram with a KDE overlay for the 'Total_Bilirubin' column; if the
# column is not present, report that instead of plotting.
if 'Total_Bilirubin' not in df.columns:
    print("Total_Bilirubin column not found in the dataset.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['Total_Bilirubin'], kde=True, bins=30, ax=ax)
    ax.set(
        title='Total Bilirubin Distribution',
        xlabel='Total Bilirubin',
        ylabel='Frequency',
    )
    ax.grid(True, alpha=0.3)
    plt.show()

## Correlation Heatmap
Visualize pairwise correlations between the numeric features and list the most strongly correlated pairs.

In [None]:
# Absolute correlation above which a feature pair is reported as "highly
# correlated". Defined once so the printed header and the filter cannot
# drift apart.
CORR_THRESHOLD = 0.7

# Select only numeric columns for correlation analysis
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_cols) > 1:
    correlation_matrix = df[numeric_cols].corr()

    # Annotated heatmap of the full correlation matrix, colour-centred at 0.
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, fmt='.2f', cbar_kws={'shrink': 0.8})
    plt.title('Feature Correlation Heatmap (Numeric Features Only)')
    plt.tight_layout()
    plt.show()

    # Collect each unordered pair once (upper triangle, diagonal excluded).
    # NaN correlations never satisfy the comparison, so they are skipped.
    feature_names = correlation_matrix.columns
    n_features = len(feature_names)
    strong_pairs = [
        (feature_names[i], feature_names[j], correlation_matrix.iloc[i, j])
        for i in range(n_features)
        for j in range(i + 1, n_features)
        if abs(correlation_matrix.iloc[i, j]) > CORR_THRESHOLD
    ]

    print(f"\nHighly correlated feature pairs (|correlation| > {CORR_THRESHOLD}):")
    if strong_pairs:
        for first, second, corr_val in strong_pairs:
            print(f"{first} - {second}: {corr_val:.3f}")
    else:
        # Make the empty result explicit rather than leaving a bare header.
        print("None found.")
else:
    print("Not enough numeric columns for correlation analysis.")

## Summary
Recap of the dataset's size, data types, missing values, duplicate rows, and memory usage, followed by general recommendations for next steps.

In [None]:
# Recap of the structural facts gathered above.
print("=== EDA SUMMARY ===")
print(f"Dataset Shape: {df.shape}")
print(f"Number of Features: {df.shape[1]}")
print(f"Number of Samples: {df.shape[0]}")
print("\nData Types:", df.dtypes.value_counts(), sep="\n")
print(f"\nMissing Values: {df.isna().sum().sum()}")
print(f"Duplicate Rows: {df.duplicated().sum()}")

# Total in-memory footprint in MiB; deep=True includes the contents of
# object columns rather than just their pointers.
total_bytes = df.memory_usage(deep=True).sum()
print(f"\nMemory Usage: {total_bytes / (1024 * 1024):.2f} MB")

# Generic follow-up steps, printed one per line.
print("\n=== RECOMMENDATIONS ===")
for recommendation in (
    "1. Check for and handle missing values if any",
    "2. Consider feature scaling for machine learning models",
    "3. Look for outliers in numeric features",
    "4. Consider feature engineering based on domain knowledge",
    "5. Ensure proper encoding of categorical variables",
):
    print(recommendation)