In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display settings: never truncate the column list when a
# DataFrame is rendered, and use the seaborn-flavoured matplotlib theme.
pd.options.display.max_columns = None
plt.style.use("seaborn-v0_8")

In [None]:
# 1 Load Dataset
# Read the CSV (path is relative to the working directory), report its
# dimensions and preview the first rows.

csv_path = "Dataset/dataset-uci.csv"
df = pd.read_csv(csv_path)
print("Shape:", df.shape)

df.head()

In [None]:
# 2 Basic Information
# Print the frame's summary (columns, dtypes, non-null counts, memory usage).

df.info()

In [None]:
# 3 Descriptive Statistics
# Summary statistics, transposed so that each feature occupies one row.

summary_stats = df.describe()
summary_stats.transpose()

In [None]:
# 4 Target Distribution
# Class balance of the target, as a bar chart and as proportions.
# NOTE(review): the axis label assumes 0 = no gallstone, 1 = gallstone —
# confirm against the dataset documentation.

# value_counts() orders by frequency, so sort by label to pin the bar order
# (and therefore each class's colour): the first bar / skyblue is always the
# smaller label, the second bar / salmon the larger one.
class_counts = df['Gallstone Status'].value_counts().sort_index()

plt.figure(figsize=(5,5))
class_counts.plot(kind='bar', color=['skyblue', 'salmon'])
plt.title("Distribution of Gallstone vs Non-Gallstone")
plt.xlabel("Gallstone (0 = No, 1 = Yes)")
plt.ylabel("Count")
plt.show()

# Class proportions, listed in the same label order as the chart.
df['Gallstone Status'].value_counts(normalize=True).sort_index()

In [None]:
# 5 Check Missing Values
# Number of missing entries in each column.

df.isna().sum()

In [None]:
# 6 Correlation Heatmap
# Pairwise correlation (pandas' default method) between the numeric columns,
# drawn as a heatmap with the colour scale centred on zero.

# numeric_only=True: since pandas 2.0, DataFrame.corr() raises on non-numeric
# columns instead of silently dropping them.
corr = df.corr(numeric_only=True)

plt.figure(figsize=(16, 14))
sns.heatmap(corr, cmap='coolwarm', center=0)
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# 7 Correlation with Target
# Correlation of every numeric column with the target, strongest positive
# first. The target's own entry is kept here; later cells drop it.

# numeric_only=True: since pandas 2.0, DataFrame.corr() raises on non-numeric
# columns instead of silently dropping them.
target_corr = df.corr(numeric_only=True)['Gallstone Status'].sort_values(ascending=False)
target_corr

In [None]:
# 8 Correlation of Every Feature with the Target
# Horizontal bars for all features (not a top-10 subset), sorted ascending
# by correlation value; the target's self-correlation is excluded.

feature_corr = target_corr.drop('Gallstone Status').sort_values()

fig, ax = plt.subplots(figsize=(8,10))
feature_corr.plot(kind='barh', ax=ax)
ax.set_title("Feature Correlation with Gallstone")
ax.set_xlabel("Correlation")
plt.show()

In [None]:
# 9 Boxplots for Top 10 Most Correlated Features
# Rank features by absolute correlation with the target, then draw one
# boxplot per feature, split by target class.

abs_corr = target_corr.drop("Gallstone Status").abs()
top10 = abs_corr.sort_values(ascending=False).index[:10].tolist()

for feature in top10:
    fig, ax = plt.subplots(figsize=(6,4))
    sns.boxplot(x="Gallstone Status", y=feature, data=df, ax=ax)
    ax.set_title(f"{feature} vs Gallstone Status")
    plt.show()

In [None]:
# 10 Distribution Plots
# Histogram grid (20 bins each) for every numeric column except the target.

numeric_cols = list(df.select_dtypes(include=np.number).columns)
numeric_cols.remove("Gallstone Status")

numeric_features = df[numeric_cols]
numeric_features.hist(bins=20, figsize=(18, 16))
# y=1.02 places the overall title just above the top of the figure.
plt.suptitle("Distribution of Numeric Features", y=1.02)
plt.show()

In [None]:
# 11 Group Averages (Clinical Insight)
# Per-class mean of every numeric feature, transposed to one row per feature,
# displayed sorted by the "Gallstones" column (largest first).

# numeric_only=True: since pandas 2.0, GroupBy.mean() raises on non-numeric
# columns instead of dropping them.
group_means = df.groupby("Gallstone Status").mean(numeric_only=True).T
# groupby sorts its keys, so the columns arrive as (smaller label, larger label).
# NOTE(review): the names below assume exactly two classes with
# 0 = no gallstones and 1 = gallstones — confirm against the dataset docs.
group_means.columns = ["No Gallstones", "Gallstones"]
group_means.sort_values("Gallstones", ascending=False)

In [None]:
# 12 Differences in Means
# Per-feature difference between the two class means ("Gallstones" minus
# "No Gallstones"), sorted ascending and drawn as horizontal bars.
# The differences are raw (no scaling is applied), so each bar is in its
# feature's own units.

mean_diff = (group_means["Gallstones"] - group_means["No Gallstones"]).sort_values()
plt.figure(figsize=(8,12))
mean_diff.plot(kind="barh")
# \u2212 is the Unicode minus sign; written as an escape so the title cannot be
# corrupted by a file-encoding round trip.
plt.title("Mean Difference (Gallstone \u2212 No Gallstone)")
plt.xlabel("Difference")
plt.show()

In [None]:
# 13 Feature Variance
# Variance of every numeric column, largest first.

# numeric_only=True: since pandas 2.0, DataFrame.var() raises on non-numeric
# columns instead of silently dropping them.
variances = df.var(numeric_only=True).sort_values(ascending=False)
variances