In [1]:
import pandas as pd
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the OASIS cross-sectional table and keep only the rows that have
# an age, a normalized brain volume and a CDR rating.
df = (
    pd.read_csv("oasis_cross-sectional.csv")
    .dropna(subset=['Age', 'nWBV', 'CDR'])
)

# Two-level label derived from CDR: any rating above 0 is 'Dementia',
# everything else is 'No Dementia'.
df['Dementia Status'] = df['CDR'].gt(0).map({True: 'Dementia', False: 'No Dementia'})

In [4]:
# Two-level cognitive label from CDR (above 0 -> 'Dementia').
df['dementia_status'] = df['CDR'].gt(0).map({True: 'Dementia', False: 'No Dementia'})

# Histogram of age, with bars coloured by cognitive status.
age_dist_chart = (
    alt.Chart(df)
    .mark_bar(opacity=0.7)
    .encode(
        x=alt.X('Age:Q', bin=alt.Bin(maxbins=30), title='Age (years)'),
        y=alt.Y('count()', title='Number of Participants'),
        color=alt.Color(
            'dementia_status:N',
            title='Cognitive Status',
            # The domain and range lists are paired by position:
            # 'No Dementia' -> orange, 'Dementia' -> blue.
            scale=alt.Scale(
                domain=['No Dementia', 'Dementia'],
                range=['#FF8C00', '#1E90FF'],
            ),
        ),
        tooltip=['dementia_status', 'count()'],
    )
    .properties(
        title='Age Distribution by Cognitive Status (OASIS Cross-Sectional)',
        width=500,
        height=300,
    )
    .interactive()
)

# A bare last expression renders the chart as the cell output.
age_dist_chart


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


This chart shows how age is spread out for people with and without dementia in the OASIS dataset. To make it, the ages were grouped into bins, and we counted how many people fell into each age range. We also created a simple label using the CDR score so we could split the sample into “No Dementia” (orange) and “Dementia” (blue).

The plot shows that people without dementia tend to be younger, while people with dementia are mostly older. This matters because age is one of the biggest risk factors for dementia. Seeing both groups on the same age axis helps show that the dataset follows a real-world pattern where dementia becomes more common as people get older.

In [9]:

# loading data (fresh read, so this cell does not rely on earlier cells)
df = pd.read_csv("oasis_cross-sectional.csv")

# Drop rows with no brain volume or no CDR rating *before* labelling.
# A missing CDR is NaN and `NaN == 0` is False, so labelling first would
# silently put every unrated participant in "Demented / Impaired"; a later
# dropna on "Status" could never remove them because Status is never null.
df = df.dropna(subset=["nWBV", "CDR"])

# cognitive status from CDR: exactly 0 -> "Nondemented", anything else -> impaired
df["Status"] = df["CDR"].apply(
    lambda x: "Nondemented" if x == 0 else "Demented / Impaired"
)

colors = ["#1E90FF", "#FF8C00"]   # blue, orange

# Pin the category order so each colour is tied to a label explicitly
# (blue = demented, orange = nondemented) instead of relying on the
# default ordering of the nominal field.
status_color = alt.Color(
    "Status:N",
    scale=alt.Scale(domain=["Demented / Impaired", "Nondemented"], range=colors),
    legend=None,
)

# plotting boxplot: one horizontal box per cognitive status
box = (
    alt.Chart(df)
    .mark_boxplot(size=40)
    .encode(
        y=alt.Y("Status:N", title="Cognitive Status"),
        x=alt.X("nWBV:Q", title="Normalized Whole Brain Volume"),
        color=status_color,
    )
)

# faint individual observations drawn over the boxes
points = (
    alt.Chart(df)
    .mark_circle(size=35, opacity=0.25)
    .encode(
        y="Status:N",
        x="nWBV:Q",
        color=status_color,
    )
)

# filled diamond at each group's mean nWBV
means = (
    alt.Chart(df)
    .mark_point(shape="diamond", size=80, filled=True)
    .encode(
        y="Status:N",
        x="mean(nWBV):Q",
        color=status_color,
    )
)

# layer the three marks into a single chart
chart = (box + points + means).properties(
    width=500,
    height=250,
    title="Brain Volume (nWBV) by Cognitive Status (OASIS Cross-Sectional)"
)

chart


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


This chart compares brain volume between people with dementia and people without dementia. To make it, we used normalized whole brain volume (nWBV) and grouped participants based on whether their CDR score showed dementia or not. We plotted one boxplot per group (blue for dementia, orange for no dementia), added light dots to show individual data points, and marked each group's mean with a diamond.

The plot shows that people with dementia have lower brain volume on average, and their values tend to cluster lower than the non-dementia group. This is important because it gives a clear visual connection between dementia and brain atrophy. It also supports what we know from neuroscience: as dementia develops, brain tissue shrinks, and this shows up as lower nWBV.

In [8]:
df['Status'] = df['Status'].astype(str)

# The chart below encodes `age_group`, but no cell shown here creates that
# column, so on a fresh kernel it would be missing. Derive it from Age when
# it is absent; an existing column (e.g. from another cell) is left as-is.
# NOTE(review): the five band edges below are an assumption - confirm them
# against the binning that was originally used.
if 'age_group' not in df.columns:
    df['age_group'] = pd.cut(
        df['Age'],
        bins=[0, 40, 60, 70, 80, float('inf')],
        labels=['0-39', '40-59', '60-69', '70-79', '80+'],
        right=False,  # left-closed bands: an age of exactly 40 falls in '40-59'
    ).astype(object)  # plain labels instead of a Categorical column

# Brain volume per age band, with one box per cognitive status in each band.
boxplot = (
    alt.Chart(df)
    .mark_boxplot()
    .encode(
        x=alt.X('age_group:N', title='Age Group'),
        y=alt.Y('nWBV:Q', title='Normalized Brain Volume'),
        color=alt.Color(
            'Status:N',
            title='Cognitive Status'
        ),
        tooltip=[
            alt.Tooltip('Age:Q', title='Age'),
            alt.Tooltip('nWBV:Q', title='Brain Volume'),
            alt.Tooltip('Status:N', title='Cognitive Status')
        ]
    )
    .properties(
        title='Brain Volume Across Age Groups by Cognitive Status',
        width=450,
        height=300
    )
)

boxplot


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


This chart shows how brain volume varies with age and cognitive status. Participants are split into five age ranges, and each box shows the spread of brain volume (nWBV) for the people in that range. Brain volume is lower in the older age groups, which matches what we expect with normal aging. The chart also shows that people with dementia usually have lower brain volume than people without dementia in the same age group. This makes it easy to see how age and dementia are jointly associated with differences in brain structure.

### Description for Team Bio - Wali Qureshi