In [3]:
"""
Q2: Load/create a dataset, run .describe(), and explain the outputs.
This script is self-contained: it creates a small CSV, loads it, and describes it.
"""

# Import pandas for data handling and NumPy for numeric utilities
import pandas as pd  # pd is the conventional alias for pandas
import numpy as np   # np is the conventional alias for NumPy

In [4]:
# ---------------------------
# 1) Build a small in-memory sample dataset (no external files required)
# ---------------------------
# Row-oriented construction: every tuple below is one record, and `columns`
# assigns the column names in positional order.
df = pd.DataFrame(
    [
        # (age, score, dept)
        (22, 70, "A"),
        (25, 72, "B"),
        (25, 68, "B"),
        (29, 65, "A"),
        (30, 90, "C"),
        (31, 85, "C"),
        (22, 88, "A"),
        (27, 92, "B"),
        (29, 60, "C"),
        (26, 75, "A"),
    ],
    # 'age' and 'score' are numeric columns; 'dept' is a categorical (string) column
    columns=["age", "score", "dept"],
)

In [6]:
# ---------------------------
# 2) (Optional) Round-trip through a CSV file to demonstrate loading
# ---------------------------
# Destination file for the sample data (relative to the working directory)
csv_path = "sample_students.csv"

# Write the frame out; index=False keeps the row index out of the file
df.to_csv(path_or_buf=csv_path, index=False)

# Read the same file back into a new DataFrame. With default settings the
# separator is ',' and the first row is used as the header.
df_loaded = pd.read_csv(filepath_or_buffer=csv_path)

In [7]:
# ---------------------------
# 3) Summarize the loaded data with .describe()
# ---------------------------
# include=None is the default: only numeric columns are summarized, giving
# count, mean, std, min, 25%, 50% (median), 75%, max.
summary_numeric = df_loaded.describe(include=None)

# include='all' asks for a summary of every column, categorical ones included.
summary_all = df_loaded.describe(include="all")

# Show both summaries, numeric-only first.
print("Numeric describe():\n", summary_numeric, "\n")
print("Describe(include='all'):\n", summary_all, "\n")

Numeric describe():
              age      score
count  10.000000  10.000000
mean   26.600000  76.500000
std     3.169297  11.394443
min    22.000000  60.000000
25%    25.000000  68.500000
50%    26.500000  73.500000
75%    29.000000  87.250000
max    31.000000  92.000000 

Describe(include='all'):
               age      score dept
count   10.000000  10.000000   10
unique        NaN        NaN    3
top           NaN        NaN    A
freq          NaN        NaN    4
mean    26.600000  76.500000  NaN
std      3.169297  11.394443  NaN
min     22.000000  60.000000  NaN
25%     25.000000  68.500000  NaN
50%     26.500000  73.500000  NaN
75%     29.000000  87.250000  NaN
max     31.000000  92.000000  NaN 



In [8]:
# ---------------------------
# 4) Explain the fields returned by .describe() for numeric columns
# ---------------------------
# Maps each row label of the numeric describe() output to a one-line meaning.
# Insertion order matches the order in which describe() lists the rows.
explanations = {
    "count": "Number of non-missing values in the column.",
    "mean":  "Arithmetic average (sum / count).",
    # describe() reports the *sample* standard deviation (ddof=1), i.e. the
    # square root of the sample variance. It is not the average absolute
    # distance from the mean (that would be the mean absolute deviation).
    "std":   "Sample standard deviation (square root of the variance, ddof=1) — typical spread around the mean.",
    "min":   "Smallest value.",
    "25%":   "First quartile (Q1) — 25% of values fall below this.",
    "50%":   "Median (Q2) — 50% of values fall below this.",
    "75%":   "Third quartile (Q3) — 75% of values fall below this.",
    "max":   "Largest value.",
}
print("Field meanings for numeric describe():")
# One bullet per statistic, in the same order as the describe() rows
for k, v in explanations.items():
    print(f" - {k}: {v}")

Field meanings for numeric describe():
 - count: Number of non-missing values in the column.
 - mean: Arithmetic average (sum / count).
 - std: Standard deviation (average distance from the mean).
 - min: Smallest value.
 - 25%: First quartile (Q1) — 25% of values fall below this.
 - 50%: Median (Q2) — 50% of values fall below this.
 - 75%: Third quartile (Q3) — 75% of values fall below this.
 - max: Largest value.
