
# Module 7 Lab — Pandas & Data Science in Python (Starter Notebook)

This notebook is a **starter template** you can use to complete your *Pandas for Data Science in 20 Minutes* lab.

### How to use this file
1. Replace `dataset_path = "dataset.csv"` below with the **actual CSV filename** you downloaded from your course.
2. Run the notebook cell-by-cell (or **Run > Run All**).
3. Add/modify steps to match the YouTube tutorial exactly as required by your assignment.
4. When you're finished: **File → Download as → Notebook (.ipynb)** and upload the file to GitHub.  
   Then submit the GitHub URL to your LMS.

> If the CSV path is wrong or missing, this notebook falls back to **demo data** so you can still practice.


In [1]:

# If pandas isn't installed in your environment, run:
# !pip install pandas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Show pandas & Python versions for reproducibility
import sys
print(f"Python: {sys.version.split()[0]}  |  pandas: {pd.__version__}")


Python: 3.12.12  |  pandas: 2.2.2


In [2]:

# === Load your dataset ===
# Replace this with the actual filename you downloaded (e.g., 'data.csv' or 'Pokemon.csv', etc.)
dataset_path = "telco_churn.csv"


try:
    df = pd.read_csv(dataset_path)
    print(f"Loaded: {dataset_path}  -> shape={df.shape}")
except FileNotFoundError:
    print(f"Couldn't find '{dataset_path}'. Using demo dataset instead.")
    # --- DEMO DATA (so you can still follow along) ---
    rng = np.random.default_rng(7)
    n = 200
    df = pd.DataFrame({
        "id": np.arange(1, n+1),
        "category": rng.choice(["A", "B", "C", "D"], size=n, p=[0.25, 0.35, 0.25, 0.15]),
        "value": rng.normal(loc=50, scale=10, size=n).round(2),
        "flag": rng.choice([True, False], size=n),
        "score": rng.integers(0, 101, size=n)
    })
    # Introduce some missing values
    df.loc[rng.choice(df.index, size=10, replace=False), "value"] = np.nan
    df.loc[rng.choice(df.index, size=6, replace=False), "category"] = None
    print(f"Demo dataset created -> shape={df.shape}")

df.head()


Couldn't find 'telco_churn.csv'. Using demo dataset instead.
Demo dataset created -> shape=(200, 5)


Unnamed: 0,id,category,value,flag,score
0,1,,59.21,True,81
1,2,D,45.44,True,77
2,3,C,65.15,True,68
3,4,A,37.53,False,46
4,5,B,58.62,True,83


In [3]:

# Peek at the data
display(df.head(10))

# DataFrame structure
print("\nDataFrame info:\n")
print(df.info())

# Summary statistics (numeric columns)
display(df.describe(include='all'))


Unnamed: 0,id,category,value,flag,score
0,1,,59.21,True,81
1,2,D,45.44,True,77
2,3,C,65.15,True,68
3,4,A,37.53,False,46
4,5,B,58.62,True,83
5,6,D,54.94,False,37
6,7,A,58.74,True,66
7,8,C,68.79,False,55
8,9,C,64.84,True,30
9,10,B,38.55,True,20



DataFrame info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        200 non-null    int64  
 1   category  194 non-null    object 
 2   value     190 non-null    float64
 3   flag      200 non-null    bool   
 4   score     200 non-null    int64  
dtypes: bool(1), float64(1), int64(2), object(1)
memory usage: 6.6+ KB
None


Unnamed: 0,id,category,value,flag,score
count,200.0,194,190.0,200,200.0
unique,,4,,2,
top,,B,,True,
freq,,67,,103,
mean,100.5,,49.415263,,48.29
std,57.879185,,9.725419,,28.738396
min,1.0,,17.49,,1.0
25%,50.75,,43.1,,23.75
50%,100.5,,49.43,,46.0
75%,150.25,,55.5525,,75.25


In [4]:

# Column selection
cols_example = [c for c in df.columns[:3]]
print("Example selected columns:", cols_example)
display(df[cols_example].head())

# Row filtering (example conditions — edit to match your dataset)
numeric_cols = df.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 0:
    col0 = numeric_cols[0]
    sample_filter = df[col0] > df[col0].median()
    filtered = df[sample_filter]
    print(f"Filtered rows where {col0} > median: {len(filtered)} rows")
    display(filtered.head())
else:
    print("No numeric columns found to demonstrate row filtering.")

# To be more explicit, let's show a realistic filter using 'score' if present
if "score" in df.columns:
    high_scorers = df[df["score"] >= 90]
    print(f"High scorers (score >= 90): {len(high_scorers)} rows")
    display(high_scorers.head())
else:
    print("No 'score' column found; skipping the high-score filter example.")


Example selected columns: ['id', 'category', 'value']


Unnamed: 0,id,category,value
0,1,,59.21
1,2,D,45.44
2,3,C,65.15
3,4,A,37.53
4,5,B,58.62


Filtered rows where id > median: 100 rows


Unnamed: 0,id,category,value,flag,score
100,101,D,47.48,False,36
101,102,C,47.96,False,17
102,103,B,50.54,True,95
103,104,B,65.12,True,28
104,105,D,55.56,True,87


High scorers (score >= 90): 18 rows


Unnamed: 0,id,category,value,flag,score
47,48,B,47.09,True,97
58,59,D,40.59,True,98
73,74,C,49.93,False,93
86,87,B,48.22,False,91
102,103,B,50.54,True,95


In [5]:

# Missing values overview
missing_counts = df.isna().sum().sort_values(ascending=False)
print("Missing values per column:\n", missing_counts)

# Example: fill numeric NaNs with mean (edit as needed for your dataset)
num_cols = df.select_dtypes(include=[np.number]).columns
df_filled = df.copy()
df_filled[num_cols] = df_filled[num_cols].apply(lambda s: s.fillna(s.mean()))

# Example: fill categorical NaNs with a label
obj_cols = df.select_dtypes(include=['object']).columns
for c in obj_cols:
    df_filled[c] = df_filled[c].fillna("Unknown")

print("\nAfter filling missing values:")
print(df_filled.isna().sum())


Missing values per column:
 value       10
category     6
id           0
flag         0
score        0
dtype: int64

After filling missing values:
id          0
category    0
value       0
flag        0
score       0
dtype: int64


In [6]:

# Sort by a numeric column if present
if len(num_cols) > 0:
    num_col = num_cols[0]
    print(f"Sorting by numeric column: {num_col}")
    display(df_filled.sort_values(by=num_col, ascending=False).head())
else:
    print("No numeric columns to sort.")

# Value counts for a categorical column if present
cat_cols = df_filled.select_dtypes(include=['object', 'category', 'bool']).columns
if len(cat_cols) > 0:
    cat_col = cat_cols[0]
    print(f"\nValue counts for {cat_col}:")
    display(df_filled[cat_col].value_counts())
else:
    print("No categorical columns to count.")

# Unique values example
if len(cat_cols) > 0:
    print(f"Unique values in {cat_col}:", df_filled[cat_col].unique())


Sorting by numeric column: id


Unnamed: 0,id,category,value,flag,score
199,200,B,52.16,True,7
198,199,D,48.09,True,53
197,198,B,57.94,False,34
196,197,A,35.56,True,9
195,196,C,37.0,False,76



Value counts for category:


Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
B,67
A,50
C,44
D,33
Unknown,6


Unique values in category: ['Unknown' 'D' 'C' 'A' 'B']


In [7]:

# Example: create a z-score of 'value' if it exists
if "value" in df_filled.columns:
    df_filled["value_z"] = (df_filled["value"] - df_filled["value"].mean()) / df_filled["value"].std(ddof=0)
    display(df_filled[["value", "value_z"]].head())
else:
    print("No 'value' column found; skipping z-score example.")

# Example: categorizing score into bands if present
if "score" in df_filled.columns:
    bins = [-1, 59, 69, 79, 89, 100]
    labels = ["F", "D", "C", "B", "A"]
    df_filled["grade"] = pd.cut(df_filled["score"], bins=bins, labels=labels)
    display(df_filled[["score", "grade"]].head())


Unnamed: 0,value,value_z
0,59.21,1.036021
1,45.44,-0.420476
2,65.15,1.664314
3,37.53,-1.257143
4,58.62,0.973615


Unnamed: 0,score,grade
0,81,B
1,77,C
2,68,D
3,46,F
4,83,B


In [8]:

# Group by a category and aggregate numeric stats (edit the column names to match your dataset)
group_key = None
for c in ["category", "Type 1", "type", "class", "species"]:
    if c in df_filled.columns:
        group_key = c
        break

if group_key:
    grouped = df_filled.groupby(group_key).agg(['count', 'mean', 'median', 'std', 'min', 'max'])
    print(f"Grouped by '{group_key}':")
    display(grouped)
else:
    print("No obvious categorical column found for grouping. Edit 'group_key' to fit your dataset.")


TypeError: category dtype does not support aggregation 'mean'

In [None]:

# Try building a simple pivot table if we have both a categorical and numeric column
if group_key and len(num_cols) > 0:
    pivot = pd.pivot_table(df_filled, index=group_key, values=num_cols[0], aggfunc=['mean', 'count'])
    print("Pivot table (mean & count):")
    display(pivot)
else:
    print("Not enough columns to make a pivot table. Edit to match your dataset.")


In [None]:

# Histogram for a numeric column
if len(num_cols) > 0:
    plt.figure()
    df_filled[num_cols[0]].hist()
    plt.title(f"Histogram of {num_cols[0]}")
    plt.xlabel(num_cols[0])
    plt.ylabel("Frequency")
    plt.show()
else:
    print("No numeric columns for histogram.")

# Bar chart for category counts
if group_key:
    plt.figure()
    df_filled[group_key].value_counts().plot(kind="bar")
    plt.title(f"Counts by {group_key}")
    plt.xlabel(group_key)
    plt.ylabel("Count")
    plt.show()


In [None]:

# Save a cleaned version for your records
out_path = "/mnt/data/cleaned_dataset.csv"
df_filled.to_csv(out_path, index=False)
print(f"Cleaned CSV exported to: {out_path}")



## Next steps

- Replace the column names, filters, and examples with those from your tutorial dataset so your work matches the video.
- Add any **extra analysis** you perform (more plots, correlations, advanced filtering, etc.).
- When you're satisfied: **File → Download as → Notebook (.ipynb)** and upload to **GitHub**.

**GitHub quick steps:**
1. Create a new repo (or open an existing one) on github.com.
2. Click **Add file → Upload files**, drag your `.ipynb` here, and **Commit**.
3. Open the notebook file in GitHub; copy the URL and submit it to your LMS.
