# Exercises

In [None]:
import pandas as pd

In [None]:
# Load the births data, keep a random sample of 1,000,000 rows, and renumber
# the index from 0. No random_state is passed, so each run draws different rows.
df = pd.read_csv("births_2016.csv").sample(1_000_000).reset_index(drop=True)

Calculate the average birth weight for male and female babies. Fill in the blanks:

In [None]:
df.groupby('_____')['_____'].mean()

Find the minimum and maximum mother's age in the dataset.

In [None]:
print(f"Youngest mother: {df['____'].___()} years")
print(f"Oldest mother: {df['____'].___()} years")

Create a boxplot comparing birth weights between smokers and non-smokers. Fill in the blanks:

In [None]:
import seaborn as sns
sns._____(data=df, x='smoker', y='_____')

Count how many mothers are in each education level category.

In [None]:
df['mother_edu'].value_counts().sort_index()

Find the average BMI for married vs unmarried mothers. Fill in the blanks:

In [None]:
df.groupby('_____')['mother_BMI']._____()

Create a scatter plot showing the relationship between mother's height and pre-pregnancy weight.

In [None]:
sns.scatterplot(data=df, x='mother_height', y='mother_pre_weight')

Calculate the percentage of smokers in the dataset. Fill in the blanks:

In [None]:
(df['smoker'] == '_____').mean() * 100

Find the average gestational weeks for each race category.

In [None]:
df.groupby('mother_race6')['gest_weeks'].mean()

Create a histogram of mother's ages. Fill in the blanks:

In [None]:
# `plt` is used by this cell and by later plotting cells, so import it here.
import matplotlib.pyplot as plt

# Histogram of mothers' ages: fill in the plotting function and the bin count.
plt._____(df['mother_age'], bins=_____)
plt.xlabel('Mother Age')
plt.ylabel('Frequency')

Calculate the correlation between mother's BMI and baby's birth weight. Fill in the blanks:

In [None]:
df['_____'].corr(df['_____'])

Find the most common birth order number.

In [None]:
df['birth_order_num'].mode()

Create a bar plot showing average birth weight by education level. Fill in the blanks:

In [None]:
# Aggregate birth weight per education level (fill in the aggregation method),
# then plot one bar per level.
avg_weight = df.groupby('mother_edu')['birth_weight_g']._____()
sns.barplot(x=avg_weight.index, y=avg_weight.values)

Calculate the average height for mothers with BMI > 30 vs BMI ≤ 30.

In [None]:
# Mean mother height on either side of the BMI = 30 cut-off.
# A row with a missing BMI satisfies neither comparison, so it is excluded
# from both means.
high_bmi = df.loc[df['mother_BMI'] > 30, 'mother_height'].mean()
normal_bmi = df.loc[df['mother_BMI'] <= 30, 'mother_height'].mean()
print(f"Average height for BMI > 30: {high_bmi:.2f}")
print(f"Average height for BMI ≤ 30: {normal_bmi:.2f}")

Create a cross-tabulation of marital status and smoking status. Fill in the blanks:

In [None]:
pd._____(df['marital_status'], df['_____'])

Find the median birth weight for each combination of infant sex and smoking status. Fill in the blanks:

In [None]:
df.groupby(['infant_sex', '_____'])['birth_weight_g']._____()

Find any outliers in mother's age and investigate their other characteristics

In [None]:
# Part A: Create an outlier detection function (fill in the blanks)
def find_outliers(column):
    """Return the entries of ``column`` that lie outside the 1.5 * IQR fences.

    Exercise scaffold: every ``_____`` must be replaced before this runs.

    Args:
        column: The values to screen. Presumably a pandas Series (it must
            support ``<``/``>`` comparison and boolean-mask indexing) --
            confirm against the caller.

    Returns:
        ``column`` filtered to the entries strictly below ``lower`` or
        strictly above ``upper``.
    """
    # Q1/Q3 come from the same blanked-out method called with 0.25 and 0.75.
    Q1 = column._____(0.25)
    Q3 = column._____(0.75)
    IQR = _____ - _____
    # Fences sit 1.5 * IQR beyond each of Q1 and Q3.
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return column[(column < lower) | (column > upper)]

# Part B: Apply to mother's age and analyze outlier cases
outliers = find_outliers(df['mother_age'])
df[df['mother_age'].isin(outliers)][['mother_age', 'birth_weight_g', 'mother_edu', 'marital_status']]

Create a comprehensive profile of first-time mothers

In [None]:
first_time = df[df['birth_order_num'] == 1]
profile = {
    'avg_age': first_time['mother_age'].mean(),
    'education_dist': first_time['mother_edu'].value_counts().sort_index(),
    'avg_bmi': first_time['mother_BMI'].mean(),
    'smoking_rate': (first_time['smoker'] == 'Y').mean() * 100
}
# Add two more characteristics of your choice to the profile

Investigate if there's a relationship between mother's height and baby's birth weight, controlling for smoking status

In [None]:
# Create subplot for smokers vs non-smokers
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# Complete the visualization code

Create a function that categorizes pregnancies as 'high risk' based on multiple factors

In [None]:
def assess_risk(row):
    """Label one birth record as 'High Risk' or 'Normal Risk'.

    Each risk factor present in ``row`` scores one point; two or more
    points yield 'High Risk'.

    Args:
        row: Mapping-like record with ``mother_age`` and ``mother_BMI`` keys.

    Returns:
        The string 'High Risk' or 'Normal Risk'.
    """
    checks = [
        row['mother_age'] > 35,
        row['mother_BMI'] > 30,
        # Add 3 more risk factors
    ]
    points = sum(1 for hit in checks if hit)
    if points >= 2:
        return 'High Risk'
    return 'Normal Risk'

df['risk_category'] = df.apply(assess_risk, axis=1)

Compare birth weights across different demographic combinations

In [None]:
# Create a pivot table of average birth weight (the values), broken down by:
# - mother's education (rows)
# - marital status and smoking status (columns)
pivot = pd.pivot_table(data=df, 
                      values='_____',
                      index='_____',
                      columns=['_____', '_____'],
                      aggfunc='mean')

Write a function to identify and report potential data issues

In [None]:
def data_quality_report(df):
    """Collect human-readable descriptions of suspicious values in ``df``.

    Args:
        df: DataFrame with a ``mother_age`` column.

    Returns:
        A list of issue messages, empty when no check fires.
    """
    issues = []
    # Check for impossible values
    # Series.any() reduces the mask in one vectorized pass and skips missing
    # entries, where the builtin any() would iterate element by element.
    if (df['mother_age'] < 12).any():
        issues.append("Found mothers younger than 12")
    # Add 4 more logical checks
    return issues

Analyze if taller mothers tend to have larger babies, but make it interesting!

In [None]:
# Create height categories
df['height_category'] = pd.qcut(df['mother_height'], q=4, labels=['Short', 'Below Average', 'Above Average', 'Tall'])
# Create a violin plot showing birth weight distribution by height category
sns._____(data=df, x='height_category', y='birth_weight_g')

Create a heatmap showing correlations between all numeric variables

In [None]:
# Select every numeric column. 'number' matches all integer and float widths,
# not only int64/float64, so no numeric variable is silently dropped.
numeric_cols = df.select_dtypes(include='number').columns
# Create and plot correlation matrix (fill in the matrix to draw)
corr_matrix = df[numeric_cols].corr()
sns.heatmap(_____, annot=True, cmap='coolwarm', center=0)

Create BMI categories and analyze birth outcomes

In [None]:
def categorize_bmi(bmi):
    """Map a BMI value to its weight category.

    Args:
        bmi: Body-mass index as a number, or a missing value
            (``None``, ``NaN`` or ``pd.NA``).

    Returns:
        'Underweight' (below 18.5), 'Normal' (below 25), 'Overweight'
        (below 30) or 'Obese' (30 and above); ``None`` when ``bmi`` is
        missing.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # BMI would fall through to the last branch and be labelled 'Obese'.
    if pd.isna(bmi):
        return None
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'

df['bmi_category'] = df['mother_BMI'].apply(_____)
# Analyze average birth weight and gestational weeks by BMI category

Create a visualization showing how education relates to multiple factors

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 15))
# Create 4 different plots showing relationship between education and:
# - mother's age
# - birth weight
# - BMI
# - smoking status

Investigate health outcomes across different racial groups

In [None]:
# Calculate and visualize:
# - Average birth weight
# - Smoking rates
# - Average maternal age
# - Education levels
# Create a dashboard-style visualization

Create a simple model to predict birth weight

In [None]:
from sklearn.linear_model import LinearRegression
# Select features (fill in the blanks)
features = ['mother_age', '_____', '_____', '_____']
X = df[features]
y = df['birth_weight_g']
# Create and evaluate model

Create a comprehensive report function

In [None]:
def generate_report(df, group_by_col):
    """Build a dictionary of summary metrics for one grouping column.

    Args:
        df: DataFrame to summarise.
        group_by_col: Name of the column whose groups are reported on.

    Returns:
        A dict whose ``'sample_size'`` entry holds the
        ``value_counts()`` of ``df[group_by_col]``.
    """
    report = {}
    report['sample_size'] = df[group_by_col].value_counts()
    # Add 5 more metrics that would be interesting to analyze
    return report

Create a function to handle missing values intelligently

In [None]:
def smart_imputer(df):
    """Fill missing values in ``df`` (exercise stub).

    As written, no imputation is performed: the function returns the very
    object it was given. The comments below describe the intended strategy.

    Args:
        df: The DataFrame to impute.

    Returns:
        ``df`` itself.
    """
    # For numeric columns: impute with median if skewed, mean if normal
    # For categorical: impute with mode
    # Add your logic here
    return df

Create a weighted risk scoring system

In [None]:
def calculate_health_score(row):
    """Score one record out of 100, subtracting points per risk factor.

    Args:
        row: Mapping-like record with a ``smoker`` key.

    Returns:
        100 minus the total deductions; a ``smoker`` value of 'Y' costs 20.
    """
    penalty = 0
    # Deduct points based on various risk factors
    if row['smoker'] == 'Y':
        penalty += 20
    # Add at least 5 more scoring rules
    return 100 - penalty

df['health_score'] = df.apply(calculate_health_score, axis=1)