<a href="https://colab.research.google.com/github/Reben80/Data110-32008--Sp25/blob/main/Week8_Visualizing_trends.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import seaborn as sns

In [None]:

# Read the blue jay measurements into the DataFrame used by the cells below.
# The path is absolute; edit it if the CSV is stored somewhere else.
blue_jays_csv = "/content/blue_jays.csv"
df = pd.read_csv(blue_jays_csv)




In [None]:
# Body mass (y axis) against head length (x axis) for every row of df.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    df['head_length_mm'],
    df['body_mass_g'],
    color='blue',
    alpha=0.7,  # semi-transparent so overlapping points stay visible
)

# Title and axis labels
ax.set_title("Scatter Plot of Body Mass vs Head Length", fontsize=14)
ax.set_xlabel("Head Length (mm)", fontsize=12)
ax.set_ylabel("Body Mass (g)", fontsize=12)

# Background grid for easier reading of values
ax.grid(True)

# Render
fig.tight_layout()
plt.show()

In [None]:

# Simple linear regression of body mass on head length using scikit-learn,
# drawn over the raw data and annotated with the fitted equation and R².

# Define X and y
X = df[['head_length_mm']]  # Predictor (kept as a one-column DataFrame for fit/predict)
y = df['body_mass_g']       # Response

# Fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Predict values for the regression line
y_pred = model.predict(X)

# Calculate regression line parameters
slope = model.coef_[0]       # single predictor, so the first coefficient is the slope
intercept = model.intercept_
r2 = r2_score(y, y_pred)

# Plotting
plt.figure(figsize=(8, 6))
plt.scatter(X, y, color='blue', alpha=0.7, label="Data")
plt.plot(X, y_pred, color='red', label="Regression Line")

# Add labels and title
plt.title("Body Mass vs Head Length with Regression Line", fontsize=14)
plt.xlabel("Head Length (mm)", fontsize=12)
plt.ylabel("Body Mass (g)", fontsize=12)
plt.grid(True)
plt.legend()

# Equation on plot: the fitted parameters were previously computed but never
# shown. Anchor the text at the top-left corner of the data range, as the
# per-sex regression cells do.
eq = f"y = {slope:.2f}x + {intercept:.2f}  |  $R^2$ = {r2:.3f}"
plt.text(X['head_length_mm'].min(), y.max(), eq, fontsize=12, verticalalignment='top')

plt.tight_layout()
plt.show()


In [None]:
# Body mass against head length, one scatter series per sex.
fig, ax = plt.subplots(figsize=(8, 6))

# (value in the 'sex' column, marker colour, legend entry) - males first, then females
sex_styles = [
    ('M', 'blue', 'Male'),
    ('F', 'orange', 'Female'),
]
for sex_code, point_color, legend_label in sex_styles:
    birds = df[df['sex'] == sex_code]
    ax.scatter(
        birds['head_length_mm'],
        birds['body_mass_g'],
        color=point_color,
        alpha=0.7,
        label=legend_label,
    )

# Title, axis labels, grid and legend
ax.set_title("Scatter Plot of Body Mass vs Head Length", fontsize=14)
ax.set_xlabel("Head Length (mm)", fontsize=12)
ax.set_ylabel("Body Mass (g)", fontsize=12)
ax.grid(True)
ax.legend()

# Render
fig.tight_layout()
plt.show()

In [None]:
# Male birds only: straight-line fit of body mass on head length,
# with the fitted equation and R² written on the figure.
male_df = df.loc[df['sex'] == 'M']
x = male_df['head_length_mm'].to_numpy()
y = male_df['body_mass_g'].to_numpy()

# Degree-1 least-squares fit; polyfit returns the slope before the intercept
slope, intercept = np.polyfit(x, y, deg=1)
y_pred = slope * x + intercept

# R² = 1 - (residual sum of squares / total sum of squares)
residual_ss = ((y - y_pred) ** 2).sum()
total_ss = ((y - y.mean()) ** 2).sum()
r2 = 1 - (residual_ss / total_ss)

# Data points plus fitted line
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x, y, color='blue', alpha=0.7, label='Male Data')
ax.plot(x, y_pred, color='black', label='Regression Line')

ax.set_title("Male Blue Jays: Body Mass vs Head Length", fontsize=14)
ax.set_xlabel("Head Length (mm)", fontsize=12)
ax.set_ylabel("Body Mass (g)", fontsize=12)
ax.grid(True)
ax.legend()

# Fit summary, anchored at the top-left corner of the data range
eq = f"y = {slope:.2f}x + {intercept:.2f}  |  $R^2$ = {r2:.3f}"
ax.text(min(x), max(y), eq, fontsize=12, verticalalignment='top')

fig.tight_layout()
plt.show()

In [None]:
# Female birds only: straight-line fit of body mass on head length,
# with the fitted equation and R² written on the figure.
female_df = df.loc[df['sex'] == 'F']
x = female_df['head_length_mm'].to_numpy()
y = female_df['body_mass_g'].to_numpy()

# Degree-1 least-squares fit; polyfit returns the slope before the intercept
slope, intercept = np.polyfit(x, y, deg=1)
y_pred = slope * x + intercept

# R² = 1 - (residual sum of squares / total sum of squares)
residual_ss = ((y - y_pred) ** 2).sum()
total_ss = ((y - y.mean()) ** 2).sum()
r2 = 1 - (residual_ss / total_ss)

# Data points plus fitted line
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x, y, color='orange', alpha=0.7, label='Female Data')
ax.plot(x, y_pred, color='black', label='Regression Line')

ax.set_title("Female Blue Jays: Body Mass vs Head Length", fontsize=14)
ax.set_xlabel("Head Length (mm)", fontsize=12)
ax.set_ylabel("Body Mass (g)", fontsize=12)
ax.grid(True)
ax.legend()

# Fit summary, anchored at the top-left corner of the data range
eq = f"y = {slope:.2f}x + {intercept:.2f}  |  $R^2$ = {r2:.3f}"
ax.text(min(x), max(y), eq, fontsize=12, verticalalignment='top')

fig.tight_layout()
plt.show()

In [None]:

# Male and female birds on one figure, each with its own straight-line fit.


def _linear_fit(x_vals, y_vals):
    """Fit y = slope * x + intercept by least squares.

    Returns (slope, intercept, fitted y values, R²), where
    R² = 1 - residual sum of squares / total sum of squares.
    """
    fit_slope, fit_intercept = np.polyfit(x_vals, y_vals, 1)
    fitted = fit_slope * x_vals + fit_intercept
    r_squared = 1 - (np.sum((y_vals - fitted)**2) / np.sum((y_vals - np.mean(y_vals))**2))
    return fit_slope, fit_intercept, fitted, r_squared


# Split by sex
male_df = df[df['sex'] == 'M']
female_df = df[df['sex'] == 'F']

# Male data and fit
x_m = male_df['head_length_mm'].values
y_m = male_df['body_mass_g'].values
slope_m, intercept_m, y_pred_m, r2_m = _linear_fit(x_m, y_m)

# Female data and fit
x_f = female_df['head_length_mm'].values
y_f = female_df['body_mass_g'].values
slope_f, intercept_f, y_pred_f, r2_f = _linear_fit(x_f, y_f)

# Points first, then the dashed fit lines; each line's legend entry carries its equation and R²
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x_m, y_m, color='blue', alpha=0.7, label='Male')
ax.scatter(x_f, y_f, color='orange', alpha=0.7, label='Female')

ax.plot(
    x_m, y_pred_m, color='blue', linestyle='--',
    label=f"Male Fit: y = {slope_m:.2f}x + {intercept_m:.2f} (R² = {r2_m:.3f})",
)
ax.plot(
    x_f, y_pred_f, color='orange', linestyle='--',
    label=f"Female Fit: y = {slope_f:.2f}x + {intercept_f:.2f} (R² = {r2_f:.3f})",
)

# Labels
ax.set_title("Body Mass vs Head Length (Male & Female)", fontsize=14)
ax.set_xlabel("Head Length (mm)", fontsize=12)
ax.set_ylabel("Body Mass (g)", fontsize=12)
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:

# Seaborn regression plot with the axes flipped relative to the earlier cells:
# body mass on x, head length on y.
fig, ax = plt.subplots(figsize=(8, 6))
point_style = {'alpha': 0.6}
sns.regplot(
    data=df,
    x='body_mass_g',
    y='head_length_mm',
    scatter_kws=point_style,
    ax=ax,
)

# Title and axis labels
ax.set_title("Head Length vs Body Mass", fontsize=14)
ax.set_xlabel("Body Mass (g)", fontsize=12)
ax.set_ylabel("Head Length (mm)", fontsize=12)

fig.tight_layout()
plt.show()


Example: Fuel-tank capacity versus price in cars

In [None]:

# Load the cars dataset. Note this rebinds `df`, so the cells below work on
# the cars data rather than the blue jay data.
cars_csv = "/content/cars93.csv"  # absolute path; adjust if the file lives elsewhere
df = pd.read_csv(cars_csv)

# Create the figure, then switch the seaborn theme before any axes are drawn
plt.figure(figsize=(8, 6))
sns.set_theme(style="whitegrid")

# Scatter plus linear fit with a 95% confidence band
point_style = {"alpha": 0.6}
line_style = {"color": "red"}
ax = sns.regplot(
    data=df,
    x="Price",
    y="Fuel.tank.capacity",
    scatter_kws=point_style,
    line_kws=line_style,
    ci=95,
)

# Titles and labels
ax.set_title("Fuel Tank Capacity vs Price", fontsize=14)
ax.set_xlabel("Price (in thousands)", fontsize=12)
ax.set_ylabel("Fuel Tank Capacity (gallons)", fontsize=12)

plt.tight_layout()
plt.show()


In [None]:
# LOESS-smoothed trend of fuel tank capacity against price, drawn with seaborn.
# An explicit 8x6 figure is created first so this plot has the same size as
# the other figures in the notebook instead of falling back to the default.
plt.figure(figsize=(8, 6))

sns.regplot(
    data=df,
    x="Price",
    y="Fuel.tank.capacity",
    lowess=True,           # <-- LOESS smoothing instead of a straight-line fit
    ci=None,               # <-- no confidence interval (se = FALSE in ggplot2 terms)
    scatter_kws={"alpha": 0.6},
    line_kws={"color": "red"}
)

# Labels and title
plt.title("Fuel Tank Capacity vs Price (LOESS)", fontsize=14)
plt.xlabel("Price", fontsize=12)
plt.ylabel("Fuel Tank Capacity", fontsize=12)

plt.tight_layout()
plt.show()

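With `lowess=True`, seaborn fits the smoother using statsmodels under the hood (`statsmodels.nonparametric.lowess`), but `regplot` does not expose the span (called `frac` in statsmodels). To control the span, call statsmodels' `lowess` directly, as in the next cell.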

In [None]:

import statsmodels.api as sm

# LOWESS with an explicit span, calling statsmodels directly and plotting
# the resulting curve over the raw points.

# Order the observations by price before smoothing
x = df["Price"]
y = df["Fuel.tank.capacity"]
price_order = np.argsort(x)
x_sorted = x.iloc[price_order]
y_sorted = y.iloc[price_order]

# Smooth with span (frac) = 0.25; note the response is passed first, then the predictor
smoothed = sm.nonparametric.lowess(y_sorted, x_sorted, frac=0.25)
curve_x = smoothed[:, 0]  # first column is used as the x coordinate of the curve
curve_y = smoothed[:, 1]  # second column is used as the smoothed y value

# Create the figure, then apply the seaborn theme before anything is drawn
plt.figure(figsize=(8, 6))
sns.set_theme(style="whitegrid")

# Raw data
plt.scatter(x, y, alpha=0.6, label="Data")

# Smoothed curve
plt.plot(curve_x, curve_y, color="red", label="LOESS (span=0.25)")

# Labels
plt.title("Fuel Tank Capacity vs Price (Custom LOESS span=0.25)", fontsize=14)
plt.xlabel("Price", fontsize=12)
plt.ylabel("Fuel Tank Capacity", fontsize=12)
plt.grid(True)
plt.legend()

plt.tight_layout()
plt.show()
