# 01 Intro to EDA

In [None]:
import duckdb, pandas as pd, matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Read the tips CSV over HTTP with DuckDB's read_csv_auto and convert the
# result to a pandas DataFrame via .df().
df = duckdb.query(
    "SELECT * FROM read_csv_auto("
    "'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv'"
    ")"
).df()

# Preview the first rows as the cell output.
df.head(5)

In [None]:
# Quick structural overview of the frame.
# A notebook cell only echoes its LAST expression, so any intermediate
# result has to be shown explicitly or it is silently dropped.
df.info()               # prints its own report (columns, dtypes, non-null counts)
display(df.describe())  # summary statistics; needs display() because it is mid-cell
df.isna().sum()         # missing-value count per column, echoed as the cell result

In [None]:
# Boolean-mask filter: keep only the rows where total_bill is greater than 20.
df.loc[df.total_bill > 20]

In [None]:
# Derived column: tip divided by total bill (a fraction, not multiplied by 100).
df['tip_pct'] = df['tip'] / df['total_bill']

In [None]:
# Show the first five rows of the current frame.
df.head(5)

In [None]:
#df.rename(columns={'tip_pct': 'new_name'}, inplace=True)
#df.head()

In [None]:
# Mean of tip_pct within each distinct value of the `day` column.
(
    df
    .groupby('day')['tip_pct']
    .mean()
)

In [None]:
# Histogram of the tip_pct column. Series.hist returns the matplotlib Axes,
# which is captured so the figure can be labelled and stands on its own
# when the notebook is skimmed.
ax = df['tip_pct'].hist()
ax.set_title('Distribution of tip_pct')
ax.set_xlabel('tip_pct')
ax.set_ylabel('Number of rows')
plt.show()

In [None]:
# Interactive scatter of tip against total_bill, one colour per `day` value.
px.scatter(
    data_frame=df,
    x='total_bill',
    y='tip',
    color='day',
)

In [None]:
#best practices example
# End-to-end walkthrough in a single cell: load the CSV, normalise and rename
# the column names, derive tip_pct, preview the result, and write it to disk.
# NOTE: this cell rebinds `df`, so the frame loses its original column names
# (total_bill, tip, ...) from this point on.

# --- 1. Import libraries ---
import duckdb
import pandas as pd
import os

# --- 2. Load dataset directly from seaborn's GitHub mirror ---
df = duckdb.query("""
    SELECT * FROM read_csv_auto('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv')
""").df()

print("Original columns:", list(df.columns))

# --- 3. Clean column names ---
# Strip whitespace, lowercase, replace spaces with underscores.
# regex=False pins a literal replacement so the result does not depend on the
# pandas version's default for `regex`.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)

# --- 4. Rename specific columns for clarity ---
# Reassign instead of inplace=True: rename() returns a new frame.
df = df.rename(columns={
    'total_bill': 'bill_total_usd',
    'tip': 'tip_usd',
    'sex': 'gender',
    'smoker': 'is_smoker',
    'size': 'party_size'
})

print("Renamed columns:", list(df.columns))

# --- 5. Add a computed column ---
# Tip divided by the bill total (a fraction, not multiplied by 100).
df['tip_pct'] = df['tip_usd'] / df['bill_total_usd']

# --- 6. Verify ---
display(df.head())

# --- 7. Save cleaned version to your repo ---
# The directory is defined once so makedirs() and to_csv() cannot drift apart.
# The path is relative to the working directory; presumably the notebook runs
# one level below the repo root -- confirm for your layout.
output_dir = '../data/processed'
output_path = os.path.join(output_dir, 'tips_cleaned.csv')
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists
df.to_csv(output_path, index=False)     # index=False: do not write the row index

print("✅ Cleaned dataset saved to: data/processed/tips_cleaned.csv")


## Markdown examples

```python
print("Hello world")
```



**Bulleted list** (use dash or asterisk before each line) →  

* First
* Second
* Third

**Numbered list** 
1. Step One
2. Do this step
1. Step Two

---
Horizontal line →  `---`  (see above in markdown)

Link →  `[Link text](https://example.com)`  
e.g., `[Google](https://www.google.com)` renders as: [Google](https://www.google.com)

Image →  `![Alt text](https://example.com/image.png)`  

Blockquote →  `> Quoted or referenced text`  

---

✅ **Tip:**  
In Jupyter, press **Esc → M** to switch a cell to Markdown,  
then **Shift + Enter** to render it.  

---

**Bold text**

_Italics (emphasis)_

---

> Quoted text

## Exploratory Analysis: Summary Statistics
This section summarizes the key features of our dataset using descriptive statistics.
