In [1]:
#| code-summary: Load Packages
#| code-fold: true

# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
import plotly.express as px

# statistics
import statsmodels.api as sm

In [2]:
#| echo: false
from IPython.display import display, HTML
# Emit a <style> tag that sets `flex-direction: row` on `.output` elements
# (presumably so several outputs of one cell sit side by side; confirm in the rendered page).
HTML('<style>.output {flex-direction: row;}</style>')

In [3]:
#| code-summary: Options
#| code-fold: true

# pandas display: show floats with two decimal places.
# Undo the formatter with pd.reset_option("display.float_format").
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", "{:.2f}".format)

# NumPy printing: two decimals, and no scientific notation for small values.
np.set_printoptions(suppress=True, precision=2)

## Useful methods

`.head()`, `.tail()`, `.sample()`  
`.info()`, `.describe()`,  
`.value_counts()`,  
`.sort_values()`, `.nlargest()`

### Loading a Dataset: Tips
한 웨이터가 일정 기간 동안 받은 팁을 기록한 데이터

In [4]:
# Load the "tips" example dataset through seaborn and display the resulting frame.
tips = sns.load_dataset("tips")
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [5]:
# View the whole frame as a NumPy array. The columns have mixed dtypes
# (float64, category, int64), so the result is a single object-dtype array.
# `.to_numpy()` is the accessor the pandas docs recommend over `.values`.
tips.to_numpy()

array([[16.99, 1.01, 'Female', ..., 'Sun', 'Dinner', 2],
       [10.34, 1.66, 'Male', ..., 'Sun', 'Dinner', 3],
       [21.01, 3.5, 'Male', ..., 'Sun', 'Dinner', 3],
       ...,
       [22.67, 2.0, 'Male', ..., 'Sat', 'Dinner', 2],
       [17.82, 1.75, 'Male', ..., 'Sat', 'Dinner', 2],
       [18.78, 3.0, 'Female', ..., 'Thur', 'Dinner', 2]], dtype=object)

In [6]:
tips.head() # show the first rows (5 when no count is given)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
# Concise summary: index range, per-column non-null counts and dtypes, memory usage.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [8]:
#| layout-ncol: 2
tips.describe() # summary statistics for the numeric columns only

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.79,3.0,2.57
std,8.9,1.38,0.95
min,3.07,1.0,1.0
25%,13.35,2.0,2.0
50%,17.8,2.9,2.0
75%,24.13,3.56,3.0
max,50.81,10.0,6.0


In [9]:
tips.describe(include="all") # summarize every column, regardless of dtype

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
count,244.0,244.0,244,244,244,244,244.0
unique,,,2,2,4,2,
top,,,Male,No,Sat,Dinner,
freq,,,157,151,87,176,
mean,19.79,3.0,,,,,2.57
std,8.9,1.38,,,,,0.95
min,3.07,1.0,,,,,1.0
25%,13.35,2.0,,,,,2.0
50%,17.8,2.9,,,,,2.0
75%,24.13,3.56,,,,,3.0


In [10]:
#| layout-ncol: 2
# Summary (count / unique / top / freq) restricted to the category-dtype columns.
tips.describe(include="category")

Unnamed: 0,sex,smoker,day,time
count,244,244,244,244
unique,2,2,4,2
top,Male,No,Sat,Dinner
freq,157,151,87,176


In [11]:
# Per-category frequencies of the "day" column: raw counts, then proportions.
day_col = tips["day"]
s1 = day_col.value_counts()
s2 = day_col.value_counts(normalize=True)

# Counts for every unique ("sex", "smoker") combination.
s3 = tips.value_counts(subset=["sex", "smoker"])

In [12]:
#| layout-ncol: 2
# Show the three frequency tables; display() renders each argument in turn.
display(s1, s2, s3)

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

Sat    0.36
Sun    0.31
Thur   0.25
Fri    0.08
Name: day, dtype: float64

sex     smoker
Male    No        97
        Yes       60
Female  No        54
        Yes       33
dtype: int64

___

### Loading a Dataset: Penguins

In [17]:
# Load the "penguins" example dataset through seaborn and display the resulting frame.
penguins = sns.load_dataset("penguins")
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.10,18.70,181.00,3750.00,Male
1,Adelie,Torgersen,39.50,17.40,186.00,3800.00,Female
2,Adelie,Torgersen,40.30,18.00,195.00,3250.00,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.70,19.30,193.00,3450.00,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.80,14.30,215.00,4850.00,Female
341,Gentoo,Biscoe,50.40,15.70,222.00,5750.00,Male
342,Gentoo,Biscoe,45.20,14.80,212.00,5200.00,Female


In [None]:
# Column dtypes and non-null counts; counts below the number of entries reveal missing values.
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [None]:
#| layout-ncol: 2
# Summary (count / unique / top / freq) restricted to the object-dtype (string) columns.
penguins.describe(include="object")

Unnamed: 0,species,island,sex
count,344,344,333
unique,3,3,2
top,Adelie,Biscoe,Male
freq,152,168,168


In [None]:
# Number of rows for each (island, species) combination, most frequent first.
penguins.value_counts(subset=["island", "species"])

island     species  
Biscoe     Gentoo       124
Dream      Chinstrap     68
           Adelie        56
Torgersen  Adelie        52
Biscoe     Adelie        44
dtype: int64

In [19]:
# value_counts() leaves out rows containing NaN by default;
# dropna=False keeps them, so the groups with a missing "sex" are counted too.
penguins.value_counts(subset=["sex", "species"], dropna=False)

sex     species  
Female  Adelie       73
Male    Adelie       73
        Gentoo       61
Female  Gentoo       58
        Chinstrap    34
Male    Chinstrap    34
NaN     Adelie        6
        Gentoo        5
dtype: int64

In [None]:
# Pull two numeric columns out as Series, then place them side by side as the
# columns of a new DataFrame.
# NOTE: this rebinds s1/s2, which held value_counts() results in an earlier cell.
s1 = tips["total_bill"]
s2 = tips["tip"]

# concat along axis=1 keeps each Series as its own column with its own dtype.
# Building from a list of Series and transposing goes through a 2 x N row-wise
# frame first and yields object columns whenever the Series dtypes differ.
pd.concat([s1, s2], axis=1).head()

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.5
3,23.68,3.31
4,24.59,3.61
