# 라이브러리 불러오기

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl

print(np.__version__)
print(pd.__version__)
print(sns.__version__)
print(mpl.__version__)

2.2.4
2.2.3
0.13.2
3.10.1


# 샘플 데이터 가져오기

In [10]:
iris = sns.load_dataset("iris")
iris.head(1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa


In [8]:
tips = sns.load_dataset("tips")
tips.head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


# 결측치 확인
- 데이터가 비어 있나?

In [10]:
iris.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [11]:
iris.shape

(150, 5)

In [12]:
tips.shape

(244, 7)

In [13]:
a = tips['day']
type(a)

pandas.core.series.Series

In [14]:
type(tips)

pandas.core.frame.DataFrame

In [15]:
tips['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

# 상위 5개만 보기
- 관련 메서드
  + DataFrame.nsmallest
  + DataFrame.sort_values
  + DataFrame.head

In [16]:
# 숫자열을 sort() 내림차순 정렬
# 상위 5개만 인덱싱
iris.nlargest(5, "sepal_length")

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
131,7.9,3.8,6.4,2.0,virginica
117,7.7,3.8,6.7,2.2,virginica
118,7.7,2.6,6.9,2.3,virginica
122,7.7,2.8,6.7,2.0,virginica
135,7.7,3.0,6.1,2.3,virginica


# 필터링
- NumPy와 문법 동일

In [17]:
# tips의 평균 구하기
# 평균보다 큰 데이터만 조회
mean_tip = tips['tip'].mean()
# numpy 문법 ==> a[a>12]
tips[tips['tip'] > mean_tip].head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3


In [18]:
tips[tips['tip'] > tips['tip'].mean()].head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3


- smoker가 No인 것만 조회

In [19]:
tips[tips['smoker'] == 'No']

# day Sat
tips[tips['day'] == 'Sat']

# time Dinner
tips[tips['time'] == 'Dinner']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [20]:
tips[tips['day'] == 'Sat'].shape

(87, 7)

In [24]:
# day가 Sat인 조회
# 인덱스 번호 0번째부터 재정렬
# head(1) 조회
tips[tips['day'] == 'Sat'].reset_index(drop=True).head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,20.65,3.35,Male,No,Sat,Dinner,3


# loc vs iloc
- 코드 비교

## loc

In [32]:
# tips[tips['day'] == 'Sat']
# 문법 : tips.loc[행,열]
tips.loc[0:1, ['total_bill', 'tip', 'day']]

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun
1,10.34,1.66,Sun


In [34]:
# iloc
tips.iloc[0:1, [0, 1, 4]]

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun


In [40]:
# total_bill 11이하인 것만 조회
# tips.loc[행 조건식만 만들자, :] 전체 데이터 조회
tips.loc[tips['total_bill'] <= 11, :].head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3


In [43]:
# time이 Dinner인 것만 조회, loc 적용
tips.loc[tips['time'] == 'Dinner', :].head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


In [55]:
# time이 Dinner이면서 total_bill이 11이하인 것만 조회
result = tips.loc[tips['time'] == 'Dinner', :]
result.loc[result['total_bill'] <= 11, :]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
6,8.77,2.0,Male,No,Sun,Dinner,2
10,10.27,1.71,Male,No,Sun,Dinner,2
16,10.33,1.67,Female,No,Sun,Dinner,3
30,9.55,1.45,Male,No,Sat,Dinner,2
43,9.68,1.32,Male,No,Sun,Dinner,2
51,10.29,2.6,Female,No,Sun,Dinner,2
53,9.94,1.56,Male,No,Sun,Dinner,2
67,3.07,1.0,Female,Yes,Sat,Dinner,1
75,10.51,1.25,Male,No,Sat,Dinner,2


In [54]:
# 조건식 2개
# 필터링 문법 : tips.loc(조건식1) & (조건식2), :]
tips.loc[(tips['time'] == 'Dinner') & (tips['total_bill'] <= 11), :]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
6,8.77,2.0,Male,No,Sun,Dinner,2
10,10.27,1.71,Male,No,Sun,Dinner,2
16,10.33,1.67,Female,No,Sun,Dinner,3
30,9.55,1.45,Male,No,Sat,Dinner,2
43,9.68,1.32,Male,No,Sun,Dinner,2
51,10.29,2.6,Female,No,Sun,Dinner,2
53,9.94,1.56,Male,No,Sun,Dinner,2
67,3.07,1.0,Female,Yes,Sat,Dinner,1
75,10.51,1.25,Male,No,Sat,Dinner,2


In [56]:
tips.loc[tips['day'] == 'Sat', ['total_bill', 'tip', 'day']].reset_index(drop=True)

Unnamed: 0,total_bill,tip,day
0,20.65,3.35,Sat
1,17.92,4.08,Sat
2,20.29,2.75,Sat
3,15.77,2.23,Sat
4,39.42,7.58,Sat
...,...,...,...
82,35.83,4.67,Sat
83,29.03,5.92,Sat
84,27.18,2.00,Sat
85,22.67,2.00,Sat


In [74]:
# iris
# 품종이 virginica 이면서 sepal_length >= 5인 값만 가져오고
# 컬럼은 sepal_length, petal_length, species만 가져오기
# loc기반으로
iris.loc[(iris['species'] == 'virginica') | (iris['sepal_length'] >= 5), ['sepal_length', 'petal_length', 'species']]
# 1. iris.loc[:, ['sepal_length', 'petal_length', 'species']] 먼저 열을 지정해주면 더 깔끔함
# 2. iris.loc[(조건식1) or (조건식), ['sepal_length', 'petal_length', 'species']]

Unnamed: 0,sepal_length,petal_length,species
0,5.1,1.4,setosa
4,5.0,1.4,setosa
5,5.4,1.7,setosa
7,5.0,1.5,setosa
10,5.4,1.5,setosa
...,...,...,...
145,6.7,5.2,virginica
146,6.3,5.0,virginica
147,6.5,5.2,virginica
148,6.2,5.4,virginica


# 파일 입출력
- csv
- excel

In [1]:
import seaborn as sns
import pandas as pd

iris = sns.load_dataset("iris")
result = iris.loc[:, ['sepal_length', 'species']]
result

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica


# CSV

In [10]:
# 파일 내보내기
result.to_csv("dataset/iris_result.csv", index=False)

In [13]:
# 파일 불러오기
iris_df = pd.read_csv("dataset/iris_result.csv")
iris_df

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica
