In [62]:
import pandas as pd

## pydataset에서 iris 데이터를 불러와 iris.csv로 저장하기

In [None]:
# Import the loader under an alias so the name is not later rebound to a DataFrame.
from pydataset import data as load_dataset

# Fetch the iris dataset and keep only the three columns used in this notebook.
# Columns are selected by label (in the desired output order) instead of by
# position, so the selection does not depend on the dataset's column layout.
iris_raw = load_dataset('iris')
iris_export = iris_raw[['Sepal.Width', 'Species', 'Sepal.Length']]

# Switch to snake_case headers, matching the order of the selection above.
iris_export.columns = ['sepal_width', 'species', 'sepal_length']

# index=False: do not write the row index as an extra column in the CSV.
iris_export.to_csv('iris.csv', index=False)

## 데이터 불러오기

In [94]:
# Read iris.csv from the working directory into a DataFrame and display it.
iris = pd.read_csv(filepath_or_buffer='iris.csv')
iris

Unnamed: 0,sepal_width,species,sepal_length
0,3.5,setosa,5.1
1,3.0,setosa,4.9
2,3.2,setosa,4.7
3,3.1,setosa,4.6
4,3.6,setosa,5.0
...,...,...,...
145,3.0,virginica,6.7
146,2.5,virginica,6.3
147,3.0,virginica,6.5
148,3.4,virginica,6.2


## 컬럼 순서 변경

In [72]:
# Show the current column labels of `iris`.
iris.columns

Index(['sepal_width', 'species', 'sepal_length'], dtype='object')

In [73]:
# Get the column labels as a NumPy array (stored in `col`) and display it.
col = iris.columns.to_numpy()
col

array(['sepal_width', 'species', 'sepal_length'], dtype=object)

In [74]:
# Rearrange the labels by position: old index 1 first, then 0, then 2.
# NOTE: this rebinds `col` to the reordered array, so re-running the cell
# applies the permutation again.
col = col.take([1, 0, 2])
col

array(['species', 'sepal_width', 'sepal_length'], dtype=object)

In [75]:
# Select all rows with the columns in the order given by `col`, then display.
iris = iris.loc[:, col]
iris

Unnamed: 0,species,sepal_width,sepal_length
0,setosa,3.5,5.1
1,setosa,3.0,4.9
2,setosa,3.2,4.7
3,setosa,3.1,4.6
4,setosa,3.6,5.0
...,...,...,...
145,virginica,3.0,6.7
146,virginica,2.5,6.3
147,virginica,3.0,6.5
148,virginica,3.4,6.2


### 꽃 종류 확인

In [76]:
# List the distinct values found in the 'species' column.
iris['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

## 열 삽입

In [77]:
# Preview the first three rows.
iris.head(n=3)

Unnamed: 0,species,sepal_width,sepal_length
0,setosa,3.5,5.1
1,setosa,3.0,4.9
2,setosa,3.2,4.7


In [78]:
# Add a 'Classification' column at position 0 with every row set to 'normal'.
# DataFrame.insert modifies the frame in place and raises ValueError when the
# column already exists, so the guard keeps this cell safe to run more than once.
if 'Classification' not in iris.columns:
    iris.insert(loc=0, column='Classification', value='normal')
iris.head(3)

Unnamed: 0,Classification,species,sepal_width,sepal_length
0,normal,setosa,3.5,5.1
1,normal,setosa,3.0,4.9
2,normal,setosa,3.2,4.7


## 조건에 따라 검색 및 값 넣기

### 데이터 프레임 복사: 조건에 따라 값을 넣을 때는 복사해서 넣는 것이 선호됩니다.

In [83]:
# Take a deep copy so that later assignments to `df` do not modify `iris`.
df = iris.copy(deep=True)
df.head(n=3)

Unnamed: 0,Classification,species,sepal_width,sepal_length
0,normal,setosa,3.5,5.1
1,normal,setosa,3.0,4.9
2,normal,setosa,3.2,4.7


### 조건 설정

In [87]:
# Average each measurement column on its own. Calling df.mean() on the whole
# frame would also try to average the text columns (Classification, species),
# which raises TypeError on pandas >= 2.0 and needlessly recomputes every
# column's mean for each lookup.
mean_sepal_length = df["sepal_length"].mean()
mean_sepal_width = df["sepal_width"].mean()

print("sepal_length의 평균:", mean_sepal_length)
print("sepal_width의 평균:", mean_sepal_width)

sepal_length의 평균: 5.843333333333335
sepal_width의 평균: 3.057333333333334


In [89]:
# Boolean masks: True where a row's measurement is above that column's mean.
# The mean is taken per column; df.mean() on the whole frame would also try to
# average the text columns (Classification, species) and fails on pandas >= 2.0.
length_condition = df["sepal_length"] > df["sepal_length"].mean()
width_condition = df["sepal_width"] > df["sepal_width"].mean()

# Combine element-wise: `&` is logical AND, `|` is logical OR for boolean Series.
condition = length_condition & width_condition
print(condition.head())

0    False
1    False
2    False
3    False
4    False
dtype: bool


In [85]:
# Show the first five rows for which `condition` is True.
df.loc[condition].head(n=5)

Unnamed: 0,Classification,species,sepal_width,sepal_length
50,normal,versicolor,3.2,7.0
51,normal,versicolor,3.2,6.4
52,normal,versicolor,3.1,6.9
56,normal,versicolor,3.3,6.3
65,normal,versicolor,3.1,6.7


### 조건을 만족할 때 Classification 열의 값을 big으로 바꾸기

In [91]:
# Set 'Classification' to 'big' on every row where `condition` is True,
# using a single .loc[row_mask, column] assignment, then preview those rows.
df.loc[condition, 'Classification'] = 'big'
df.loc[condition].head(n=3)

Unnamed: 0,Classification,species,sepal_width,sepal_length
50,big,versicolor,3.2,7.0
51,big,versicolor,3.2,6.4
52,big,versicolor,3.1,6.9


## Groupby
groupby를 잘 설명한 사이트: https://rfriend.tistory.com/392

### agg 함수를 사용하면 다양한 집계함수를 한 번에 여러 열에 적용시킬 수 있습니다.

In [92]:
# Keep only the rows labelled 'big', then summarise them per species:
# a row count, plus mean / std / max / min for both sepal measurements.
big_rows = df.loc[df['Classification'] == 'big']
summary_stats = ['mean', 'std', 'max', 'min']

big_rows.groupby('species').agg({
    'species': 'count',
    'sepal_width': summary_stats,
    'sepal_length': summary_stats,
})

Unnamed: 0_level_0,species,sepal_width,sepal_width,sepal_width,sepal_width,sepal_length,sepal_length,sepal_length,sepal_length
Unnamed: 0_level_1,count,mean,std,max,min,mean,std,max,min
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
versicolor,8,3.2,0.106904,3.4,3.1,6.4875,0.405101,7.0,5.9
virginica,17,3.311765,0.22606,3.8,3.1,6.805882,0.478893,7.9,6.2
