# pandas 데이터 파악과 조작

**분석할 데이터를 수집(확보)하면 데이터의 특징을 파악하고 다루기 쉽게 변형하는 작업을 수행해야 한다**

# #2. 데이터 조작(가공)

- 데이터 개수 세기 : count(), value_counts()
- 데이터 정렬 : sort_values(), sort_index()
- 데이터 집계 : 합계(sum()), 평균(mean()), 최대(max()), 최소(min())
- 데이터 삭제 : drop(axis=0/1)
- 결측치 처리 : dropna(axis=0/1, subset, inplace)
- 데이터 변경 : 
    - 자료형 변경 : astype()
    - 수치형 데이터를 범주형 데이터로 변경 : 
        - 구간을 지정하여 범주화 : cut(data, bins, labels)
        - 동일한 개수를 갖도록 범주화 : qcut(data, q, labels)
- 행/열에 동일한 함수 적용 : apply()
- 열 재구성 : Series.str.split(), Series.str.get()
- 필터링 : isin()

---

In [1]:
import pandas as pd
import numpy as np

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity='all'

## 8. 열 재구성

### 열 순서 변경

- 데이터프레임의 열 순서 변경
- 형식 : df[재구성한 열이름리스트]

In [6]:
import seaborn as sns

In [7]:
# Load the 'titanic' example dataset through seaborn and preview the first three rows.
titanic = sns.load_dataset('titanic')
titanic.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [8]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [9]:
# Label-based slice: .loc includes both endpoints, so this keeps rows 0 through 4
# and the columns from 'survived' through 'age'.
df = titanic.loc[0:4, 'survived':'age']
df

Unnamed: 0,survived,pclass,sex,age
0,0,3,male,22.0
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
4,0,3,male,35.0


In [20]:
# Keep the full column Index, then copy its first eight labels into a plain list.
cols = titanic.columns
new_cols = list(cols[:8])

In [14]:
# Sort the column names alphabetically in place (list.sort() returns None).
new_cols.sort()

In [16]:
# Show the current contents of the column-name list.
new_cols

['age', 'embarked', 'fare', 'parch', 'pclass', 'sex', 'sibsp', 'survived']

In [17]:
# Selecting with a list of names returns those columns in the list's order.
df2 = titanic[new_cols]
df2

Unnamed: 0,age,embarked,fare,parch,pclass,sex,sibsp,survived
0,22.0,S,7.2500,0,3,male,1,0
1,38.0,C,71.2833,0,1,female,1,1
2,26.0,S,7.9250,0,3,female,0,1
3,35.0,S,53.1000,0,1,female,1,1
4,35.0,S,8.0500,0,3,male,0,0
...,...,...,...,...,...,...,...,...
886,27.0,S,13.0000,0,2,male,0,0
887,19.0,S,30.0000,0,1,female,0,1
888,,S,23.4500,2,3,female,1,0
889,26.0,C,30.0000,0,1,male,0,1


### 열 분리

In [21]:
# Netflix stock price data: load the CSV and summarize its columns.
df_nflix = pd.read_csv('data/NFLX.csv')
df_nflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4874 entries, 0 to 4873
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       4874 non-null   object 
 1   Open       4874 non-null   float64
 2   High       4874 non-null   float64
 3   Low        4874 non-null   float64
 4   Close      4874 non-null   float64
 5   Adj Close  4874 non-null   float64
 6   Volume     4874 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 266.7+ KB


#### Date열의 연-월-일을 ['year','month','day']로 분리

In [23]:
# Split each Date string on '-'; every element of the result is a list of string parts.
dates = df_nflix.Date.str.split('-')

#### 시리즈의 문자열 리스트 인덱싱 : Series.str.get(인덱스)

In [24]:
# .str.get(0) picks element 0 of each split list, i.e. the year part.
dates.str.get(0)

0       2002
1       2002
2       2002
3       2002
4       2002
        ... 
4869    2021
4870    2021
4871    2021
4872    2021
4873    2021
Name: Date, Length: 4874, dtype: object

#### ['year','month','day']로 분리된 시리즈문자열 요소를 데이터프레임 열로 추가

In [26]:
# Add one column per date component; the position in the split list
# (0, 1, 2) maps to 'year', 'month', 'day' respectively.
for position, part_name in enumerate(('year', 'month', 'day')):
    df_nflix[part_name] = dates.str.get(position)
df_nflix.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,year,month,day
0,2002-05-23,1.156429,1.242857,1.145714,1.196429,1.196429,104790000,2002,5,23
1,2002-05-24,1.214286,1.225,1.197143,1.21,1.21,11104800,2002,5,24
2,2002-05-28,1.213571,1.232143,1.157143,1.157143,1.157143,6609400,2002,5,28


In [34]:
# Re-check the schema after adding the year/month/day columns.
df_nflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4874 entries, 0 to 4873
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       4874 non-null   object 
 1   Open       4874 non-null   float64
 2   High       4874 non-null   float64
 3   Low        4874 non-null   float64
 4   Close      4874 non-null   float64
 5   Adj Close  4874 non-null   float64
 6   Volume     4874 non-null   int64  
 7   year       4874 non-null   object 
 8   month      4874 non-null   object 
 9   day        4874 non-null   object 
dtypes: float64(5), int64(1), object(4)
memory usage: 380.9+ KB


## 9. 필터링

### 불린 인덱싱(boolean indexing)

#### 나이가 10대(10~19세) 승객 추출

In [40]:
# Build the boolean mask once (ages 10 through 19 inclusive), show it,
# then use it to select the matching rows.
age_values = titanic['age']
is_teen = (age_values >= 10) & (age_values <= 19)
is_teen
titanic.loc[is_teen]

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887     True
888    False
889    False
890    False
Name: age, Length: 891, dtype: bool

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
14,0,3,female,14.0,0,0,7.8542,S,Third,child,False,,Southampton,no,True
22,1,3,female,15.0,0,0,8.0292,Q,Third,child,False,,Queenstown,yes,True
27,0,1,male,19.0,3,2,263.0000,S,First,man,True,C,Southampton,no,False
38,0,3,female,18.0,2,0,18.0000,S,Third,woman,False,,Southampton,no,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,1,1,female,16.0,0,1,39.4000,S,First,woman,False,D,Southampton,yes,False
855,1,3,female,18.0,0,1,9.3500,S,Third,woman,False,,Southampton,yes,False
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
877,0,3,male,19.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True


#### 나이가 10세 미만이고 여성인 승객 추출

In [41]:
# Combine two named conditions with & (element-wise AND): under 10 and female.
is_under_ten = titanic['age'] < 10
is_female = titanic['sex'] == 'female'
titanic.loc[is_under_ten & is_female]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
24,0,3,female,8.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
43,1,2,female,3.0,1,2,41.5792,C,Second,child,False,,Cherbourg,yes,False
58,1,2,female,5.0,1,2,27.75,S,Second,child,False,,Southampton,yes,False
119,0,3,female,2.0,4,2,31.275,S,Third,child,False,,Southampton,no,False
147,0,3,female,9.0,2,2,34.375,S,Third,child,False,,Southampton,no,False
172,1,3,female,1.0,1,1,11.1333,S,Third,child,False,,Southampton,yes,False
184,1,3,female,4.0,0,2,22.025,S,Third,child,False,,Southampton,yes,False
205,0,3,female,2.0,0,1,10.4625,S,Third,child,False,G,Southampton,no,False
233,1,3,female,5.0,4,2,31.3875,S,Third,child,False,,Southampton,yes,False


#### 나이가 10세 미만 또는 60세 이상인 승객의 age, sex, alone 열 추출

In [42]:
# Row mask: under 10 OR 60 and over (| is element-wise OR);
# the second .loc argument limits the result to three columns.
is_child_or_senior = (titanic['age'] < 10) | (titanic['age'] >= 60)
titanic.loc[is_child_or_senior, ['age', 'sex', 'alone']]

Unnamed: 0,age,sex,alone
7,2.00,male,False
10,4.00,female,False
16,2.00,male,False
24,8.00,female,False
33,66.00,male,True
...,...,...,...
831,0.83,male,False
850,4.00,male,False
851,74.00,male,True
852,9.00,female,False


### 참고. IPython Display

In [44]:
# Read the current pandas display options: the maximum number of columns
# and the maximum number of rows shown in output.
pd.get_option('display.max_columns')
pd.get_option('display.max_rows')

20

60

In [51]:
# Change the pandas display options: show at most 10 columns and 10 rows.
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

In [46]:
# Preview three rows; columns beyond display.max_columns are elided as '...' in the output.
titanic.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,...,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,...,True,,Southampton,no,False
1,1,1,female,38.0,1,...,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,...,False,,Southampton,yes,True


### isin() 메서드 활용

- isin() 메서드는 각 값이 주어진 리스트에 포함되는지 여부를 불린(True/False)으로 반환하며, 이를 불린 인덱싱에 사용하면 특정 값을 가진 행들을 추출할 수 있음
- 형식 : df[df['열이름'].isin(추출값 리스트)]

#### 함께 탑승한 형제 또는 배우자 수가 3,4,5인 승객만 추출

- 불린 인덱싱 사용

In [52]:
# Boolean indexing: keep rows whose sibsp value lies in the range 3 through 5.
sibsp_values = titanic['sibsp']
titanic.loc[(sibsp_values >= 3) & (sibsp_values <= 5)]

Unnamed: 0,survived,pclass,sex,age,sibsp,...,adult_male,deck,embark_town,alive,alone
7,0,3,male,2.0,3,...,False,,Southampton,no,False
16,0,3,male,2.0,4,...,False,,Queenstown,no,False
24,0,3,female,8.0,3,...,False,,Southampton,no,False
27,0,1,male,19.0,3,...,True,C,Southampton,no,False
50,0,3,male,7.0,4,...,False,,Southampton,no,False
...,...,...,...,...,...,...,...,...,...,...,...
787,0,3,male,8.0,4,...,False,,Queenstown,no,False
813,0,3,female,6.0,4,...,False,,Southampton,no,False
819,0,3,male,10.0,3,...,False,,Southampton,no,False
824,0,3,male,2.0,4,...,False,,Southampton,no,False


- isin() 메서드 사용

In [53]:
# Same selection via isin(): the mask is True where sibsp equals one of the listed values.
wanted_sibsp = [3, 4, 5]
titanic[titanic['sibsp'].isin(wanted_sibsp)]

Unnamed: 0,survived,pclass,sex,age,sibsp,...,adult_male,deck,embark_town,alive,alone
7,0,3,male,2.0,3,...,False,,Southampton,no,False
16,0,3,male,2.0,4,...,False,,Queenstown,no,False
24,0,3,female,8.0,3,...,False,,Southampton,no,False
27,0,1,male,19.0,3,...,True,C,Southampton,no,False
50,0,3,male,7.0,4,...,False,,Southampton,no,False
...,...,...,...,...,...,...,...,...,...,...,...
787,0,3,male,8.0,4,...,False,,Queenstown,no,False
813,0,3,female,6.0,4,...,False,,Southampton,no,False
819,0,3,male,10.0,3,...,False,,Southampton,no,False
824,0,3,male,2.0,4,...,False,,Southampton,no,False


-----------------------------------------