# 열 분리

- 하나의 열이 여러 가지 정보를 담고 있으면 각 정보를 별도의 열로 분리해야 함
    - 예) 어떤 열에 연월일 정보가 있을 때, 연, 월, 일을 구분하여 3개의 열로 분리

In [1]:
import pandas as pd

In [6]:
# Toy table for the column-splitting demo: every "name" value packs a first
# and a last name into one string; "age" holds integers.
df = pd.DataFrame(
    data=[("Alice Smith", 25), ("Bob Johnson", 30), ("Charlie Brown", 35)],
    columns=["name", "age"],
)

In [7]:
# Display the frame to confirm its contents.
df

Unnamed: 0,name,age
0,Alice Smith,25
1,Bob Johnson,30
2,Charlie Brown,35


In [8]:
# Take an independent (deep, which is also the default) copy of df so that
# columns added to df later do not show up in df2.
df2 = df.copy(deep=True)

In [9]:
# Inspect the dtype of each column.
df.dtypes

name    object
age      int64
dtype: object

In [12]:
# Plain-Python version of the operation: split() with no argument cuts the
# string on whitespace.
"Alice Smith".split()

['Alice', 'Smith']

In [14]:
# Apply the same whitespace split to every row through the .str accessor;
# each element of the result is a list of tokens.
split_name = df["name"].str.split()

In [15]:
# Show the resulting Series of token lists.
split_name

0      [Alice, Smith]
1      [Bob, Johnson]
2    [Charlie, Brown]
Name: name, dtype: object

In [16]:
# Pull the tokens out of each per-row list into their own columns:
# position 0 becomes the first name, position 1 the last name.
df["first_name"] = split_name.str[0]
df["last_name"] = split_name.str[1]

In [17]:
# Check that the two new columns were added.
df.head()

Unnamed: 0,name,age,first_name,last_name
0,Alice Smith,25,Alice,Smith
1,Bob Johnson,30,Bob,Johnson
2,Charlie Brown,35,Charlie,Brown


In [18]:
# The copy still has only the original columns.
df2.head()

Unnamed: 0,name,age
0,Alice Smith,25
1,Bob Johnson,30
2,Charlie Brown,35


In [25]:
# Split each name into two new columns in a single step.
# n=1 limits the split to the first run of whitespace, so expand=True yields
# at most two columns. Without the limit, a name with three or more words
# (e.g. a middle name) would produce extra columns and this two-column
# assignment would raise a ValueError. Everything after the first word goes
# to last_name.
df2[["first_name", "last_name"]] = df2["name"].str.split(n=1, expand=True)

In [26]:
# Confirm the result of the expand=True split.
df2.head()

Unnamed: 0,name,age,first_name,last_name
0,Alice Smith,25,Alice,Smith
1,Bob Johnson,30,Bob,Johnson
2,Charlie Brown,35,Charlie,Brown


# 필터링

- 시리즈 또는 데이터프레임의 데이터 중에서 특정 조건식을 만족하는 원소만 따로 추출

## 불리언 인덱싱

- 시리즈 객체에 어떤 조건식을 적용하면 각 원소에 대해 참 / 거짓을 판별하여 불리언 값으로 구성된 시리즈를 반환
- 이때 참에 해당하는 데이터 값을 따로 선택할 수 있음

In [30]:
# Load the scientists table; the path is relative to the current working directory.
scientists = pd.read_csv("./data/scientists.csv")

In [31]:
# Preview the first rows.
scientists.head()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist


In [32]:
# Select the rows whose Age is strictly above the mean of the Age column.
sci_older = scientists.loc[scientists["Age"].gt(scientists["Age"].mean())]
sci_older.head()

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [33]:
# Work with the Age column on its own as a Series.
ages = scientists["Age"]

In [34]:
# Show the Series.
ages

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [35]:
# Mean of the ages, used as the filtering threshold.
ages.mean()

59.125

In [36]:
# Index the Series with a boolean mask: only the elements for which the
# condition is True are kept.
ages[ages > ages.mean()]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [37]:
# The condition on its own evaluates to a boolean Series, one True/False per element.
ages > ages.mean()

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

In [39]:
ages[[True, True, False, False, True, True, False, True]] # a hand-written boolean list works as a mask too: only the positions marked True are returned

0    37
1    61
4    56
5    45
7    77
Name: Age, dtype: int64

In [40]:
# Scientists in their forties: 40 <= Age < 50.
sci_40 = scientists.loc[scientists["Age"].ge(40) & scientists["Age"].lt(50)]

In [41]:
# Show the filtered rows.
sci_40

Unnamed: 0,Name,Born,Died,Age,Occupation
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist


In [42]:
 # Rows whose Age is below 50 or at least 90.
sci_5090 = scientists.loc[scientists["Age"].lt(50) | scientists["Age"].ge(90)]

In [43]:
# Show the filtered rows.
sci_5090

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist


### str 메서드 활용

- 문자열 데이터를 다룰 때 유용하게 사용

In [44]:
# Rows whose Name starts with "R".
scientists[scientists["Name"].str.startswith("R")]

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist


In [47]:
# Rows whose Name ends with "e".
scientists[scientists["Name"].str.endswith("e")]

Unnamed: 0,Name,Born,Died,Age,Occupation
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist


In [46]:
# Rows whose Name contains "r" anywhere in the string.
scientists[scientists["Name"].str.contains("r")]

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist


### isin() 메서드 활용

- 특정 값을 가진 행들을 추출

In [51]:
# Keep the rows whose Occupation is one of the listed values.
scientists.loc[
    scientists["Occupation"].isin(["Chemist", "Nurse", "Computer Scientist"])
]

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
